Till: Install the necessary dependencies from NuGet or your preferred package source.
Create a console app and add a reference to the class library project.
//Code
using System.Net;
using System.Net.Mime;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Extensions.DependencyInjection;
using Polly;
namespace EmailFinder.Finders;
/// <summary>
/// Collects links from a site's list-based navigation and extracts e-mail addresses from the
/// downloaded pages, appending every newly seen address to <c>leads.txt</c>.
/// </summary>
/// <remarks>
/// The URL and e-mail sets are instance state and keep accumulating across calls.
/// No synchronization is performed around them.
/// </remarks>
public class Finder
{
    private const string LeadsFile = "leads.txt";

    // Built once and reused for every downloaded page.
    private static readonly Regex EmailPattern = new Regex(
        @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        RegexOptions.Compiled);

    // Retries timeouts (TaskCanceledException) and HTTP 408 responses up to three times,
    // waiting 2, 4 and 8 seconds between attempts.
    private static readonly IAsyncPolicy<HttpResponseMessage> RetryPolicy = Policy
        .Handle<TaskCanceledException>()
        .OrResult<HttpResponseMessage>(r => r.StatusCode == HttpStatusCode.RequestTimeout)
        .WaitAndRetryAsync(3, attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt)));

    private readonly IServiceProvider _service;
    private readonly IHttpClientFactory _httpclient;
    private readonly HashSet<string> urls = new HashSet<string>();
    private readonly HashSet<string> emails = new HashSet<string>();

    /// <summary>
    /// Builds a private service provider with the default HTTP client registrations.
    /// </summary>
    public Finder()
    {
        _service = new ServiceCollection().AddHttpClient().BuildServiceProvider();

        // Fail here if the factory is missing instead of dereferencing null on the first request.
        _httpclient = _service.GetRequiredService<IHttpClientFactory>();
    }

    /// <summary>
    /// Downloads <paramref name="website"/>, records the same-site links found inside
    /// <c>ul</c> lists, then looks for e-mail addresses on the home page and, if none are
    /// found there, on the collected links (about/contact pages first) until one page
    /// yields an address.
    /// </summary>
    /// <param name="website">Absolute URL of the site's home page.</param>
    /// <returns>The set of links collected so far by this instance.</returns>
    /// <exception cref="UriFormatException"><paramref name="website"/> is not a valid absolute URI.</exception>
    public async Task<HashSet<string>> WebScrapper(string website)
    {
        try
        {
            var baseUri = new Uri(website);
            string html = await FetchHtmlAsync(baseUri);

            var document = new HtmlDocument();
            document.LoadHtml(html);

            // SelectNodes returns null, not an empty collection, when nothing matches.
            var nodes = document.DocumentNode.SelectNodes("//ul[li/a[@href]]//a[@href]");
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    var link = ResolveSameSiteLink(baseUri, node.GetAttributeValue("href", ""));
                    if (link != null)
                    {
                        urls.Add(link);
                    }
                }
            }

            var checkhomepage = await GetEmail(website);
            if (checkhomepage.Count > 0)
            {
                Console.WriteLine("Email Found!");
            }
            else
            {
                // Pages whose URL mentions "about" or "contact" are tried first.
                var prioritized = urls.OrderByDescending(url => IsLikelyContactPage(url));
                foreach (var url in prioritized)
                {
                    var checkotherpage = await GetEmail(url);
                    if (checkotherpage.Count > 0)
                    {
                        Console.WriteLine("Email Found");
                        break;
                    }

                    Console.WriteLine("Email not Found");
                }
            }
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            // TaskCanceledException surfaces here when every retry attempt timed out.
            Console.WriteLine(ex.Message);
        }

        return urls;
    }

    /// <summary>
    /// Downloads <paramref name="website"/>, adds every e-mail address matched in the response
    /// body to this instance's e-mail set and appends the addresses not seen before to
    /// <c>leads.txt</c>.
    /// </summary>
    /// <param name="website">Absolute URL of the page to scan.</param>
    /// <returns>All e-mail addresses collected so far by this instance, not only those from this page.</returns>
    /// <exception cref="UriFormatException"><paramref name="website"/> is not a valid absolute URI.</exception>
    public async Task<HashSet<string>> GetEmail(string website)
    {
        try
        {
            string html = await FetchHtmlAsync(new Uri(website));

            var newlyFound = new List<string>();
            foreach (Match match in EmailPattern.Matches(html))
            {
                string address = match.Value.Trim();

                // HashSet.Add reports whether the address was new, so no separate Contains lookup is needed.
                if (emails.Add(address))
                {
                    newlyFound.Add(address);
                }
            }

            // One append per page instead of reopening the file for every address.
            if (newlyFound.Count > 0)
            {
                await File.AppendAllLinesAsync(LeadsFile, newlyFound);
            }
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            // A page that fails or times out is logged and skipped.
            Console.WriteLine(ex.Message);
        }

        return emails;
    }

    // Sends a GET for pageUri through the retry policy and returns the response body as text.
    private async Task<string> FetchHtmlAsync(Uri pageUri)
    {
        using var client = _httpclient.CreateClient();
        using var response = await RetryPolicy.ExecuteAsync(async () =>
        {
            // HttpClient refuses to send the same HttpRequestMessage twice,
            // so every attempt gets its own message.
            using var request = CreateRequest(pageUri);
            return await client.SendAsync(request);
        });

        return await response.Content.ReadAsStringAsync();
    }

    // Builds the GET request with the browser-like headers used for every page fetch.
    private static HttpRequestMessage CreateRequest(Uri pageUri)
    {
        return new HttpRequestMessage(HttpMethod.Get, pageUri)
        {
            Headers =
            {
                { "Connection", "keep-alive" },
                { "User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36" },
                { "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" },
                { "Accept-Language", "en-US,en;q=0.5" },
            },
        };
    }

    // Resolves href against baseUri and returns it without its fragment, or null when the
    // link is empty, unparsable, not http(s) (mailto:, javascript:, tel:) or on another host.
    private static string? ResolveSameSiteLink(Uri baseUri, string href)
    {
        if (string.IsNullOrWhiteSpace(href))
        {
            return null;
        }

        if (!Uri.TryCreate(baseUri, href.Trim(), out var resolved))
        {
            return null;
        }

        if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
        {
            return null;
        }

        if (!string.Equals(resolved.Host, baseUri.Host, StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Dropping the fragment keeps "#section" variants of one page from being fetched repeatedly.
        return resolved.GetLeftPart(UriPartial.Query);
    }

    // True when the URL text suggests an about or contact page.
    private static bool IsLikelyContactPage(string url)
    {
        return url.Contains("about", StringComparison.OrdinalIgnoreCase)
            || url.Contains("contact", StringComparison.OrdinalIgnoreCase);
    }
}
Create a console app and add a reference to the class library project.
//Code
using System.Net;
using System.Net.Mime;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Extensions.DependencyInjection;
using Polly;
namespace EmailFinder.Finders;
/// <summary>
/// Collects links from a site's list-based navigation and extracts e-mail addresses from the
/// downloaded pages, appending every newly seen address to <c>leads.txt</c>.
/// </summary>
/// <remarks>
/// The URL and e-mail sets are instance state and keep accumulating across calls.
/// No synchronization is performed around them.
/// </remarks>
public class Finder
{
    private const string LeadsFile = "leads.txt";

    // Built once and reused for every downloaded page.
    private static readonly Regex EmailPattern = new Regex(
        @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        RegexOptions.Compiled);

    // Retries timeouts (TaskCanceledException) and HTTP 408 responses up to three times,
    // waiting 2, 4 and 8 seconds between attempts.
    private static readonly IAsyncPolicy<HttpResponseMessage> RetryPolicy = Policy
        .Handle<TaskCanceledException>()
        .OrResult<HttpResponseMessage>(r => r.StatusCode == HttpStatusCode.RequestTimeout)
        .WaitAndRetryAsync(3, attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt)));

    private readonly IServiceProvider _service;
    private readonly IHttpClientFactory _httpclient;
    private readonly HashSet<string> urls = new HashSet<string>();
    private readonly HashSet<string> emails = new HashSet<string>();

    /// <summary>
    /// Builds a private service provider with the default HTTP client registrations.
    /// </summary>
    public Finder()
    {
        _service = new ServiceCollection().AddHttpClient().BuildServiceProvider();

        // Fail here if the factory is missing instead of dereferencing null on the first request.
        _httpclient = _service.GetRequiredService<IHttpClientFactory>();
    }

    /// <summary>
    /// Downloads <paramref name="website"/>, records the same-site links found inside
    /// <c>ul</c> lists, then looks for e-mail addresses on the home page and, if none are
    /// found there, on the collected links (about/contact pages first) until one page
    /// yields an address.
    /// </summary>
    /// <param name="website">Absolute URL of the site's home page.</param>
    /// <returns>The set of links collected so far by this instance.</returns>
    /// <exception cref="UriFormatException"><paramref name="website"/> is not a valid absolute URI.</exception>
    public async Task<HashSet<string>> WebScrapper(string website)
    {
        try
        {
            var baseUri = new Uri(website);
            string html = await FetchHtmlAsync(baseUri);

            var document = new HtmlDocument();
            document.LoadHtml(html);

            // SelectNodes returns null, not an empty collection, when nothing matches.
            var nodes = document.DocumentNode.SelectNodes("//ul[li/a[@href]]//a[@href]");
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    var link = ResolveSameSiteLink(baseUri, node.GetAttributeValue("href", ""));
                    if (link != null)
                    {
                        urls.Add(link);
                    }
                }
            }

            var checkhomepage = await GetEmail(website);
            if (checkhomepage.Count > 0)
            {
                Console.WriteLine("Email Found!");
            }
            else
            {
                // Pages whose URL mentions "about" or "contact" are tried first.
                var prioritized = urls.OrderByDescending(url => IsLikelyContactPage(url));
                foreach (var url in prioritized)
                {
                    var checkotherpage = await GetEmail(url);
                    if (checkotherpage.Count > 0)
                    {
                        Console.WriteLine("Email Found");
                        break;
                    }

                    Console.WriteLine("Email not Found");
                }
            }
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            // TaskCanceledException surfaces here when every retry attempt timed out.
            Console.WriteLine(ex.Message);
        }

        return urls;
    }

    /// <summary>
    /// Downloads <paramref name="website"/>, adds every e-mail address matched in the response
    /// body to this instance's e-mail set and appends the addresses not seen before to
    /// <c>leads.txt</c>.
    /// </summary>
    /// <param name="website">Absolute URL of the page to scan.</param>
    /// <returns>All e-mail addresses collected so far by this instance, not only those from this page.</returns>
    /// <exception cref="UriFormatException"><paramref name="website"/> is not a valid absolute URI.</exception>
    public async Task<HashSet<string>> GetEmail(string website)
    {
        try
        {
            string html = await FetchHtmlAsync(new Uri(website));

            var newlyFound = new List<string>();
            foreach (Match match in EmailPattern.Matches(html))
            {
                string address = match.Value.Trim();

                // HashSet.Add reports whether the address was new, so no separate Contains lookup is needed.
                if (emails.Add(address))
                {
                    newlyFound.Add(address);
                }
            }

            // One append per page instead of reopening the file for every address.
            if (newlyFound.Count > 0)
            {
                await File.AppendAllLinesAsync(LeadsFile, newlyFound);
            }
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            // A page that fails or times out is logged and skipped.
            Console.WriteLine(ex.Message);
        }

        return emails;
    }

    // Sends a GET for pageUri through the retry policy and returns the response body as text.
    private async Task<string> FetchHtmlAsync(Uri pageUri)
    {
        using var client = _httpclient.CreateClient();
        using var response = await RetryPolicy.ExecuteAsync(async () =>
        {
            // HttpClient refuses to send the same HttpRequestMessage twice,
            // so every attempt gets its own message.
            using var request = CreateRequest(pageUri);
            return await client.SendAsync(request);
        });

        return await response.Content.ReadAsStringAsync();
    }

    // Builds the GET request with the browser-like headers used for every page fetch.
    private static HttpRequestMessage CreateRequest(Uri pageUri)
    {
        return new HttpRequestMessage(HttpMethod.Get, pageUri)
        {
            Headers =
            {
                { "Connection", "keep-alive" },
                { "User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36" },
                { "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" },
                { "Accept-Language", "en-US,en;q=0.5" },
            },
        };
    }

    // Resolves href against baseUri and returns it without its fragment, or null when the
    // link is empty, unparsable, not http(s) (mailto:, javascript:, tel:) or on another host.
    private static string? ResolveSameSiteLink(Uri baseUri, string href)
    {
        if (string.IsNullOrWhiteSpace(href))
        {
            return null;
        }

        if (!Uri.TryCreate(baseUri, href.Trim(), out var resolved))
        {
            return null;
        }

        if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
        {
            return null;
        }

        if (!string.Equals(resolved.Host, baseUri.Host, StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Dropping the fragment keeps "#section" variants of one page from being fetched repeatedly.
        return resolved.GetLeftPart(UriPartial.Query);
    }

    // True when the URL text suggests an about or contact page.
    private static bool IsLikelyContactPage(string url)
    {
        return url.Contains("about", StringComparison.OrdinalIgnoreCase)
            || url.Contains("contact", StringComparison.OrdinalIgnoreCase);
    }
}