• XSS.stack #1 – первый литературный журнал от юзеров форума

.NET crawler code to find emails from websites (provide a list of websites in a file).

beezblock

floppy-диск
Пользователь
Регистрация
08.02.2023
Сообщения
4
Реакции
0
Tip: Install the necessary dependencies from NuGet (or your preferred package source),
then create a console app and add a reference to the class library project.

//Code

using System.Net;
using System.Net.Mime;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Extensions.DependencyInjection;
using Polly;
namespace EmailFinder.Finders;
/// <summary>
/// Crawls a website looking for e-mail addresses. The index page is checked
/// first; if nothing is found there, navigation links are visited with
/// "about"/"contact" pages prioritized. Every newly discovered address is
/// appended to <c>leads.txt</c>.
/// </summary>
public class Finder
{
    // DI container built solely to obtain a pooled IHttpClientFactory.
    private readonly IServiceProvider _service;
    private readonly IHttpClientFactory? _httpclient;

    // Navigation links harvested from the index page.
    private readonly HashSet<string> urls = new HashSet<string>();
    // Distinct addresses found so far (also persisted to leads.txt).
    private readonly HashSet<string> emails = new HashSet<string>();

    private string html = String.Empty;

    // FIX: compile the pattern once instead of re-parsing it on every GetEmail call.
    private static readonly Regex EmailRegex = new Regex(
        @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        RegexOptions.Compiled);

    public Finder()
    {
        _service = new ServiceCollection().AddHttpClient().BuildServiceProvider();
        _httpclient = _service.GetService<IHttpClientFactory>();
    }

    /// <summary>
    /// Scrapes <paramref name="website"/>: checks the index page for an e-mail,
    /// otherwise walks the site's nav links ("about"/"contact" first).
    /// </summary>
    /// <param name="website">Base URL of the site, expected to end with '/'.</param>
    /// <returns>The set of navigation links discovered on the index page.</returns>
    public async Task<HashSet<string>> WebScrapper(string website)
    {
        try
        {
            // FIX: was html = ...ReadAsStringAsync().Result (sync-over-async deadlock risk).
            html = await DownloadAsync(website);

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(html);

            // Anchors inside <ul> lists — i.e. the site's navigation menus.
            // FIX: SelectNodes returns null (not an empty collection) when nothing
            // matches, which crashed the original foreach with an NRE.
            HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//ul[li/a[@href]]//a[@href]");
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    string link = node.GetAttributeValue("href", "");
                    if (string.IsNullOrWhiteSpace(link))
                    {
                        continue;
                    }
                    // Relative links get the site prefix; links that already contain
                    // the site (trailing '/' stripped for the comparison) are kept as-is.
                    urls.Add(link.Contains(website[..^1]) ? link : website + link);
                }
            }

            var checkhomepage = await GetEmail(website);
            if (checkhomepage.Count > 0)
            {
                Console.WriteLine("Email Found!");
            }
            else
            {
                // FIX: the original tested "about" twice and never prioritized
                // "contact", contradicting the stated intent.
                var priortizelist = urls.OrderByDescending(url =>
                    url.Contains("about", StringComparison.OrdinalIgnoreCase) ||
                    url.Contains("contact", StringComparison.OrdinalIgnoreCase));
                foreach (var url in priortizelist)
                {
                    var checkotherpage = await GetEmail(url);
                    if (checkotherpage.Count > 0)
                    {
                        Console.WriteLine("Email Found");
                        break;
                    }
                    Console.WriteLine("Email not Found");
                }
            }
        }
        catch (HttpRequestException ex)
        {
            Console.WriteLine(ex.Message);
        }
        return urls;
    }

    /// <summary>
    /// Downloads <paramref name="website"/> and records every e-mail address
    /// found in its HTML; new addresses are appended to leads.txt.
    /// </summary>
    /// <returns>All distinct addresses accumulated so far (across all calls).</returns>
    public async Task<HashSet<string>> GetEmail(string website)
    {
        try
        {
            // FIX: was .Result on ReadAsStringAsync — now fully async.
            string pageHtml = await DownloadAsync(website);

            foreach (Match item in EmailRegex.Matches(pageHtml))
            {
                string email = item.Value.Trim();
                // FIX: HashSet.Add reports newness directly — the original did a
                // Contains lookup followed by a second Add lookup.
                if (emails.Add(email))
                {
                    await using (var sw = File.AppendText("leads.txt"))
                    {
                        await sw.WriteLineAsync(email);
                    }
                }
            }
        }
        catch (HttpRequestException ex)
        {
            Console.WriteLine(ex.Message);
        }
        return emails;
    }

    /// <summary>
    /// Fetches a page with browser-like headers under an exponential back-off
    /// retry policy (2s, 4s, 8s on timeouts / HTTP 408).
    /// </summary>
    private async Task<string> DownloadAsync(string website)
    {
        var retrypolicy = Policy
            .Handle<TaskCanceledException>()
            .OrResult<HttpResponseMessage>(r => r.StatusCode == HttpStatusCode.RequestTimeout)
            .WaitAndRetryAsync(3, retries => TimeSpan.FromSeconds(Math.Pow(2, retries)));

        using (var client = _httpclient!.CreateClient())
        {
            // FIX: build a FRESH request inside the execute lambda — an
            // HttpRequestMessage is single-use, so re-sending the same instance
            // on a Polly retry throws InvalidOperationException. (The original's
            // onRetryAsync built a replacement request and then discarded it.)
            using (var response = await retrypolicy.ExecuteAsync(
                () => client.SendAsync(BuildRequest(website))))
            {
                return await response.Content.ReadAsStringAsync();
            }
        }
    }

    /// <summary>Creates a GET request carrying the spoofed mobile-browser headers.</summary>
    private static HttpRequestMessage BuildRequest(string website)
    {
        return new HttpRequestMessage(HttpMethod.Get, new Uri(website))
        {
            Headers =
            {
                {"Connection", "keep-alive"},
                {"User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36"},
                {"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"},
                {"Accept-Language", "en-US,en;q=0.5"},
            }
        };
    }
}
 
Logic behind this code:
The code visits the website and first checks the index page for an e-mail address or a hidden mailto: value. If no e-mail is found on the index page, the code collects the page's navigation links and checks each one, but links containing /about or /contact are prioritized and checked first, because e-mails are most likely to be on an About or Contact page if not on the index page. When an e-mail is found, it is saved to a file.
Note: the code uses an exponential back-off retry policy. You can adjust the policy to your needs.
 


Напишите ответ...
  • Вставить:
Прикрепить файлы
Верх