A simple web crawler in C# using HtmlAgilityPack
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using System.Net;
namespace LinkChecker.WebSpider
{
/// <summary>
/// A result encapsulating the Url and the HtmlDocument
/// </summary>
public abstract class WebPage
{
public Uri Url { get; set; }
/// <summary>
/// Get every WebPage.Internal on a web site (or part of a web site) visiting all internal links just once
/// plus every external page (or other Url) linked to the web site as a WebPage.External
/// </summary>
/// <remarks>
/// Use .OfType WebPage.Internal to get just the internal ones if that's what you want
/// </remarks>
public static IEnumerable<WebPage> GetAllPagesUnder(Uri urlRoot)
{
var queue = new Queue<Uri>();
var allSiteUrls = new HashSet<Uri>();
queue.Enqueue(urlRoot);
allSiteUrls.Add(urlRoot);
while (queue.Count > 0)
{
Uri url = queue.Dequeue();
HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
oReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse();
WebPage result;
if (resp.ContentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
{
HtmlDocument doc = new HtmlDocument();
try
{
var resultStream = resp.GetResponseStream();
doc.Load(resultStream); // The HtmlAgilityPack
result = new Internal() { Url = url, HtmlDocument = doc };
}
catch (System.Net.WebException ex)
{
result = new WebPage.Error() { Url = url, Exception = ex };
}
catch (Exception ex)
{
ex.Data.Add("Url", url); // Annotate the exception with the Url
throw;
}
// Success, hand off the page
yield return new WebPage.Internal() { Url = url, HtmlDocument = doc };
// And and now queue up all the links on this page
foreach (HtmlNode link in doc.DocumentNode.SelectNodes(@"//a[@href]"))
{
HtmlAttribute att = link.Attributes["href"];
if (att == null) continue;
string href = att.Value;
if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase)) continue; // ignore javascript on buttons using a tags
Uri urlNext = new Uri(href, UriKind.RelativeOrAbsolute);
// Make it absolute if it's relative
if (!urlNext.IsAbsoluteUri)
{
urlNext = new Uri(urlRoot, urlNext);
}
if (!allSiteUrls.Contains(urlNext))
{
allSiteUrls.Add(urlNext); // keep track of every page we've handed off
if (urlRoot.IsBaseOf(urlNext))
{
queue.Enqueue(urlNext);
}
else
{
yield return new WebPage.External() { Url = urlNext };
}
}
}
}
}
}
///// <summary>
///// In the future might provide all the images too??
///// </summary>
//public class Image : WebPage
//{
//}
/// <summary>
/// Error loading page
/// </summary>
public class Error : WebPage
{
public int HttpResult { get; set; }
public Exception Exception { get; set; }
}
/// <summary>
/// External page - not followed
/// </summary>
/// <remarks>
/// No body - go load it yourself
/// </remarks>
public class External : WebPage
{
}
/// <summary>
/// Internal page
/// </summary>
public class Internal : WebPage
{
/// <summary>
/// For internal pages we load the document for you
/// </summary>
public virtual HtmlDocument HtmlDocument { get; internal set; }
}
}
}
Comments are closed.
about 2 years ago
I would like to post some of your ideas in my blog. Can I ,please take them?
about 2 years ago
It looks like the solution I am searching for.
But I did not understand, how to call the class from my code.
Can you give me an example?
Thank you in advance,
Bernd
about 1 year ago
HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
Where is WebRequest defined?
about 1 year ago
WebRequest is in System.Net. See http://msdn.microsoft.com/en-us/library/bw00b1dc.aspx
about 1 year ago
Ahh, excellent! Here I was thinking C# didn’t have anything like that… I should have googled. Thanks a bunch! XD
about 1 year ago
Hi i used the HTML agility Pack’s method to get the HtmlDocument instead of WebRequest.
Is there any difference in these two approaches
I used:
HtmlWeb web = new HtmlWeb();
doc=web.Load(urlRoot.AbsolutePath);
about 1 year ago
By using WebRequest you can get more control over the process, e.g. checking the mime type first, or handling redirects (not shown here).