The Blog of Ian Mercer.

A simple web crawler in C# using HtmlAgilityPack

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Net;

namespace LinkChecker.WebSpider
{
    /// \<summary> 
    /// A result encapsulating the Url and the HtmlDocument 
    /// \</summary> 
    public abstract class WebPage
    {
        public Uri Url { get; set; }

        /// \<summary> 
        /// Get every WebPage.Internal on a web site (or part of a web site) visiting all internal links just once 
        /// plus every external page (or other Url) linked to the web site as a WebPage.External 
        /// \</summary> 
        /// \<remarks> 
        /// Use .OfType WebPage.Internal to get just the internal ones if that's what you want
        /// \</remarks> 
        public static IEnumerable<WebPage> GetAllPagesUnder(Uri urlRoot)
        {
            var queue = new Queue<Uri>();
            var allSiteUrls = new HashSet<Uri>();

            queue.Enqueue(urlRoot);
            allSiteUrls.Add(urlRoot);

            while (queue.Count > 0)
            {
                Uri url = queue.Dequeue();

                HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
                oReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US;rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";

                HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse();

                WebPage result;

                if (resp.ContentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
                {
                    HtmlDocument doc = new HtmlDocument();
                    try
                    {
                        var resultStream = resp.GetResponseStream();
                        doc.Load(resultStream);
                        // The HtmlAgilityPack 
                        result = new Internal() { Url = url, HtmlDocument = doc };
                    }
                    catch (System.Net.WebException ex)
                    {
                        result = new WebPage.Error() { Url = url, Exception = ex };
                    }
                    catch (Exception ex)
                    {
                        ex.Data.Add("Url", url); // Annotate the exception with the Url 
                        throw;
                    }

                    // Success, hand off the page yield return new WebPage.Internal() { Url = url, HtmlDocument = doc };

                    // And and now queue up all the links on this page
                    foreach (HtmlNode link in doc.DocumentNode.SelectNodes(@"//a[@href]"))
                    {
                        HtmlAttribute att = link.Attributes["href"];
                        if (att == null) continue;
                        string href = att.Value;
                        if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase)) continue;
                        // ignore javascript on buttons using a tags

                        Uri urlNext = new Uri(href, UriKind.RelativeOrAbsolute);

                        // Make it absolute if it's relative if (!urlNext.IsAbsoluteUri) {
                        urlNext = new Uri(urlRoot, urlNext);


                        // keep track of every page we've handed off
                        if (!allSiteUrls.Contains(urlNext))
                        {
                            allSiteUrls.Add(urlNext);

                            if (urlRoot.IsBaseOf(urlNext))
                            {
                                queue.Enqueue(urlNext);
                            }
                            else
                            {
                                yield return new WebPage.External() { Url = urlNext };
                            }
                        }
                    }
                }
            }
        }
 
        ///// <summary> 
        ///// In the future might provide all the images too??
        ///// </summary> 
        //public class Image : WebPage 
        //{
        //}

        /// \<summary> 
        /// Error loading page 
        /// \</summary>
        public class Error : WebPage
        {
            public int HttpResult { get; set; }
            public Exception Exception { get; set; }
        }

        /// <summary> 
        /// External page - not followed 
        /// </summary> 
        /// <remarks> 
        /// No body - go load it yourself 
        /// </remarks>
        public class External : WebPage { }

        /// \<summary>
        /// Internal page 
        /// </summary>
        public class Internal : WebPage
        {
            /// \<summary> 
            /// For internal pages we load the document for you 
            /// \</summary> 
            public virtual HtmlDocument HtmlDocument { get; internal set; }
        }
    }
}

Related Stories

Cover Image for Xamarin Forms Application For Home Automation

Xamarin Forms Application For Home Automation

Building a Xamarin Forms application to control my home automation system

Ian Mercer
Ian Mercer

JSON Patch - a C# implementation

Ian Mercer
Ian Mercer

Dynamically building 'Or' Expressions in LINQ

How to create a LINQ expression that logically ORs together a set of predicates

Ian Mercer
Ian Mercer

VariableWithHistory - making persistence invisible, making history visible

A novel approach to adding history to variables in a programming language

Ian Mercer
Ian Mercer

Updated Release of the Abodit State Machine

A hierarchical state machine for .NET

Ian Mercer
Ian Mercer

Building a better .NET State Machine

A state machine for .NET that I've released on Nuget

Ian Mercer
Ian Mercer
Cover Image for The Internet of Dogs

The Internet of Dogs

Connecting our dog into the home automation

Ian Mercer
Ian Mercer

A simple state machine in C#

State machines are useful in many contexts but especially for home automation

Ian Mercer
Ian Mercer

Convert a property getter to a setter

Ian Mercer
Ian Mercer

MongoDB Map-Reduce - Hints and Tips

Ian Mercer
Ian Mercer
Cover Image for Weather Forecasting for Home Automation

Weather Forecasting for Home Automation

Ian Mercer
Ian Mercer

Lengthening short Urls in C#

Ian Mercer
Ian Mercer

ASP.NET MVC SEO - Solution Part 1

Ian Mercer
Ian Mercer

Building sitemap.xml for SEO ASP.NET MVC

Ian Mercer
Ian Mercer

Tip: getting the index in a foreeach statement

A tip on using LINQ's Select expression with an index

Ian Mercer
Ian Mercer

WCF and the SYSTEM account

Namespace reservations and http.sys, my, oh my!

Ian Mercer
Ian Mercer

404 errors on IIS6 with ASP.NET 4 Beta 2

Ian Mercer
Ian Mercer

Mixed mode assembly errors after upgrade to .NET 4 Beta 2

Fixing this error was fairly simple

Ian Mercer
Ian Mercer

The EntityContainer name could not be determined

How to fix the exception "the entitycontainer" name could not be determined

Ian Mercer
Ian Mercer

Shortened URLs should be treated like a Codec ...

Expanding URLs would help users decide whether or not to click a link

Ian Mercer
Ian Mercer

A great site for developing and testing regular expressions

Just a link to a site I found useful

Ian Mercer
Ian Mercer

Entity Framework in .NET 4

Ian Mercer
Ian Mercer

Shaving seconds off page load times

Ian Mercer
Ian Mercer

System.Data.EntitySqlException

Hints for dealing with this exception

Ian Mercer
Ian Mercer

Exception Handling using Exception.Data

My latest article on CodeProject covers the lesser known Exception.Data property

Ian Mercer
Ian Mercer

ASP.NET Custom Validation

How to solve a problem encountered with custom validation in ASP.NET

Ian Mercer
Ian Mercer

Optimization Advice

Some advice on software optimization

Ian Mercer
Ian Mercer

Linq's missing link

LinqKit came in handy back in 2009

Ian Mercer
Ian Mercer

Cache optimized scanning of pairwise combinations of values

Using space-filling curves to optimize caching

Ian Mercer
Ian Mercer

Threading and User Interfaces

A rant about how few software programs get threading right

Ian Mercer
Ian Mercer