<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Ian Mercer &#187; crawler</title>
	<atom:link href="http://blog.abodit.com/tag/crawler/feed/" rel="self" type="application/rss+xml" />
	<link>http://blog.abodit.com</link>
	<description>Living in the World&#039;s Smartest House</description>
	<lastBuildDate>Sat, 07 Jan 2012 19:50:56 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.2.1</generator>
		<item>
		<title>A simple web crawler in C# using HtmlAgilityPack</title>
		<link>http://blog.abodit.com/2010/03/a-simple-web-crawler-in-c-using-htmlagilitypack/</link>
		<comments>http://blog.abodit.com/2010/03/a-simple-web-crawler-in-c-using-htmlagilitypack/#comments</comments>
		<pubDate>Wed, 10 Mar 2010 17:35:15 +0000</pubDate>
		<dc:creator>Ian Mercer</dc:creator>
				<category><![CDATA[.NET]]></category>
		<category><![CDATA[IIS]]></category>
		<category><![CDATA[C#]]></category>
		<category><![CDATA[crawler]]></category>

		<guid isPermaLink="false">http://blog.abodit.com/?p=595</guid>
		<description><![CDATA[]]></description>
			<content:encoded><![CDATA[<pre class="brush: csharp; title: ; notranslate">
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using System.Net;

namespace LinkChecker.WebSpider
{
    /// &lt;summary&gt;
    /// A crawl result identified by its Url. The concrete nested types say what kind of result it is:
    /// Internal (page loaded and parsed), External (link leaving the site, not loaded) or
    /// Error (the request for the page failed).
    /// &lt;/summary&gt;
    public abstract class WebPage
    {
        /// &lt;summary&gt;
        /// User agent sent with every request made by the crawler
        /// &lt;/summary&gt;
        private const string BrowserUserAgent = @&quot;Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5&quot;;

        /// &lt;summary&gt;
        /// The address this result refers to
        /// &lt;/summary&gt;
        public Uri Url { get; set; }

        /// &lt;summary&gt;
        /// Get every WebPage.Internal on a web site (or part of a web site) visiting all internal links just once
        /// plus every external page (or other Url) linked to the web site as a WebPage.External
        /// &lt;/summary&gt;
        /// &lt;param name=&quot;urlRoot&quot;&gt;
        /// Where the crawl starts. Only Urls for which urlRoot.IsBaseOf is true are followed;
        /// every other Url is reported once as a WebPage.External and not loaded.
        /// &lt;/param&gt;
        /// &lt;returns&gt;
        /// A lazily evaluated sequence: requests are only made as the caller enumerates it.
        /// Pages whose request fails with a WebException are reported as WebPage.Error and the crawl continues.
        /// Internal Urls whose content type is not text/html are requested but not reported.
        /// &lt;/returns&gt;
        /// &lt;remarks&gt;
        /// Use .OfType&lt;WebPage.Internal&gt;() to get just the internal ones if that's what you want
        /// &lt;/remarks&gt;
        public static IEnumerable&lt;WebPage&gt; GetAllPagesUnder(Uri urlRoot)
        {
            var queue = new Queue&lt;Uri&gt;();
            var allSiteUrls = new HashSet&lt;Uri&gt;();

            queue.Enqueue(urlRoot);
            allSiteUrls.Add(urlRoot);

            while (queue.Count &gt; 0)
            {
                Uri url = queue.Dequeue();

                // Fetching lives in a helper because C# does not allow yield return inside a try block with catch clauses
                WebPage page = LoadPage(url);

                // Not an HTML page (image, download, ...): nothing to report and nothing to parse
                if (page == null) continue;

                // Hand off the page, whether it loaded or failed
                yield return page;

                // Only pages that actually loaded have links to follow
                Internal loaded = page as Internal;
                if (loaded == null) continue;

                // HtmlAgilityPack returns null rather than an empty collection when nothing matches
                HtmlNodeCollection links = loaded.HtmlDocument.DocumentNode.SelectNodes(@&quot;//a[@href]&quot;);
                if (links == null) continue;

                // And now queue up all the links on this page
                foreach (HtmlNode link in links)
                {
                    HtmlAttribute att = link.Attributes[&quot;href&quot;];
                    if (att == null) continue;
                    string href = att.Value;
                    if (href.StartsWith(&quot;javascript&quot;, StringComparison.InvariantCultureIgnoreCase)) continue;      // ignore javascript on buttons using a tags

                    // A malformed href on one page should not abort the whole crawl
                    Uri urlNext;
                    if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out urlNext)) continue;

                    // Make it absolute if it's relative: relative links are relative to the page
                    // they appear on, not to the root of the crawl
                    if (!urlNext.IsAbsoluteUri)
                    {
                        if (!Uri.TryCreate(url, urlNext, out urlNext)) continue;
                    }

                    // HashSet.Add returns false when we have already seen this Url
                    if (allSiteUrls.Add(urlNext))
                    {
                        if (urlRoot.IsBaseOf(urlNext))
                        {
                            queue.Enqueue(urlNext);
                        }
                        else
                        {
                            yield return new WebPage.External() { Url = urlNext };
                        }
                    }
                }
            }
        }

        /// &lt;summary&gt;
        /// Request a single Url and turn the outcome into a WebPage
        /// &lt;/summary&gt;
        /// &lt;param name=&quot;url&quot;&gt;The absolute Url to request&lt;/param&gt;
        /// &lt;returns&gt;
        /// A WebPage.Internal holding the parsed document when the response is text/html,
        /// a WebPage.Error when the request fails with a WebException,
        /// or null when the response is not text/html.
        /// &lt;/returns&gt;
        /// &lt;remarks&gt;
        /// Any exception other than WebException is rethrown after being annotated with the Url in its Data dictionary
        /// &lt;/remarks&gt;
        private static WebPage LoadPage(Uri url)
        {
            try
            {
                HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
                oReq.UserAgent = BrowserUserAgent;

                // Dispose the response so the connection is released before the next request is made
                using (HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse())
                {
                    if (!resp.ContentType.StartsWith(&quot;text/html&quot;, StringComparison.InvariantCultureIgnoreCase))
                    {
                        return null;
                    }

                    using (var resultStream = resp.GetResponseStream())
                    {
                        HtmlDocument doc = new HtmlDocument();
                        doc.Load(resultStream); // The HtmlAgilityPack
                        return new Internal() { Url = url, HtmlDocument = doc };
                    }
                }
            }
            catch (System.Net.WebException ex)
            {
                var error = new WebPage.Error() { Url = url, Exception = ex };

                // Protocol errors (404, 500, ...) carry the response; connection failures do not
                HttpWebResponse failed = ex.Response as HttpWebResponse;
                if (failed != null)
                {
                    error.HttpResult = (int)failed.StatusCode;
                }
                return error;
            }
            catch (Exception ex)
            {
                ex.Data[&quot;Url&quot;] = url;    // Annotate the exception with the Url
                throw;
            }
        }

        ///// &lt;summary&gt;
        ///// In the future might provide all the images too??
        ///// &lt;/summary&gt;
        //public class Image : WebPage
        //{
        //}

        /// &lt;summary&gt;
        /// Error loading page
        /// &lt;/summary&gt;
        public class Error : WebPage
        {
            /// &lt;summary&gt;
            /// HTTP status code of the failed response, or 0 when the failure produced no HTTP response
            /// &lt;/summary&gt;
            public int HttpResult { get; set; }

            /// &lt;summary&gt;
            /// The exception raised while requesting or reading the page
            /// &lt;/summary&gt;
            public Exception Exception { get; set; }
        }

        /// &lt;summary&gt;
        /// External page - not followed
        /// &lt;/summary&gt;
        /// &lt;remarks&gt;
        /// No body - go load it yourself
        /// &lt;/remarks&gt;
        public class External : WebPage
        {
        }

        /// &lt;summary&gt;
        /// Internal page
        /// &lt;/summary&gt;
        public class Internal : WebPage
        {
            /// &lt;summary&gt;
            /// For internal pages we load the document for you
            /// &lt;/summary&gt;
            public virtual HtmlDocument HtmlDocument { get; internal set; }
        }
    }
}
</pre>
]]></content:encoded>
			<wfw:commentRss>http://blog.abodit.com/2010/03/a-simple-web-crawler-in-c-using-htmlagilitypack/feed/</wfw:commentRss>
		<slash:comments>7</slash:comments>
		</item>
	</channel>
</rss>

