Abot 是一个开源的 .NET 爬虫框架,速度快,易于使用和扩展。项目地址:https://github.com/sjdirect/abot
爬取到的 HTML 我们使用 AngleSharp 来解析,项目地址:https://github.com/AngleSharp/AngleSharp
首先我们需要配置abot
// The URL to crawl; JD.com's product category listing page is used as the example here.
private static readonly Uri FeedUrl = new Uri("https://www.jd.com/allSort.aspx");
/// <summary>
/// Builds a <c>PoliteWebCrawler</c> from a hand-written <c>CrawlConfiguration</c>,
/// registers the three crawl-decision delegates and subscribes the page
/// "starting" and "completed" event handlers.
/// </summary>
/// <returns>The configured crawler, ready for <c>Crawl</c> to be called on it.</returns>
public static IWebCrawler GetManuallyConfiguredWebCrawler()
{
    // Crawl settings; consult the Abot source for the exact meaning of each option.
    var settings = new CrawlConfiguration
    {
        MaxConcurrentThreads = System.Environment.ProcessorCount,
        MaxPagesToCrawl = 1000,
        IsExternalPageCrawlingEnabled = false,
        IsUriRecrawlingEnabled = false,
        IsExternalPageLinksCrawlingEnabled = false,
        IsRespectRobotsDotTextEnabled = false,
        DownloadableContentTypes = "text/html, text/plain",
        MinCrawlDelayPerDomainMilliSeconds = 1000,
        CrawlTimeoutSeconds = 0,
        MaxPagesToCrawlPerDomain = 0
    };

    // Every optional collaborator is passed as null, exactly as in the original call.
    var politeCrawler = new PoliteWebCrawler(settings, null, null, null, null, null, null, null, null);

    // Decision delegates consulted before a page is crawled, downloaded, or has its links followed.
    politeCrawler.ShouldCrawlPage(ShouldCrawlPage);
    politeCrawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
    politeCrawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

    // Page-level events: crawl of a single page starting / finished.
    politeCrawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    politeCrawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync;

    // The two remaining events are left unsubscribed in this example:
    // politeCrawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;           // page was not allowed to be crawled
    // politeCrawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed; // links on the page were not allowed to be crawled

    return politeCrawler;
}
爬虫中主要有4个事件:页面爬取开始、页面爬取结束、页面不允许爬取、页面中的链接不允许爬取。
以下是示例
/// <summary>
/// Handler for the event raised when the crawl of a single page is about to start.
/// </summary>
/// <param name="sender">The event source.</param>
/// <param name="e">Event data exposing the page that is about to be crawled.</param>
public static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    // Placeholder showing where the page about to be crawled can be inspected.
    PageToCrawl pageToCrawl = e.PageToCrawl;
}

/// <summary>
/// Handler for the event raised when the crawl of a single page has finished.
/// When the crawled page is <c>FeedUrl</c>, the three-level category tree is
/// extracted from the HTML with AngleSharp and appended to <c>fake.txt</c>.
/// </summary>
/// <param name="sender">The event source.</param>
/// <param name="e">Event data exposing the crawled page.</param>
public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.Uri != FeedUrl)
    {
        return;
    }

    // Parse the HTML with AngleSharp. QuerySelector yields null when nothing
    // matches, so every lookup below is checked before it is dereferenced.
    var container = e.CrawledPage.AngleSharpHtmlDocument.QuerySelector(".category-items");
    if (container == null)
    {
        // The expected markup is missing (layout changed or the download failed).
        return;
    }

    StringBuilder sb = new StringBuilder();

    foreach (var col in container.Children)
    {
        foreach (var category in col.QuerySelectorAll(".category-item"))
        {
            // First-level category title.
            var titleNode = category.QuerySelector(".item-title span");
            if (titleNode == null)
            {
                continue;
            }

            sb.Append("\r\n" + titleNode.Text() + "\r\n");

            var itemsNode = category.QuerySelector(".items");
            if (itemsNode == null)
            {
                continue;
            }

            foreach (var second in itemsNode.Children)
            {
                // Second-level category name.
                var secondLink = second.QuerySelector("dt a");
                if (secondLink == null)
                {
                    continue;
                }

                sb.Append(secondLink.Text() + "\t");

                // Third-level categories, comma separated.
                var thirdContainer = second.QuerySelector("dd");
                if (thirdContainer == null)
                {
                    continue;
                }

                var appendedAny = false;
                foreach (var third in thirdContainer.Children)
                {
                    sb.Append(third.Text() + ",");
                    appendedAny = true;
                }

                // Strip the trailing comma only when one was written; otherwise the
                // tab separator after the second-level name would be removed instead.
                if (appendedAny)
                {
                    sb.Remove(sb.Length - 1, 1);
                }

                // NOTE(review): no separator is written between consecutive second-level
                // rows, so they run together in the output — confirm whether that is intended.
            }
        }
    }

    // The relative path resolves against the process working directory. Per the original
    // author's note, under IIS Express that is C:\Program Files (x86)\IIS Express, so
    // Visual Studio may have to run as administrator for the write to succeed.
    System.IO.File.AppendAllText("fake.txt", sb.ToString());
}

#region Crawl decision delegates

/// <summary>
/// Synchronous delegate registered with the crawler to decide whether a page should be crawled.
/// </summary>
/// <param name="pageToCrawl">The candidate page.</param>
/// <param name="crawlContext">The current crawl context (not used).</param>
/// <returns>
/// An allowing decision for the root page, a retry, or <c>FeedUrl</c>;
/// otherwise a denying decision with the reason "Not match uri".
/// </returns>
public static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    // Allow only the root URL, retries, and the URL we explicitly target.
    if (pageToCrawl.IsRetry || pageToCrawl.IsRoot || FeedUrl == pageToCrawl.Uri)
    {
        return new CrawlDecision() { Allow = true };
    }

    // Allow = false means the page is not crawled.
    return new CrawlDecision { Allow = false, Reason = "Not match uri" };
}

/// <summary>
/// Synchronous delegate registered with the crawler to decide whether a page's content should be downloaded.
/// </summary>
/// <param name="pageToCrawl">The candidate page.</param>
/// <param name="crawlContext">The current crawl context (not used).</param>
/// <returns>
/// An allowing decision for the root page, a retry, or <c>FeedUrl</c>;
/// otherwise a denying decision with the reason "Not match uri".
/// </returns>
private static CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri)
    {
        return new CrawlDecision { Allow = true };
    }

    return new CrawlDecision { Allow = false, Reason = "Not match uri" };
}

/// <summary>
/// Synchronous delegate registered with the crawler to decide whether the links found on a crawled page should be followed.
/// </summary>
/// <param name="crawledPage">The page that has just been crawled.</param>
/// <param name="crawlContext">The current crawl context (not used).</param>
/// <returns>
/// A denying decision for external pages; an allowing decision for the root page,
/// a retry, or <c>FeedUrl</c>; otherwise a denying decision.
/// </returns>
private static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (!crawledPage.IsInternal)
    {
        return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };
    }

    if (crawledPage.IsRoot || crawledPage.IsRetry || crawledPage.Uri == FeedUrl)
    {
        return new CrawlDecision { Allow = true };
    }

    return new CrawlDecision { Allow = false, Reason = "We only crawl links of pagination pages" };
}

#endregion
接下来就是测试
/// <summary>
/// Test action: builds the crawler, crawls <c>FeedUrl</c>, writes the crawl's
/// <c>ErrorException</c> to the response and renders the default view.
/// </summary>
/// <returns>The default view for this action.</returns>
public ActionResult Index()
{
    IWebCrawler webCrawler = GetManuallyConfiguredWebCrawler();
    var crawlOutcome = webCrawler.Crawl(FeedUrl);

    // Written to the response as-is; presumably null when the crawl succeeded — confirm.
    Response.Write(crawlOutcome.ErrorException);

    return View();
}