1- using Microsoft . Extensions . Logging ;
1+ using DotnetSpider . Downloader ;
2+ using Microsoft . Extensions . Logging ;
3+ using System ;
4+ using System . Collections . Generic ;
25
36namespace DotnetSpider . Core . Processor
47{
@@ -13,9 +16,20 @@ public abstract class BasePageProcessor : IPageProcessor
1316 public ILogger Logger { get ; set ; }
1417
1518 /// <summary>
16- /// Parser / extractor of target links.
19+ /// Used to decide whether the current Request should be processed, and whether extracted target links should be added to the queue.
20+ /// Results produced by RequestExtractor must also be checked against the Filter; a request that does not match the Filter would never reach the Processor, which makes it a meaningless Request.
1721 /// </summary>
18- public ITargetRequestExtractor TargetUrlsExtractor { get ; set ; }
22+ public IFilter Filter { get ; set ; }
23+
24+ /// <summary>
25+ /// Interface used to extract target links (requests) from a page.
26+ /// </summary>
27+ public IRequestExtractor RequestExtractor { get ; set ; }
28+
29+ /// <summary>
30+ /// Interface that decides whether the current page is the last page; if it is, RequestExtractor does not need to be executed.
31+ /// </summary>
32+ public ILastPageChecker LastPageChecker { get ; set ; }
1933
2034 /// <summary>
2135 /// 去掉链接#后面的所有内容
@@ -43,54 +57,51 @@ public void Process(Page page)
4357 properties [ Env . UrlPropertyKey ] = page . Request . Url ;
4458 properties [ Env . TargetUrlPropertyKey ] = page . TargetUrl ;
4559
46- if ( TargetUrlsExtractor != null )
60+ if ( ! ( page . Request . GetProperty ( Page . Depth ) == 1 && ! Env . FilterDefaultRequest ) )
4761 {
48- bool isTarget = true ;
49- if ( ( page . Request . GetProperty ( Page . Depth ) != 1 || Env . ProcessorFilterDefaultRequest ) && TargetUrlsExtractor . TargetUrlPatterns != null && TargetUrlsExtractor . TargetUrlPatterns . Count > 0 && ! TargetUrlsExtractor . TargetUrlPatterns . Contains ( null ) )
50- {
51- foreach ( var regex in TargetUrlsExtractor . TargetUrlPatterns )
52- {
53- isTarget = regex . IsMatch ( page . Request . Url ) ;
54- if ( isTarget )
55- {
56- break ;
57- }
58- }
59- }
60-
61- if ( ! isTarget )
62+ if ( Filter != null && ! Filter . IsMatch ( page . Request ) )
6263 {
6364 return ;
6465 }
6566 }
6667
6768 Handle ( page ) ;
6869
69- // IAfterDownloaderHandler中可以实现解析, 有可能不再需要解析了
70- if ( ! page . SkipExtractedTargetRequests && TargetUrlsExtractor != null )
71- {
72- ExtractUrls ( page ) ;
73- }
74- }
70+ if ( LastPageChecker != null && LastPageChecker . IsLastPage ( page ) ) return ;
7571
76- /// <summary>
77- /// 解析目标链接并添加到Page对象中, 供Spider对象添加到对列中
78- /// </summary>
79- /// <param name="page">页面数据</param>
80- protected virtual void ExtractUrls ( Page page )
81- {
82- var links = TargetUrlsExtractor . ExtractRequests ( page ) ;
83- if ( links != null )
72+ IEnumerable < Request > requests ;
73+ if ( RequestExtractor != null && ( requests = RequestExtractor . Extract ( page ) ) != null )
8474 {
85- foreach ( var link in links )
75+ foreach ( var link in requests )
8676 {
77+ if ( Filter != null && ! Filter . IsMatch ( link ) ) continue ;
78+
8779 if ( CleanPound )
8880 {
8981 link . Url = link . Url . Split ( '#' ) [ 0 ] ;
9082 }
83+
9184 page . AddTargetRequest ( link ) ;
9285 }
9386 }
9487 }
88+
/// <summary>
/// Assigns the given extractor to the RequestExtractor property and returns this same processor instance, so the call can be chained.
/// </summary>
/// <param name="requestExtractor">Value stored in RequestExtractor; it is assigned as-is, without a null check.</param>
/// <returns>This processor instance.</returns>
89+ public BasePageProcessor SetRequestExtractor ( IRequestExtractor requestExtractor )
90+ {
91+ RequestExtractor = requestExtractor ;
92+ return this ;
93+ }
94+
/// <summary>
/// Assigns the given filter to the Filter property and returns this same processor instance, so the call can be chained.
/// </summary>
/// <param name="filter">Value stored in Filter; it is assigned as-is, without a null check.</param>
/// <returns>This processor instance.</returns>
95+ public BasePageProcessor SetFilter ( IFilter filter )
96+ {
97+ Filter = filter ;
98+ return this ;
99+ }
100+
/// <summary>
/// Assigns the given checker to the LastPageChecker property and returns this same processor instance, so the call can be chained.
/// </summary>
/// <param name="lastPageChecker">Value stored in LastPageChecker; it is assigned as-is, without a null check.</param>
/// <returns>This processor instance.</returns>
101+ public BasePageProcessor SetLastPageChecker ( ILastPageChecker lastPageChecker )
102+ {
103+ LastPageChecker = lastPageChecker ;
104+ return this ;
105+ }
95106 }
96- }
107+ }
0 commit comments