diff --git a/.github/workflows/docker-image-latest.yml b/.github/workflows/docker-image-latest.yml index 6c7e00ac..5019fc97 100644 --- a/.github/workflows/docker-image-latest.yml +++ b/.github/workflows/docker-image-latest.yml @@ -25,11 +25,11 @@ jobs: id: meta uses: docker/metadata-action@v3 with: - images: jhao104/proxy_pool + images: wingser/proxy_pool - name: Build and push Docker image uses: docker/build-push-action@v2 with: context: . push: true - tags: jhao104/proxy_pool:latest + tags: wingser/proxy_pool:latest diff --git a/.github/workflows/docker-image-tags.yml b/.github/workflows/docker-image-tags.yml index 9a59645a..b3c8d017 100644 --- a/.github/workflows/docker-image-tags.yml +++ b/.github/workflows/docker-image-tags.yml @@ -25,7 +25,7 @@ jobs: id: meta uses: docker/metadata-action@v3 with: - images: jhao104/proxy_pool + images: wingser/proxy_pool - name: Build and push Docker image uses: docker/build-push-action@v2 diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..9f078a4e --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + //"justMyCode": true, + //"python": "${command:python.interpreterPath}", + "env": {"PYTHONPATH":"${workspaceRoot}"}, + "envFile": "${workspaceRoot}/.env" + } + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 89019cd7..854b99e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.6-alpine -MAINTAINER jhao104 +MAINTAINER wingser WORKDIR /app @@ -13,7 +13,7 @@ RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositorie RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata # runtime environment -RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ +RUN apk add bash musl-dev gcc libxml2-dev libxslt-dev && \ pip install --no-cache-dir -r requirements.txt && \ apk del gcc musl-dev diff --git a/README.md b/README.md index f48bad80..bb8be0c4 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ ProxyPool 爬虫代理IP池 爬虫代理IP池项目,主要功能为定时采集网上发布的免费代理验证入库,定时验证入库的代理保证代理的可用性,提供API和CLI两种使用方式。同时你也可以扩展代理源以增加代理池IP的质量和数量。 +[原作者项目地址](https://github.com/jhao104/proxy_pool) 感谢jhao104之前项目贡献,因原项目更新缓慢,自己fork开启更新维护。欢迎提建议,我尽量更新,如果我更新也慢了,你可以考虑自己从原项目fork一份自己维护。 + * 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest) * 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/) @@ -43,13 +45,14 @@ ProxyPool 爬虫代理IP池 * git clone ```bash -git clone git@github.com:jhao104/proxy_pool.git +git clone git@github.com:wingser/proxy_pool.git ``` * releases ```bash -https://github.com/jhao104/proxy_pool/releases 下载对应zip文件 +https://github.com/wingser/proxy_pool/releases 下载对应zip文件 +建议docker安装。 ``` ##### 安装依赖: @@ -74,14 +77,6 @@ PORT = 5000 # 监听端口 DB_CONN = 'redis://:pwd@127.0.0.1:8888/0' - -# 配置 ProxyFetcher - -PROXY_FETCHER = [ - "freeProxy01", # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py - "freeProxy02", - # .... 
-] ``` #### 启动项目: @@ -101,9 +96,9 @@ python proxyPool.py server ### Docker Image ```bash -docker pull jhao104/proxy_pool +docker pull wingser/proxy_pool -docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest +docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 --name wingser_pool wingser/proxy_pool:latest ``` ### docker-compose @@ -122,7 +117,9 @@ docker-compose up -d | ----| ---- | ---- | ----| | / | GET | api介绍 | None | | /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理| +| /gettxt | GET | 随机获取一个代理,非json,ip:port格式| 可选参数: `?type=https` 过滤支持https的代理| | /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理| +| /poptxt | GET | 获取并删除一个代理,非json,ip:port格式| 可选参数: `?type=https` 过滤支持https的代理| | /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理| | /count | GET | 查看代理数量 |None| | /delete | GET | 删除代理 |`?proxy=host:ip`| @@ -185,20 +182,7 @@ class ProxyFetcher(object): # 确保每个proxy都是 host:ip正确的格式返回 ``` -* 2、添加好方法后,修改[setting.py](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项: - -  在`PROXY_FETCHER`下添加自定义方法的名字: - -```python -PROXY_FETCHER = [ - "freeProxy01", - "freeProxy02", - # .... - "freeProxyCustom1" # # 确保名字和你添加方法名字一致 -] -``` - - +* 2、添加好方法后,改为自动加载,无需配置。(原设计不太合理,我自己提交都漏掉几次,直接改自动加载):   `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 ### 免费代理源 @@ -210,21 +194,23 @@ PROXY_FETCHER = [ | 站大爷 | ✔ | ★ | ** | [地址](https://www.zdaye.com/) | [`freeProxy01`](/fetcher/proxyFetcher.py#L28) | | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | - | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | | FateZero | ✔ | ★★ | * | [地址](http://proxylist.fatezero.org) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L124) | - | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L134) | - | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L144) | | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L155) | | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L165) | + | SEO方法代理 | ✔ | ☆ | * | [地址](https://proxy.seofangfa.com/) | [`wingser01`](/fetcher/proxyFetcher.py#L194) | + | 小舒代理 | ✔ | ☆ | * | [地址](http://www.xsdaili.cn/) | [`wingser02`](/fetcher/proxyFetcher.py#L206) | + | PzzQz代理 | ✔ | ☆ | * | [地址](https://pzzqz.com/) | [`wingser03`](/fetcher/proxyFetcher.py#L244) | + | proxy-list | ✔ | ☆ | * | [地址](https://proxy-list.org/) | [`wingser04`](/fetcher/proxyFetcher.py#L269) | + | proxylistplus| ✔ | ☆ | * | [地址](https://list.proxylistplus.com/)| [`wingser05`](/fetcher/proxyFetcher.py#L284) | + - - 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 +如果还有其他好的免费代理网站, 可以在提交在[Issues](https://github.com/zwingser/proxy_pool/issues), 下次更新时会考虑在项目中支持。 ### 问题反馈 -  任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,同时也可以到我的[博客](http://www.spiderpy.cn/blog/message)中留言。 +  任何问题欢迎在[Issues](https://github.com/zwingser/proxy_pool/issues) 中反馈。   你的反馈会让此项目变得更加完美。 @@ -232,7 +218,7 @@ 
PROXY_FETCHER = [   本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 -  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。 +  本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/zwingser/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。   这里感谢以下contributor的无私奉献: diff --git a/api/proxyApi.py b/api/proxyApi.py index bd2de57e..0d0cb9c1 100644 --- a/api/proxyApi.py +++ b/api/proxyApi.py @@ -43,7 +43,9 @@ def force_type(cls, response, environ=None): api_list = [ {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, + {"url": "/gettxt", "params": "type: ''https'|''", "desc": "get a proxy"}, {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, + {"url": "/poptxt", "params": "", "desc": "get and delete a proxy"}, {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"}, {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, {"url": "/count", "params": "", "desc": "return proxy count"} @@ -62,6 +64,11 @@ def get(): proxy = proxy_handler.get(https) return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} +@app.route('/gettxt/') +def gettxt(): + https = request.args.get("type", "").lower() == 'https' + proxy = proxy_handler.get(https) + return proxy._proxy if proxy else {"code": 0, "src": "no proxy"} @app.route('/pop/') def pop(): @@ -69,6 +76,11 @@ def pop(): proxy = proxy_handler.pop(https) return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} +@app.route('/poptxt/') +def poptxt(): + https = request.args.get("type", "").lower() == 'https' + proxy = proxy_handler.pop(https) + return proxy._proxy if proxy else {"code": 0, "src": "no proxy"} @app.route('/refresh/') def refresh(): diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py index 1ed43cba..43fbb345 100644 --- a/fetcher/proxyFetcher.py +++ b/fetcher/proxyFetcher.py @@ -12,11 +12,12 @@ """ __author__ = 'JHao' -import re import json +import re from time import sleep from util.webRequest import WebRequest +from pyquery import PyQuery as pq class ProxyFetcher(object): @@ -28,13 +29,14 @@ class ProxyFetcher(object): def freeProxy01(): """ 站大爷 https://www.zdaye.com/dayProxy.html + 好像屏蔽了国外服务器,国内可以正常爬取. 
""" - start_url = "https://www.zdaye.com/dayProxy.html" - html_tree = WebRequest().get(start_url, verify=False).tree - latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() + start_url = "https://www.zdaye.com/dayProxy/{}/{}/{}.html" from datetime import datetime + html_tree = WebRequest().get(start_url.format(datetime.now().year, datetime.now().month, 1), verify=False).tree + latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") - if interval.seconds < 300: # 只采集5分钟内的更新 + if interval.seconds < 300: # 只采集5分钟内的更新,当前7个小时更新一次 target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() while target_url: _tree = WebRequest().get(target_url, verify=False).tree @@ -44,7 +46,7 @@ def freeProxy01(): yield "%s:%s" % (ip, port) next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False - sleep(5) + sleep(10) @staticmethod def freeProxy02(): @@ -60,9 +62,14 @@ def freeProxy02(): yield "%s:%s" % (ip, port) @staticmethod - def freeProxy03(): - """ 开心代理 """ - target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"] + def freeProxy03(page_count=10): + """ 开心代理 http://www.kxdaili.com/dailiip.html""" + target_url = "http://www.kxdaili.com/dailiip/{}/{}.html" + target_urls = [] + for tabIndex in range(2): + for pageIndex in range(page_count): + target_urls.append(target_url.format(tabIndex + 1, pageIndex + 1)) + for url in target_urls: tree = WebRequest().get(url).tree for tr in tree.xpath("//table[@class='active']//tr")[1:]: @@ -71,25 +78,7 @@ def freeProxy03(): yield "%s:%s" % (ip, port) @staticmethod - def freeProxy04(): - """ FreeProxyList https://www.freeproxylists.net/zh/ """ - url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" - tree = WebRequest().get(url, verify=False).tree - from urllib import parse - - def parse_ip(input_str): - html_str = parse.unquote(input_str) - ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) - return ips[0] if ips else None - - for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): - ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) - port = "".join(tr.xpath('./td[2]/text()')).strip() - if ip: - yield "%s:%s" % (ip, port) - - @staticmethod - def freeProxy05(page_count=1): + def freeProxy05(page_count=10): """ 快代理 https://www.kuaidaili.com """ url_pattern = [ 'https://www.kuaidaili.com/free/inha/{}/', @@ -103,7 +92,7 @@ def freeProxy05(page_count=1): for url in url_list: tree = WebRequest().get(url).tree proxy_list = tree.xpath('.//table//tr') - sleep(1) # 必须sleep 不然第二条请求不到数据 + sleep(10) # 必须sleep 不然第二条请求不到数据 for tr in proxy_list[1:]: yield ':'.join(tr.xpath('./td/text()')[0:2]) @@ -129,37 +118,29 @@ def freeProxy07(): proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ":".join(proxy) + sleep(10) @staticmethod - def freeProxy08(): - """ 小幻代理 """ - urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html'] - for url in urls: - r = WebRequest().get(url, timeout=10) - proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text) + def freeProxy10(): + """ + 89免费代理 + 怀疑封国外请求,境外服务器爬取异常. 
+ + """ + url = "https://www.89ip.cn/{}.html" + target_url = url.format('index_1') + next_page = True + while next_page: + r = WebRequest().get(target_url, timeout=10) + proxies = re.findall( + r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', r.text) for proxy in proxies: - yield ":".join(proxy) + yield ':'.join(proxy) - @staticmethod - def freeProxy09(page_count=1): - """ 免费代理库 """ - for i in range(1, page_count + 1): - url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) - html_tree = WebRequest().get(url, verify=False).tree - for index, tr in enumerate(html_tree.xpath("//table//tr")): - if index == 0: - continue - yield ":".join(tr.xpath("./td/text()")[0:2]).strip() - - @staticmethod - def freeProxy10(): - """ 89免费代理 """ - r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10) - proxies = re.findall( - r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', - r.text) - for proxy in proxies: - yield ':'.join(proxy) + next_page = r.tree.xpath("//a[@class='layui-laypage-next']/@href") + next_page = next_page[0].strip() if next_page else False + target_url = url.format(next_page) + sleep(10) @staticmethod def freeProxy11(): @@ -171,77 +152,109 @@ def freeProxy11(): except Exception as e: print(e) - # @staticmethod - # def wallProxy01(): - # """ - # PzzQz https://pzzqz.com/ - # """ - # from requests import Session - # from lxml import etree - # session = Session() - # try: - # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text - # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) - # if x_csrf_token: - # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} - # proxy_resp = session.post("https://pzzqz.com/", verify=False, - # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() - # tree = etree.HTML(proxy_resp["proxy_html"]) - # for tr in tree.xpath("//tr"): - # ip = "".join(tr.xpath("./td[1]/text()")) - # port = "".join(tr.xpath("./td[2]/text()")) - # yield "%s:%s" % (ip, port) - # except Exception as e: - # print(e) - - # @staticmethod - # def freeProxy10(): - # """ - # 墙外网站 cn-proxy - # :return: - # """ - # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] - # request = WebRequest() - # for url in urls: - # r = request.get(url, timeout=10) - # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) - # for proxy in proxies: - # yield ':'.join(proxy) - - # @staticmethod - # def freeProxy11(): - # """ - # https://proxy-list.org/english/index.php - # :return: - # """ - # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] - # request = WebRequest() - # import base64 - # for url in urls: - # r = request.get(url, timeout=10) - # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) - # for proxy in proxies: - # yield base64.b64decode(proxy).decode() - - # @staticmethod - # def freeProxy12(): - # urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] - # request = WebRequest() - # for url in urls: - # r = request.get(url, timeout=10) - # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) - # for proxy in proxies: - # yield ':'.join(proxy) + @staticmethod + def wingser01(): + """ + seo方法 crawler, https://proxy.seofangfa.com/ + """ + url = 'https://proxy.seofangfa.com/' + html_tree = WebRequest().get(url, verify=False).tree + for index, tr in enumerate(html_tree.xpath("//table//tr")): + if index == 0: + continue + yield 
":".join(tr.xpath("./td/text()")[0:2]).strip() + + @staticmethod + def wingser02(): + """ + 小舒代理 crawler, http://www.xsdaili.cn/ + """ + url = 'http://www.xsdaili.cn/' + base_url = "http://www.xsdaili.cn/dayProxy/ip/{page}.html" + + '''通过网站,获取最近10个日期的共享''' + urls = [] + html = WebRequest().get(url, verify=False).tree + doc = pq(html) + title = doc(".title:eq(0) a").items() + latest_page = 0 + for t in title: + res = re.search(r"/(\d+)\.html", t.attr("href")) + latest_page = int(res.group(1)) if res else 0 + if latest_page: + urls = [base_url.format(page=page) for page in range(latest_page - 10, latest_page)] + else: + urls = [] + + '''每个日期的网站,爬proxy''' + for u in urls: + h = WebRequest().get(u, verify=False).tree + doc = pq(h) + contents = doc('.cont').text() + contents = contents.split("\n") + for content in contents: + yield content[:content.find("@")] + + + + @staticmethod + def wingser03(): + """ + PzzQz https://pzzqz.com/ + """ + from requests import Session + from lxml import etree + session = Session() + try: + index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text + x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) + if x_csrf_token: + data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} + proxy_resp = session.post("https://pzzqz.com/", verify=False, + headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() + tree = etree.HTML(proxy_resp["proxy_html"]) + for tr in tree.xpath("//tr"): + ip = "".join(tr.xpath("./td[1]/text()")) + port = "".join(tr.xpath("./td[2]/text()")) + yield "%s:%s" % (ip, port) + except Exception as e: + print(e) + + + + @staticmethod + def wingser04(): + """ + https://proxy-list.org/english/index.php + :return: + """ + urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] + request = WebRequest() + import base64 + for url in urls: + r = request.get(url, timeout=10) + proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) + for proxy in proxies: + yield base64.b64decode(proxy).decode() + + @staticmethod + def wingser05(): + urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] + request = WebRequest() + for url in urls: + r = request.get(url, timeout=10) + proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) + for proxy in proxies: + yield ':'.join(proxy) if __name__ == '__main__': p = ProxyFetcher() - for _ in p.freeProxy11(): + for _ in p.wingser05(): print(_) -# http://nntime.com/proxy-list-01.htm -# freeProxy04 -# freeProxy07 -# freeProxy08 + +# http://nntime.com/proxy-list-01.htm + diff --git a/fetcher/proxyFetcherBak.py b/fetcher/proxyFetcherBak.py new file mode 100644 index 00000000..4cd35f3e --- /dev/null +++ b/fetcher/proxyFetcherBak.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +""" +------------------------------------------------- + File Name: proxyFetcherBak + Description : 原文件改为自动加载爬虫程序,所以调试中或者失效的程序也会加载, + 把调试中的程序,或者失效程序,放到这个文件里面. 
+ Author : wingser + date: 2016/11/25 +------------------------------------------------- + Change Activity: + 2016/11/25: proxyFetcherBak +------------------------------------------------- +""" +__author__ = 'JHao' + +import json +import re +from time import sleep + +from util.webRequest import WebRequest +from pyquery import PyQuery as pq + + +class ProxyFetcherBak(object): + """ + proxy getter + """ + + @staticmethod + def freeProxy04(): + """ FreeProxyList https://www.freeproxylists.net/zh/ """ + url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" + tree = WebRequest().get(url, verify=False).tree + from urllib import parse + + def parse_ip(input_str): + html_str = parse.unquote(input_str) + ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) + return ips[0] if ips else None + + for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): + ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) + port = "".join(tr.xpath('./td[2]/text()')).strip() + if ip: + yield "%s:%s" % (ip, port) + + @staticmethod + def freeProxy08(): + """ 小幻代理 """ + url = 'https://ip.ihuan.me/' + tree = WebRequest().get(url, verify=False).tree + hrefs = tree.xpath("//ul[@class='pagination']/li/a/@href") + + for href in hrefs: + r = WebRequest().get(url + href, timeout=10) + proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text) + for proxy in proxies: + yield ":".join(proxy) + sleep(10) + + @staticmethod + def freeProxy09(page_count=1): + """ 免费代理库 """ + for i in range(1, page_count + 1): + url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) + html_tree = WebRequest().get(url, verify=False).tree + for index, tr in enumerate(html_tree.xpath("//table//tr")): + if index == 0: + continue + yield ":".join(tr.xpath("./td/text()")[0:2]).strip() + + +if __name__ == '__main__': + p = ProxyFetcherBak() + for _ in p.freeProxy09(): + print(_) + + + + +# http://nntime.com/proxy-list-01.htm + diff --git a/handler/configHandler.py b/handler/configHandler.py index 29000bcc..a68f49b9 100644 --- a/handler/configHandler.py +++ b/handler/configHandler.py @@ -12,16 +12,18 @@ """ __author__ = 'JHao' -import os +import os, inspect import setting +from fetcher.proxyFetcher import ProxyFetcher from util.singleton import Singleton from util.lazyProperty import LazyProperty -from util.six import reload_six, withMetaclass +from util.six import withMetaclass class ConfigHandler(withMetaclass(Singleton)): def __init__(self): + self.fetchersMethord = [method for method in dir(ProxyFetcher) if callable(getattr(ProxyFetcher, method)) and not method.startswith("__")] pass @LazyProperty @@ -40,11 +42,10 @@ def dbConn(self): def tableName(self): return os.getenv("TABLE_NAME", setting.TABLE_NAME) - @property + @LazyProperty def fetchers(self): - reload_six(setting) - return setting.PROXY_FETCHER - + return [method for method in dir(ProxyFetcher) if callable(getattr(ProxyFetcher, method)) and not method.startswith("__")] + @LazyProperty def httpUrl(self): return os.getenv("HTTP_URL", setting.HTTP_URL) @@ -81,3 +82,7 @@ def proxyRegion(self): def timezone(self): return os.getenv("TIMEZONE", setting.TIMEZONE) + +if __name__ == '__main__': + config = ConfigHandler() + print(config.fetchers) diff --git a/handler/logHandler.py b/handler/logHandler.py index 45cd1201..84e94383 100644 --- a/handler/logHandler.py +++ b/handler/logHandler.py @@ -46,7 +46,7 @@ class LogHandler(logging.Logger): LogHandler """ - def __init__(self, name, 
level=DEBUG, stream=True, file=True): + def __init__(self, name, level=DEBUG, stream=True, file=False): self.name = name self.level = level logging.Logger.__init__(self, self.name, level=level) diff --git a/requirements.txt b/requirements.txt index 53dc129b..c762e2cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ requests==2.20.0 gunicorn==19.9.0 -lxml==4.9.2 +lxml==4.9.3 redis==3.5.3 APScheduler==3.10.0;python_version>="3.10" APScheduler==3.2.0;python_version<"3.10" @@ -10,3 +10,4 @@ Flask==2.1.1;python_version>"3.6" Flask==1.0;python_version<="3.6" werkzeug==2.1.0;python_version>"3.6" werkzeug==0.15.5;python_version<="3.6" +pyquery>=1.4.3,<2.0.0 \ No newline at end of file diff --git a/setting.py b/setting.py index 9bab8475..208e12d8 100644 --- a/setting.py +++ b/setting.py @@ -44,25 +44,28 @@ # ###### config the proxy fetch function ###### -PROXY_FETCHER = [ - "freeProxy01", - "freeProxy02", - "freeProxy03", - "freeProxy04", - "freeProxy05", - "freeProxy06", - "freeProxy07", - "freeProxy08", - "freeProxy09", - "freeProxy10", - "freeProxy11" -] +# 改为自动加载,不需要配置. +# PROXY_FETCHER = [ +# "freeProxy01", +# "freeProxy02", +# "freeProxy03", +# "freeProxy05", +# "freeProxy06", +# "freeProxy07", +# "freeProxy10", +# "freeProxy11", +# "wingser01", +# "wingser02", +# "wingser03", +# "wingser04", +# "wingser05" +# ] # ############# proxy validator ################# # 代理验证目标网站 HTTP_URL = "http://httpbin.org" -HTTPS_URL = "https://www.qq.com" +HTTPS_URL = "https://jd.com" # 代理验证时超时时间 VERIFY_TIMEOUT = 10 diff --git a/util/six.py b/util/six.py index 14ee059b..7d858834 100644 --- a/util/six.py +++ b/util/six.py @@ -29,10 +29,10 @@ def iteritems(d, **kw): else: from urlparse import urlparse -if PY3: - from imp import reload as reload_six -else: - reload_six = reload +# if PY3: +# from imp import reload as reload_six +# else: +# reload_six = reload if PY3: from queue import Empty, Queue
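
Two of the changes above alter runtime behavior rather than just the image/repository names, so a couple of illustrative sketches follow. Both are simplified assumptions for clarity, not part of the patch itself.

`setting.PROXY_FETCHER` is commented out and `ConfigHandler.fetchers` now enumerates fetcher methods by introspection, so any public method added to `ProxyFetcher` is picked up by the `schedule` process automatically — which is also why broken or experimental fetchers were moved into `fetcher/proxyFetcherBak.py`. A minimal sketch of that discovery pattern, where `DemoFetcher` is a hypothetical stand-in for `fetcher.proxyFetcher.ProxyFetcher`:

```python
class DemoFetcher(object):
    """Hypothetical stand-in for fetcher.proxyFetcher.ProxyFetcher."""

    @staticmethod
    def freeProxy01():
        yield "1.2.3.4:8080"

    @staticmethod
    def wingser01():
        yield "5.6.7.8:3128"


def discover_fetchers(cls):
    # Mirrors the list comprehension in handler/configHandler.py:
    # every non-dunder callable on the class counts as an enabled proxy source.
    return [name for name in dir(cls)
            if callable(getattr(cls, name)) and not name.startswith("__")]


if __name__ == '__main__':
    print(discover_fetchers(DemoFetcher))  # ['freeProxy01', 'wingser01']
```

The new `/gettxt` and `/poptxt` routes return the bare `ip:port` string (`proxy._proxy`) instead of the JSON document served by `/get` and `/pop`. A hedged client-side sketch — the host and port are assumptions; the Docker example above maps 5010 while the README's `setting.py` excerpt shows `PORT = 5000`, so adjust to your deployment:

```python
import requests

# /gettxt/ returns plain "ip:port" when a proxy is available and a JSON error
# body ({"code": 0, "src": "no proxy"}) when the pool is empty.
resp = requests.get("http://127.0.0.1:5010/gettxt/", params={"type": "https"})
body = resp.text.strip()
if body and not body.startswith("{"):
    print({"https": "https://" + body})   # ready to pass as requests' proxies=
else:
    print("no proxy available:", body)
```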