使用aiohttp/async检查代理列表

2024-05-21 04:29:21 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试使用aiohttp和async来获取代理列表,并检查它们是否与特定URL一起工作(>;状态代码200)。如果是这样,我想将它们添加到一个新的工作代理列表中。我以前在请求中这样做过,它工作得很好,但速度非常慢,因此我尝试使用异步方法使其工作。当我使刮片部分工作时,我无法使检查部分运行:

from bs4 import BeautifulSoup
import random
import asyncio
import aiohttp

URL1 = 'https://free-proxy-list.net/'
URL2 = 'https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=2900&country=all&ssl=all&anonymity=elite&simplified=true'


async def proxy_db():
    async with aiohttp.ClientSession() as session:
        async with session.get(URL1) as resp1:
            text1 = await resp1.read()
            soup1 = BeautifulSoup(text1.decode('utf-8'), 'html.parser')
            proxy_list_fpl = []
            for items1 in soup1.select("#proxylisttable tbody tr"):
                proxy_list_fpl.append(':'.join([item.text for item in items1.select("td")[:2]]))
            print(len(proxy_list_fpl))
        async with session.get(URL2) as resp2:
            text2 = await resp2.read()
            soup2 = BeautifulSoup(text2.decode('utf-8'), 'html.parser')
            proxy_list_ps = []
            for items2 in soup2:
                proxy_list_ps = items2.split()
            print(len(proxy_list_ps))
    templist = list(set(proxy_list_fpl + proxy_list_ps))
    proxy_list = ["http://" + s for s in templist]
    print(len(proxy_list))
    return proxy_list

loop = asyncio.get_event_loop()
proxies = loop.run_until_complete(proxy_db())
print(proxies)
loop.close()

# Until here it works fine. Im new to python and asyncio, so there might be a more 
# efficient way of coding this, however it already saved 50% time compared to my requests method before


working_proxy = []


async def fetch(session, url, proxy):
    async with session.get(url, proxy = proxies) as response:
        if response.status != 200:
            response.raise_for_status()
        return await response.text

async def fetch_all(session, url, proxy):
    tasks = []
    for proxy in proxies:
        task = asyncio.create_task(fetch(session, url, proxy))
        tasks.append(task)
    results = await asyncio.gather(*tasks)
    return results

async def main():
    url = "http://httpbin.org/ip"
    proxy = proxies
    async with aiohttp.ClientSession() as session:
        page = await fetch_all(session, url, proxy)
        if page.status == 200:
            working_proxy.append(proxies)
            print(len(working_proxy))

if __name__ == "__main__":
    asyncio.run(main())

导致:

Traceback (most recent call last):
  File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 179, in <module>
    asyncio.run(main())
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 642, in run_until_complete
    return future.result()
  File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 173, in main
    page = await fetch_all(session, url, proxy)
  File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 166, in fetch_all
    results = await asyncio.gather(*tasks)
  File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 156, in fetch
    async with session.get(url, proxy = proxies) as response:
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/aiohttp/client.py", line 1117, in __aenter__
    self._resp = await self._coro
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/aiohttp/client.py", line 415, in _request
    proxy = URL(proxy)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/yarl/_url.py", line 158, in __new__
    raise TypeError("Constructor parameter should be str")
TypeError: Constructor parameter should be str

Process finished with exit code 1

我将非常感谢任何关于如何运行的想法或提示。我对Python和一般的编码都是新手,所以我也很高兴看到关于糟糕的实践/风格或更有效的编码方式的任何提示。提前谢谢你们


Tags: inpyasynciourlasyncaiohttpsessionwith