我正在尝试使用aiohttp和async来获取代理列表,并检查它们是否与特定URL一起工作(>;状态代码200)。如果是这样,我想将它们添加到一个新的工作代理列表中。我以前在请求中这样做过,它工作得很好,但速度非常慢,因此我尝试使用异步方法使其工作。当我使刮片部分工作时,我无法使检查部分运行:
from bs4 import BeautifulSoup
import random
import asyncio
import aiohttp
URL1 = 'https://free-proxy-list.net/'
URL2 = 'https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=2900&country=all&ssl=all&anonymity=elite&simplified=true'
async def proxy_db():
async with aiohttp.ClientSession() as session:
async with session.get(URL1) as resp1:
text1 = await resp1.read()
soup1 = BeautifulSoup(text1.decode('utf-8'), 'html.parser')
proxy_list_fpl = []
for items1 in soup1.select("#proxylisttable tbody tr"):
proxy_list_fpl.append(':'.join([item.text for item in items1.select("td")[:2]]))
print(len(proxy_list_fpl))
async with session.get(URL2) as resp2:
text2 = await resp2.read()
soup2 = BeautifulSoup(text2.decode('utf-8'), 'html.parser')
proxy_list_ps = []
for items2 in soup2:
proxy_list_ps = items2.split()
print(len(proxy_list_ps))
templist = list(set(proxy_list_fpl + proxy_list_ps))
proxy_list = ["http://" + s for s in templist]
print(len(proxy_list))
return proxy_list
loop = asyncio.get_event_loop()
proxies = loop.run_until_complete(proxy_db())
print(proxies)
loop.close()
# Until here it works fine. Im new to python and asyncio, so there might be a more
# efficient way of coding this, however it already saved 50% time compared to my requests method before
working_proxy = []
async def fetch(session, url, proxy):
async with session.get(url, proxy = proxies) as response:
if response.status != 200:
response.raise_for_status()
return await response.text
async def fetch_all(session, url, proxy):
tasks = []
for proxy in proxies:
task = asyncio.create_task(fetch(session, url, proxy))
tasks.append(task)
results = await asyncio.gather(*tasks)
return results
async def main():
url = "http://httpbin.org/ip"
proxy = proxies
async with aiohttp.ClientSession() as session:
page = await fetch_all(session, url, proxy)
if page.status == 200:
working_proxy.append(proxies)
print(len(working_proxy))
if __name__ == "__main__":
asyncio.run(main())
导致:
Traceback (most recent call last):
File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 179, in <module>
asyncio.run(main())
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 642, in run_until_complete
return future.result()
File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 173, in main
page = await fetch_all(session, url, proxy)
File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 166, in fetch_all
results = await asyncio.gather(*tasks)
File "/Users/xxx/Dropbox/Python/5APR/Web_Scraping/asyncio_test.py", line 156, in fetch
async with session.get(url, proxy = proxies) as response:
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/aiohttp/client.py", line 1117, in __aenter__
self._resp = await self._coro
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/aiohttp/client.py", line 415, in _request
proxy = URL(proxy)
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/yarl/_url.py", line 158, in __new__
raise TypeError("Constructor parameter should be str")
TypeError: Constructor parameter should be str
Process finished with exit code 1
我将非常感谢任何关于如何运行的想法或提示。我对Python和一般的编码都是新手,所以我也很高兴看到关于糟糕的实践/风格或更有效的编码方式的任何提示。提前谢谢你们
目前没有回答
相关问题 更多 >
编程相关推荐