<p>可以将池(Pool)与 gevent 一起使用,也可以使用 urllib3(或 requests)中内置的连接池。这样,您就可以根据池的大小一次并发执行 10 个或 100 个操作,并在池耗尽时使用异步队列来处理剩余的操作。</p>
<pre><code>from gevent import monkey, spawn, joinall
monkey.patch_all()
from gevent.pool import Pool as GeventPool
import pandas as pd
from pandas import Series
import numpy as np
import requests
from bs4 import BeautifulSoup
# Load the URLs to check from the "url" column of url.csv.
# The name is spelled `urllist` (all lowercase) because that is the spelling
# used by `count` and by the pool.map() call at the end of the script; binding
# it as `urlList` left `urllist` undefined and raised NameError.
urllist = pd.read_csv(r"url.csv")
urllist = urllist.url.tolist()

# Pool created with size 10; used by the pool.map() call at the end.
pool = GeventPool(10)

# Shared result accumulators filled in by scrap():
#   found / skulist            - URLs (and their SKUs) where the <select> exists
#   notfound / skumissinglist  - URLs (and their SKUs) where it does not
notfound = []
found = []
skulist = []
skumissinglist = []

count = len(urllist)
# Function scrap, pass url, open with soup, and find class
def scrap(url):
    """Fetch *url*, classify it, and rewrite ``Test.csv`` with all results so far.

    The page is downloaded and parsed, its SKU is read from the
    ``<div itemprop="sku">`` element, and the URL/SKU pair is appended to the
    module-level ``found``/``skulist`` lists when a
    ``<select class="super-attribute-select">`` element exists, or to
    ``notfound``/``skumissinglist`` when it does not.

    Any ordinary error while fetching or parsing (network failure, missing SKU
    element, ...) is reported on stdout and the URL is left out of every list.

    Args:
        url: Address of the page to inspect.

    Returns:
        Whatever ``DataFrame.to_csv('Test.csv')`` returns (``None`` when a
        path is given); the useful output is the file itself.
    """
    tag = 'select'
    classused = "super-attribute-select"
    try:
        content = requests.get(url).text
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # BeautifulSoup's find() returns None when nothing matches.
        if result is None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except Exception:
        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt
        # and other BaseException-only signals still propagate.
        print("Some extraction went wrong")

    # Snapshot the accumulators only AFTER this URL has been classified.
    # Taking the snapshot at the top of the function wrote a CSV that never
    # contained the URL being processed, so the final file missed results.
    d = dict(
        A=np.array(found),
        B=np.array(skulist),
        C=np.array(notfound),
        D=np.array(skumissinglist),
    )
    # Wrapping each column in a Series lets columns of different lengths share
    # one DataFrame (shorter columns are padded with NaN).
    df = pd.DataFrame({k: Series(v) for k, v in d.items()})
    return df.to_csv('Test.csv')
# Run scrap() once per URL through the gevent pool created above
# (pool size 10).
pool.map(scrap, urllist)
</code></pre>