我用 BeautifulSoup(bs4)从链接列表中的每个页面提取文本。出于某些原因,在少数情况下提取不起作用:`description`(描述)列中没有值,打印对应的行时只得到空字符串 ""。
代码:`
# Browser-like headers sent with the search request.
hdr = {
    'User-Agent': 'Mozilla/5.0',
    'Connection': 'keep-alive',
}
start_url = "https://www.google.it/search?q=design+milano&source=lnms&tbm=nws&sa=X&ved=2ahUKEwjx"
# Defensive: drop any spaces so the query string stays a valid URL.
link = start_url.replace(" ", "")
req = Request(link, headers=hdr)
# Close the HTTP response as soon as the body has been read; the timeout keeps
# a stalled connection from hanging the script forever.
with urlopen(req, timeout=30) as response:
    webpage = response.read()
# Parsed search-results page; the link-extraction step reads from `soup`.
soup = BeautifulSoup(webpage, 'html.parser')
###### LINKS ######
from urllib.parse import parse_qs, urlsplit

# Collect the href of every anchor inside the "kCrYT" result containers.
data_link = soup.find_all("div", attrs={"class": "kCrYT"})
links = [
    anchor.get('href')
    for container in data_link
    for anchor in container.find_all('a')
]

# The hrefs are expected to be redirects of the form "/url?q=<target>&...".
# Read the target from the `q` query parameter instead of splitting the string:
# parse_qs also percent-decodes it, so the article URL requested later is the
# real one rather than an escaped variant.
listozza = []
for href in links:
    # Anchors without an href, or that are not "/url?q=" redirects, carry no
    # article link; skip them instead of failing on them.
    if not href or "/url?q=" not in href:
        continue
    targets = parse_qs(urlsplit(href).query).get("q")
    if targets:
        listozza.append(targets[0])
# Remove duplicates while keeping first-seen order.
listozza = list(dict.fromkeys(listozza))
listozza
#### Extract text from each link in listozza
# One entry is appended per link, in order, so the list stays aligned with
# `listozza` when both become DataFrame columns.
text_from_link = []
with requests.Session() as session:
    # Send the same browser-like headers as the search request; without them
    # the request carries the library's default User-Agent.
    session.headers.update(hdr)
    for link in listozza:
        try:
            page = session.get(link, timeout=30)
            # Surface HTTP errors (403, 404, ...) instead of silently parsing
            # an error page that contains no article text.
            page.raise_for_status()
        except requests.RequestException as err:
            # Report why this link produced no text, then keep going.
            print(f"Could not fetch {link}: {err}")
            text_from_link.append("")
            continue
        # Parse the raw bytes so BeautifulSoup can detect the page's encoding.
        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all('p')
        # If the response contains no <p> elements this is "" (the join of an
        # empty sequence).
        text_from_link.append(" ".join(p.get_text() for p in paragraphs))
#### create dataframe
# One row per scraped link: its URL and the paragraph text extracted from it.
df = pd.DataFrame.from_dict(
    {'link': listozza, 'description': text_from_link},
    orient='columns',
)
df.head(9)`
目前没有回答
相关问题 更多 >
编程相关推荐