我的程序通过打开 Selenium 窗口并查找“联系人”页面来获取 Google 搜索结果。不幸的是,我遇到了下面的错误,导致程序中断运行。请问该如何解决?谢谢你的帮助。程序如下:
#THIS SEARCHES FOR ANY NUMBER OF SEARCH TERMS PLUS OTHER WORDS
#AND SCRAPES FOR EMAILS THE FIRST X NUMBERS OF RESULTS
#THEN DOUBLE CHECKS WITH SELENIUM BY OPENING CONTACT LINKS AND SCRAPING WITH REGEX
import random
import re
import time
from urllib.parse import quote_plus

import bs4
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
def google_this_for_emails():
    """Google each search term plus ``added_terms`` and scrape e-mail addresses.

    For every term, the first ``number_of_sites`` organic Google results are
    collected with requests/BeautifulSoup, then each result page is opened in
    Selenium, its "Contact" link (if any) is followed, and the final page
    source is scanned for e-mail addresses with a regex.

    Side effects:
        Sets the module-level globals ``scrapedEmails`` (list of unique
        addresses found) and ``dicti_pretty_links`` (last batch of cleaned
        result URLs), and prints progress plus a final summary.
    """
    search_terms = ['Barcelona', 'Madrid']
    added_terms = 'seleccion de personal executive search email contact? @'
    number_of_sites = 10  # number of search results (per term) to scrape

    global scrapedEmails
    scrapedEmails = []

    # Compiled once, outside all loops.  The original pattern was compiled per
    # page with a pointless re.VERBOSE flag and duplicate '.'/'+' entries in
    # the second character class; duplicates in [...] are redundant, so this
    # matches exactly the same strings.
    email_regex = re.compile(r'[a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+]+')
    headers = {'User-agent': 'Mozilla/5.0'}

    for term in search_terms:
        # quote_plus() percent-encodes the query.  The original concatenated
        # the raw strings (spaces, '?' and '@' included, and with no separator
        # between term and added_terms) straight into the URL.
        webpage = 'http://google.com/search?q=' + quote_plus(term + ' ' + added_terms)
        print('\n Searching for the terms...', term, added_terms)
        res = requests.get(webpage, headers=headers)
        if res.status_code != 200:
            continue  # skip this term on a failed request (original did the same, silently)

        soup = bs4.BeautifulSoup(res.text, 'lxml')

        # Raw hrefs of the organic results; guard against href=None and skip PDFs.
        raw_links = [a.get('href') for a in soup.select('.r a')
                     if a.get('href') and 'pdf' not in a.get('href')]

        # Google wraps each target as '/url?q=<target>&...'.  NOTE: the
        # original used el.strip('/url?q='), but str.strip removes a SET of
        # characters from both ends, not a prefix -- it mangled any URL that
        # begins or ends with '/', 'u', 'r', 'l', '?', 'q' or '='.  partition
        # cuts exactly the prefix instead.
        cleaned = [link.partition('/url?q=')[2] for link in raw_links
                   if '/url?q=' in link]

        global dicti_pretty_links
        # Drop the '&sa=...' tracking suffix from each target URL.
        dicti_pretty_links = [u.partition('&')[0]
                              for u in cleaned[:number_of_sites]]
        print(dicti_pretty_links)

        for url in dicti_pretty_links:
            ####### START OF THE SELENIUM CHECK FOR "CONTACT" PAGES #######
            browser = webdriver.Firefox()
            try:
                browser.get(url)
                time.sleep(random.uniform(0.5, 1.5))
                try:
                    # Follow a "Contact"/"contact" link when present.
                    # NOTE(review): find_element_by_partial_link_text is the
                    # pre-Selenium-4 API the file already uses; on Selenium 4+
                    # it must become find_element(By.PARTIAL_LINK_TEXT, ...).
                    contact_link = browser.find_element_by_partial_link_text('ontact')
                    if contact_link:
                        contact_link.click()
                except Exception:
                    pass  # best-effort: page simply has no contact link
                html = browser.page_source  # regex runs on the raw source
                time.sleep(random.uniform(0.5, 1.5))
                found = email_regex.findall(html)
                print('THIS BELOW IS SEL_emails_MO for', url)
                print(found)
                for email in found:
                    if email not in scrapedEmails:  # de-duplicate across pages
                        scrapedEmails.append(email)
            finally:
                # quit() (not close()) tears down the whole browser process,
                # and the finally guarantees it even when get()/page_source
                # raises -- the original leaked one Firefox per failed page.
                browser.quit()
            ####### END OF THE SELENIUM CHECK FOR "CONTACT" PAGES #######
            time.sleep(random.uniform(0.5, 1.5))  # human-like random delay

    print(100 * '-')
    print(len(search_terms), 'terms have been searched, for a total of',
          number_of_sites, 'search results each')
    print(len(search_terms) * number_of_sites,
          'pages have been scraped for emails.')
    print('A total of ', len(scrapedEmails), 'emails have been found')
    print('These are the emails found:')
    print(scrapedEmails)
    print(100 * '-')


if __name__ == '__main__':
    google_this_for_emails()
这是上面程序的输出:
[此处的错误回溯在页面抓取时丢失,未能还原]
目前没有回答
相关问题 更多 >
编程相关推荐