Python 3.5: how to avoid WinError 6 (invalid handle) in a Selenium scraper of Google results?

Posted 2024-05-20 12:28:58


My program fetches Google search results by opening a Selenium window and looking for each site's "Contact" page. Unfortunately I get the error below, which stops the program in its tracks. How can I fix it? Thanks for any help. The program is as follows:

#THIS SEARCHES FOR ANY NUMBER OF SEARCH TERMS PLUS OTHER WORDS
#AND SCRAPES FOR EMAILS THE FIRST X NUMBERS OF RESULTS
#THEN DOUBLE CHECKS WITH SELENIUM BY OPENING CONTACT LINKS AND SCRAPING WITH REGEX

import requests, re, bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time,random

def google_this_for_emails():                #Googles and gets the first few links
    search_terms = ['Barcelona','Madrid']
    added_terms = 'seleccion de personal executive search email contact? @'
    number_of_sites = 10   #NUMBER OF SITES (SEARCH RESULTS) TO PARSE FOR EMAILS
    #added_terms = 'email contact? @'
    # search_terms = ['Doctor Hemimegalencefalia HEMISFERECTOMIA','Cirugia Hemimegalencefalia HEMISFERECTOMIA','Surgery Hemimegalencephaly HEMISPHERECTOMY','chirurgia Hemimegalencefalia HEMISFERECTOMY', 'Chirurgie Hemimegalencefalia HEMISFERECTOMY']


    global scrapedEmails
    scrapedEmails = []

    #This searches for certain keywords in Google and parses results with BS
    for el in search_terms:
        webpage = 'http://google.com/search?q=' + str(el) + ' ' + str(added_terms)   #Space separates the search term from the added terms
        print('\n Searching for the terms...', el,added_terms)
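        #Browser-like User-Agent so Google is less likely to reject or alter the request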
        headers = {'User-agent':'Mozilla/5.0'}
        res = requests.get(webpage, headers=headers)
        #res.raise_for_status()

        statusCode = res.status_code
        if statusCode == 200:
            soup = bs4.BeautifulSoup(res.text,'lxml')
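            #'.r a' selects the result links in Google's (old) SERP markup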
            serp_res_rawlink = soup.select('.r a')

            dicti = []                  #This gets the href links
            for link in serp_res_rawlink:
                url = link.get('href')
                if 'pdf' not in url:
                    dicti.append(url)

            dicti_url = []              #This cleans the "url?q=" from link
            for el in dicti:
                if '/url?q=' in el:
                    result = el.replace('/url?q=', '', 1)   #strip() removes single characters, not a prefix
                    dicti_url.append(result)
            #print(dicti_url)

            global dicti_pretty_links
            dicti_pretty_links = []     #This cleans the gibberish at end of url
            for el in dicti_url[0:(number_of_sites)]:
                pretty_url = el.partition('&')[0]
                dicti_pretty_links.append(pretty_url)
            print(dicti_pretty_links)


            # for el in dicti_pretty_links:
            # #######START OF THE BS CHECK FOR EMAILS BY REGEX #################
            #     #This opens page in BS for parsing emails
            #     webpage = (el)
            #     headers = {'User-agent':'Mozilla/5.0'}
            #     res = requests.get(webpage, headers=headers)
            #
            #     statusCode = res.status_code
            #     if statusCode == 200:
            #         soup = bs4.BeautifulSoup(res.text,'lxml')
            #
            #         #This is the first way to search for an email in soup, "MO"
            #         emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
            #         mo = emailRegex.findall(res.text)
            #         print('THIS BELOW IS MO')
            #         print(mo)
            #         for el in mo:
            #             if el not in scrapedEmails:
            #                 scrapedEmails.append(el)
            #
            #         #This is the second way to search for an email in soup, "MAILTOS":
            #         mailtos = soup.select('a[href^=mailto]')
            #         print('THIS BELOW IS MAILTOS')
            #         print(mailtos)
            #
            #         dicti_cleaner = []
            #         target = re.compile(r'mailto')
            #         for el in mailtos:
            #             mo = target.search(str(el))
            #             dicti_cleaner.append(el)
            #
            #         temp = []
            #         for el in dicti_cleaner:
            #             pretty_url = str(el).partition(':')[2]
            #             second_url = str(pretty_url).partition('"')[0]
            #             temp.append(second_url)
            #
            #         for el in temp:
            #             if el not in scrapedEmails:
            #                 scrapedEmails.append(el)
            #     #######END OF THE BS CHECK FOR EMAILS BY REGEX #################


            for el in dicti_pretty_links:
            #######START OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################
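                #Note: this starts a brand-new Firefox/geckodriver process for every single result URL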
                browser = webdriver.Firefox()  #This converts page into Selenium object
                page = browser.get(el)
                time.sleep(random.uniform(0.5,1.5))
                try:                                #Tries to open "contact" link
                    contact_link = browser.find_element_by_partial_link_text('ontact')
                    if contact_link:
                        contact_link.click()
                except Exception:                   #Narrower than a bare except
                    pass    #Silently ignores pages without a "contact" link
                html = browser.page_source          #Loads up the page for Regex search
                soup = BeautifulSoup(html,'lxml')
                time.sleep(random.uniform(0.5,1.5))
                emailRegex = re.compile(r'([a-zA-Z0-9_.+]+@[a-zA-Z0-9_.+.+]+)', re.VERBOSE)
                mo = emailRegex.findall(html)
                print('THIS BELOW IS SEL_emails_MO for',el)
                print(mo)
                for email in mo:                    #"email" avoids shadowing the url variable "el"
                    if email not in scrapedEmails:  #Checks if email is already in the list, adds it if not
                        scrapedEmails.append(email)
                browser.close()
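                #Note: close() only closes the current window; quit() also ends the WebDriver session and the geckodriver process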
                #######END OF THE SELENIUM CHECK FOR "CONTACT" PAGES #################

    time.sleep(random.uniform(0.5,1.5))    #INSERTS HUMAN-LIKE RANDOM DELAY

    print(100*'-')
    print(len(search_terms),'terms have been searched, for a total of',number_of_sites,'search results each')
    print(len(search_terms)*number_of_sites,'pages have been scraped for emails.')
    print('A total of ',len(scrapedEmails),'emails have been found')
    print('These are the emails found:')
    print(scrapedEmails)
    print(100*'-')

google_this_for_emails()

This is the output of the program above:

^{pr2}$
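Not from the original program, but here is a minimal sketch of how the Selenium part could be restructured to reuse a single Firefox instance and always release it with quit(), under the assumption that the handle error is related to repeatedly starting and tearing down Firefox/geckodriver inside the loop (the function and variable names here are made up for illustration):

import re, time, random
from selenium import webdriver

def scrape_emails_with_single_browser(urls, scrapedEmails):
    emailRegex = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9.-]+')
    browser = webdriver.Firefox()               # one driver for the whole run
    try:
        for url in urls:
            try:
                browser.get(url)
                time.sleep(random.uniform(0.5, 1.5))
                try:                            # open the "contact" page if there is one
                    browser.find_element_by_partial_link_text('ontact').click()
                except Exception:
                    pass
                for email in emailRegex.findall(browser.page_source):
                    if email not in scrapedEmails:
                        scrapedEmails.append(email)
            except Exception as exc:
                print('Skipping', url, 'because of', exc)
    finally:
        browser.quit()                          # quit() (unlike close()) also ends geckodriver

The loop over dicti_pretty_links in google_this_for_emails() could then call this function once instead of creating and closing a browser per URL.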
