Python 3, bs4, web crawler; error when connecting to too many websites

Posted 2024-10-01 00:33:42


I am trying to build a web crawler for a specific website, but for some reason I cannot connect to it: I hit my own error message saying the connection failed. Opening the site with Selenium, I see that it cannot connect either.

As a newbie I have probably made a silly mistake, but I can't tell what it is. I hope you're willing to help me.

import csv
import requests
import datetime
from time import sleep, time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# visible browser instance; used to clear the captcha by hand before scraping
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0, 0)
captcha = input('Press Enter after bypassing Captcha')

# def get_driver():
#     driver = webdriver.Chrome()
#     return driver


def get_driver():
    # initialize options
    options = webdriver.ChromeOptions()
    # pass in headless argument to options
    options.add_argument('--headless')
    # initialize driver
    driver = webdriver.Chrome(options=options)
    return driver


def connect_to_base(browser, page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    html = None
    links = None
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for the element with class 'result-content' to load
            # before returning True
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))
            return True
        except Exception as ex:
            connection_attempts += 1
            print(f'Error connecting to {base_url}')
            print(f'Attempt #{connection_attempts}')
    return False


def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # collect the links to the individual property pages
    inside = soup.find_all('a', {'class': 'property-inner'}, href=True)
    output_list = []
    for item in inside:
        href = item.get('href')
        url1 = 'https://www.jaap.nl' + href  # hrefs on the results page are relative
        # dismiss the cookie banner if it is shown (uses the module-level browser)
        if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
            browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        connection_attempts = 0
        connected = False
        while connection_attempts < 3:
            try:
                browser.get(url1)
                WebDriverWait(browser, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'detail-address')))
                connected = True
                break
            except Exception as ex:
                connection_attempts += 1
                print(f'Error connecting to {url1}')
                print(f'Attempt #{connection_attempts}')
        if not connected:
            continue  # skip this listing if the detail page never loaded

        details = BeautifulSoup(browser.page_source, 'html.parser')
        adres_divs = details.find_all('div', {'class': 'detail-address'})
        try:
            adres = adres_divs[0].get_text(separator=',', strip=True)
        except IndexError:
            adres = 'Unknown'

        kenmerken = details.find_all('div', {'class': 'detail-tab-content kenmerken'})
        try:
            tr_kenmerken = ','.join([td.text.strip() for td in kenmerken[0].select('td.value')])
        except IndexError:
            tr_kenmerken = 'Unknown'

        waarde = details.find_all('div', {'class': 'detail-tab-content woningwaarde'})
        try:
            tr_waarde = ','.join([td.text.strip() for td in waarde[0].select('td.value')])
        except IndexError:
            tr_waarde = 'Unknown'

        informatie = {
            'adres': adres,
            'kenmerken': tr_kenmerken,
            'waarde': tr_waarde,
            'url': href
        }
        output_list.append(informatie)
    return output_list


def get_load_time(article_url):
    try:
        # set headers
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000)
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as ex:
        load_time = 'Loading Error'
    return load_time


def write_to_file(output_list, filename):
    # fieldnames must match the keys of the dicts built in parse_html
    fieldnames = ['adres', 'kenmerken', 'waarde', 'url']
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for row in output_list:
            writer.writerow(row)


def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to jaap')

if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')

1 Answer

Posted 2024-10-01 00:33:42

I see you changed EC.presence_of_element_located((By.ID, {'class':'result-content'})) to EC.presence_of_element_located((By.CLASS_NAME, 'result-content')).
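For reference, the working wait as a block; a minimal sketch, assuming the results page renders a container with the class result-content, as your code already expects:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# By.ID expects a plain id string, so pairing it with a class dict never matches;
# By.CLASS_NAME takes the class name directly
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))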

Next, depending on where the browser opens, you may run into the issue of having to bypass/click the JavaScript prompt asking you to accept cookies.
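That banner can be dismissed with a guarded click; a minimal sketch, reusing the CookiesOK anchor that your own code already targets:

# click the cookie-consent link only if it is actually present
if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
    browser.find_element_by_xpath("//a[@class='CookiesOK']").click()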

However, all of that code seems like a lot of work, considering the data is stored as JSON inside a script tag in the HTML. Why not simply use requests, pull out the JSON, convert it to a DataFrame, and write it to CSV?
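The core of that idea fits in a few lines; a minimal sketch, assuming (as the full script below does) that the listing data lives in a script tag with id="page-data":

import json
import requests
from bs4 import BeautifulSoup

# fetch one results page and pull the embedded JSON out of the page-data script tag
response = requests.get('https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p1')
soup = BeautifulSoup(response.text, 'html.parser')
page_data = json.loads(soup.find('script', {'id': 'page-data'}).text)
print(page_data.keys())  # 'properties' holds the listings

The full script: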

import requests
import datetime
from time import time
from bs4 import BeautifulSoup
import json
import pandas as pd

def run_process(page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    jsonStr = soup.find('script', {'id':'page-data'}).text
    jsonData = json.loads(jsonStr)

    df = pd.json_normalize(jsonData['properties'])
    return df



if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'

    final_df = pd.DataFrame()
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        df = run_process(current_page)
        final_df = pd.concat([final_df, df], sort=True).reset_index(drop=True)
        current_page = current_page + 1
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')

Output:

Scraping page #1...
Scraping page #2...
Scraping page #3...
Elapsed run time: 7.441420555114746 seconds

And the CSV file looks like:

     app area                                         detailsUrl  expired       houseTypeValue        id  latLng  latLng.latitude  latLng.longitude location.city               location.street location.zipcode   lotSize market numberOfRooms openHouseDate openHouseTimes  openhouse                             photo      price  priceToShow showoffColor showoffCustomText showoffPhotoText  spotlight             status  veiling
0  False  165  /te-koop/noord+holland/groot-amsterdam/amsterd...    False            Herenhuis   6899666     NaN        52.368420          4.833631     AMSTERDAM         Hof van Versailles 61           1064NX       216   sale             4          None           None      False  10014EAAF8B8883668593EFAC9E5FF1C   595000.0     595000.0         None              None             None      False               Sale    False
1   True  211  /te-koop/noord+holland/groot-amsterdam/amsterd...    False          Appartement  10585731     NaN        52.327550          4.889076     AMSTERDAM                Beysterveld 35           1083KA  Onbekend   sale             4          None           None      False  E4F9E5BC7BC90B5B92C7BD8D48B7A677   925000.0     925000.0         None              None             None      False               Sale    False
2   True  111  /te-koop/noord+holland/groot-amsterdam/amsterd...    False  Dubbele bovenwoning  11731386     NaN        52.341890          4.896053     AMSTERDAM      Uiterwaardenstraat 320 2           1079DC  Onbekend   sale             5          None           None      False  AB9F45B2CD4AD7879C5A80F18092F9D4   750000.0     750000.0         None              None             None      False  SoldConditionally    False
3  False  269  /te-koop/noord+holland/groot-amsterdam/amsterd...    False            Herenhuis  11840681     NaN        52.358266          4.875508     AMSTERDAM      Korte van Eeghenstraat 4           1071ER       107   sale             9          None           None      False  A3DF2B1D426B5E4D501503C5D0E66966  3100000.0    3100000.0         None              None             None      False               Sale    False
4  False  100  /te-koop/noord+holland/groot-amsterdam/amsterd...    False         Tussenwoning  12152943     NaN        52.421245          4.899478     AMSTERDAM  Pieter A v Heijningestraat 9           1035SV        83   sale             5          None           None      False  55C6F589523FA553D67A709776DD70DD   399000.0     399000.0         None              None             None      False               Sale    False
5   True  111  /te-koop/noord+holland/groot-amsterdam/amsterd...    False          Bovenwoning  15796874     NaN              NaN               NaN     AMSTERDAM      Eerste Amstelvlietpad 20           1096GB  Onbekend   sale             3          None           None      False  AE822B627ED096310B9ECBE7756340C8  1200000.0    1200000.0         None              None             None      False               Sale    False
6   True   76  /te-koop/noord+holland/groot-amsterdam/amsterd...    False        Benedenwoning  10580650     NaN        52.346010          4.888799     AMSTERDAM       Grevelingenstraat 18 HS           1078KP  Onbekend   sale             2          None           None      False  6FD1011D917E776DCF4DA836B5FFEE3E   550000.0     550000.0         None              None             None      False  SoldConditionally    False
7  False  298  /te-koop/noord+holland/groot-amsterdam/amsterd...    False                Villa   9623182     NaN        52.330610          4.862902     AMSTERDAM                 Cannenburg 51           1081GW       651   sale             7          None           None      False  15FA170B99D4E2DEA03B6FC27E3B5B74  2495000.0    2495000.0         None              None             None      False               Sale    False
8  False  270  /te-koop/noord+holland/groot-amsterdam/amsterd...    False            Herenhuis  15791215     NaN        52.347780          5.004530     AMSTERDAM            Nico Jessekade 189           1087MR       200   sale             9          None           None      False  6EA5C0CDA0475DFC88A3A918A6B2909A  1549000.0    1549000.0         None              None             None      False  SoldConditionally    False
9  False  201  /te-koop/noord+holland/groot-amsterdam/amsterd...    False                Villa   9617942     NaN        52.377391          4.764554     AMSTERDAM               Osdorperweg 803           1067SW      1348   sale             6          None           None      False  4680429D99EC5AC47C950D57A77DF1EB   950000.0     950000.0         None              None             None      False               Sale    False

Update: the version below also follows each property's detailsUrl, reads the tables on the detail page with pd.read_html, and merges them into the listing data:

import requests
import datetime
from time import time
from bs4 import BeautifulSoup
import json
import pandas as pd

def run_process(page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    jsonStr = soup.find('script', {'id':'page-data'}).text
    jsonData = json.loads(jsonStr)


    df = pd.json_normalize(jsonData['properties'])
    root_URL = 'https://jaap.nl'
    df['detailsUrl'] = root_URL + df['detailsUrl']

    allPropDetails = pd.DataFrame()


    for idx, row in df.iterrows():
        propDetails = pd.DataFrame(index=[0])
        detailLink = row['detailsUrl']
        print('Scraping: %s' % (row['location.street']))
        dfs = pd.read_html(detailLink)  # parse every HTML table on the detail page

        for each in dfs:
            # skip tables that are entirely empty; drop all-NaN rows from the rest
            if each.isnull().all().all():
                continue
            each = each.dropna(axis=0, how='all')

            specialCase = False
            for col in list(each.columns):
                if each[col].dtypes == 'object':
                    if each[col].str.contains('Voorziening').any():
                        specialCase = True
                        break

            if specialCase:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                cols1 = list(each.iloc[2:,0])
                each = each.iloc[2:,:]
                each[1] = each[1] + ' -' + each[2]
                each = each.iloc[:,-2]
                each.index = cols1

                each = each.to_frame().T
                propRow = each
                propRow.index = [0]

                temp_df = pd.DataFrame(index=[0])
                for col in propRow.columns:
                    temp_df = temp_df.merge(propRow[col].str.split(' -', expand=True).rename(columns={0:col, 1:col+'.distance'}),left_index=True, right_index=True )
                propRow = temp_df


            else:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))

                temp_df = each.T
                cols = [ temp_df.index[0] + '_' + colName for colName in list(temp_df.iloc[0,:]) ]

                propRow = temp_df.iloc[-1,:]
                propRow.index = cols
                propRow = propRow.to_frame().T
                propRow.index = [0]

            propDetails = propDetails.merge(propRow, left_index=True, right_index=True)


        propDetails.index = [idx]    
        allPropDetails = pd.concat([allPropDetails, propDetails], sort=True)

    df = df.merge(allPropDetails, how = 'left', left_index=True, right_index=True)        
    return df



if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'

    final_df = pd.DataFrame()
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        df = run_process(current_page)
        final_df = pd.concat([final_df, df], sort=True).reset_index(drop=True)
        current_page = current_page + 1
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
