I am trying to build a web scraper for a specific website, but for some reason I cannot connect to the site: I get an error on my end and the connection fails. Calling the site with Selenium, I can see it never connects.
As a beginner I have probably made a silly mistake, but I cannot figure out what it is. I hope you are willing to help me.
import csv
import requests
import datetime
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
captcha = input('Press Enter after bypassing Captcha')
# def get_driver():
#     driver = webdriver.Chrome()
#     return driver

def get_driver():
    # initialize options
    options = webdriver.ChromeOptions()
    # pass in headless argument to options
    options.add_argument('--headless')
    # initialize driver (options=, not the deprecated chrome_options=)
    driver = webdriver.Chrome(options=options)
    return driver
def connect_to_base(browser, page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    html = None
    links = None
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for an element with class 'result-content' to load
            # before returning True
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))
            return True
        except Exception as ex:
            connection_attempts += 1
            print(f'Error connecting to {base_url}')
            print(f'Attempt #{connection_attempts}')
    return False
def parse_html(html):
    # note: relies on the global `browser` to visit each listing while parsing
    soup = BeautifulSoup(html, 'html.parser')
    inside = soup.find_all('a', {'class': 'property-inner'})
    # collect one dict of details per listing
    output_list = []
    listing = 1
    for items in inside:
        href = items.get('href')
        url1 = href
        # dismiss the cookie banner if it is shown
        if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
            browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        connection_attempts = 0
        while connection_attempts < 3:
            try:
                browser.get(url1)
                WebDriverWait(browser, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'detail-address')))
                break  # connected, so fall through to the parsing below
            except Exception as ex:
                connection_attempts += 1
                print(f'Error connecting to {url1}')
                print(f'Attempt #{connection_attempts}')
        details = BeautifulSoup(browser.page_source, 'html.parser')
        adres_divs = details.find_all('div', {'class': 'detail-address'})
        try:
            adres = adres_divs[0].get_text(separator=',', strip=True)
        except IndexError:
            adres = 'Unknown'
        kenmerken = details.find_all('div', {'class': 'detail-tab-content kenmerken'})
        try:
            tr_kenmerken = ','.join([td.text.strip() for td in kenmerken[0].select('td.value')])
        except IndexError:
            tr_kenmerken = 'Unknown'
        waarde = details.find_all('div', {'class': 'detail-tab-content woningwaarde'})
        try:
            tr_waarde = ','.join([td.text.strip() for td in waarde[0].select('td.value')])
        except IndexError:
            tr_waarde = 'Unknown'
        informatie = {
            'adres': adres,
            'kenmerken': tr_kenmerken,
            'waarde': tr_waarde,
            'url': href
        }
        output_list.append(informatie)
        listing += 1
    return output_list
def get_load_time(article_url):
    try:
        # set headers
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000)
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as ex:
        load_time = 'Loading Error'
    return load_time
def write_to_file(output_list, filename):
    # open the file once and append each listing as a csv row;
    # fieldnames must match the keys of the dicts built in parse_html
    with open(filename, 'a', newline='') as csvfile:
        fieldnames = ['adres', 'kenmerken', 'waarde', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for row in output_list:
            writer.writerow(row)
def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to jaap')
if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
I see you changed
EC.presence_of_element_located((By.ID, {'class': 'result-content'}))
to EC.presence_of_element_located((By.CLASS_NAME, 'result-content'))
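That matters because a Selenium locator is a (By, value) tuple whose second element must be a plain string; a dict there will never match anything:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# wrong: the locator value must be a string, not a dict of attributes
# EC.presence_of_element_located((By.ID, {'class': 'result-content'}))

# right: By.CLASS_NAME takes the bare class name as a string
wait_condition = EC.presence_of_element_located((By.CLASS_NAME, 'result-content'))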
Next, you may run into the problem (depending on where the browser opens) that you have to bypass/click the javascript notice asking you to accept cookies.
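A minimal sketch of clicking that away up front, reusing the CookiesOK link class that already appears in your own code (the actual class on the live page may differ):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def dismiss_cookie_banner(browser, timeout=5):
    # click the accept link if the banner shows up within `timeout` seconds;
    # carry on quietly when no banner appears
    try:
        WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@class='CookiesOK']"))
        ).click()
    except Exception:
        pass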
However, given that the data is stored as json inside a script tag of the html, all this code seems like a lot of work. Why not simply use requests, pull the json out, convert it to a dataframe, and write the csv output:
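(The code block from the original answer was not preserved here; what follows is a minimal sketch of that idea. The regex and the `properties` variable name are assumptions; inspect the page source to find the actual script tag that holds the json.)

import re
import json
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
base_url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'

frames = []
for page in range(1, 4):
    resp = requests.get(base_url.format(page), headers=headers)
    # assumption: the listing data is assigned to a javascript variable,
    # e.g. "var properties = [...];" (check the real name in the page source)
    match = re.search(r'var\s+properties\s*=\s*(\[.*?\]);', resp.text, re.DOTALL)
    if match:
        data = json.loads(match.group(1))
        frames.append(pd.DataFrame(data))

df = pd.concat(frames, ignore_index=True)
df.to_csv('output.csv', index=False)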
and the csv file looks like:
Update: