用于获取 GoodReads.com URL 的简单 Python 脚本出现回溯(Traceback)错误

2024-10-03 04:32:40 发布

您现在位置:Python中文网/ 问答频道 /正文

这个脚本读取图书标题列表(BookTitles.txt),在 Goodreads 网站上逐个搜索这些标题,取每个标题的第一个搜索结果,并将得到的 URL 列表写入 CSV 文件(GoodReadsBooksNew.csv)。

我得到以下错误:

iii@iii:~$ python /home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py

Traceback (most recent call last):

  File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 72, in <module>
    create_csv_file()

  File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 29, in create_csv_file
    with open('/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'w+', encoding='utf-8') as csv_file:

TypeError: 'encoding' is an invalid keyword argument for this function


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options

from pyvirtualdisplay import Display
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common import keys
import csv
import time
import json

class Book:
    """A search result: a book title paired with its Goodreads URL.

    Iterating over an instance yields the title and then the URL, so a
    ``Book`` can be turned directly into a CSV row with ``list(book)``.
    """

    def __init__(self, title, url):
        self.title, self.url = title, url

    def __iter__(self):
        # Snapshot both fields in row order: title first, URL second.
        row = (self.title, self.url)
        return iter(row)

# Goodreads home page; init_selenium() loads this address first.
# NOTE: the main loop at the bottom of the script rebinds this same
# module-level name to each scraped book URL.
url = 'https://www.goodreads.com/'

def create_csv_file(path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Create (or truncate) the output CSV and write its header row.

    Args:
        path: Destination CSV file. Defaults to the location the script
            has always written to.

    Note:
        The ``encoding`` keyword of ``open`` exists only on Python 3.
        Under Python 2 this call raises ``TypeError: 'encoding' is an
        invalid keyword argument for this function``, so run the script
        with a Python 3 interpreter.
    """
    header = ['Title', 'URL']
    # newline='' hands line-ending control to the csv module; without it
    # Windows output gets an extra carriage return (a blank line per row).
    # Plain 'w' is enough: the file is only written, never read back.
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file, delimiter=',').writerow(header)

def read_from_txt_file(path='/home/iii/AudioBookReviews/WebScraping/BookTitles.txt'):
    """Return the lines of a UTF-8 text file with trailing newlines removed.

    Args:
        path: Text file holding one book title per line. Defaults to the
            location the script has always read from.

    Returns:
        A list with one string per line, in file order. Only the trailing
        ``\\n`` is stripped; blank lines are kept as empty strings.
    """
    # The context manager closes the handle deterministically; the bare
    # open() inside a comprehension left that to the garbage collector.
    with open(path, encoding='utf-8') as titles_file:
        return [line.rstrip('\n') for line in titles_file]

def init_selenium():
    """Start Chrome and store the session in the module-level ``driver``.

    After launching the browser this loads the address held in the
    module-level ``url``, sleeps five seconds, and then opens the
    Goodreads search page.
    """
    global driver

    chrome_options = Options()
    # The original built a second, unused Options object just to carry
    # '--headless', so the flag never reached Chrome. All flags now live
    # on the one object that is actually handed to the driver.
    for flag in ('--no-sandbox', '--disable-dev-shm-usage', '--headless'):
        chrome_options.add_argument(flag)

    # 'options=' is the supported keyword; 'chrome_options=' is the
    # deprecated spelling of the same argument.
    driver = webdriver.Chrome(
        "/home/iii/AudioBookReviews/WebScraping/chromedriver",
        options=chrome_options,
    )
    driver.get(url)
    time.sleep(5)
    driver.get('https://www.goodreads.com/search?q=')

def search_for_title(title):
    """Search Goodreads for *title* and print the first result's URL.

    Opens the search page, types *title* into the field named ``q``,
    submits it with the Return key, then prints the ``href`` of the link
    found at a fixed absolute XPath in the results table.

    Args:
        title: Text to type into the search field.

    Note:
        Nothing is returned. Any exception raised by the driver when an
        element cannot be located propagates to the caller.
    """
    driver.get('https://www.goodreads.com/search?q=')

    # find_element(By.<kind>, value) is used instead of the
    # find_element_by_* helpers, which newer Selenium releases no longer
    # provide; this form works on old and new versions alike.
    search_field = driver.find_element(By.NAME, 'q')
    search_field.clear()
    search_field.send_keys(title)
    # Pressing Return submits the form; without it no results page loads.
    search_field.send_keys(keys.Keys.RETURN)

    first_hit = driver.find_element(
        By.XPATH,
        '/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/a')
    print(first_hit.get_attribute('href'))

def scrape_url():
    """Return the ``href`` of the first ``a.bookTitle`` link on the page.

    Reads from whatever page the module-level ``driver`` currently shows.

    Returns:
        The link's ``href`` attribute, or the string ``"N/A"`` when the
        page contains no ``a.bookTitle`` element.
    """
    # Imported here so the block is self-contained; the file's top-level
    # imports only bring in TimeoutException from this module.
    from selenium.common.exceptions import NoSuchElementException

    try:
        link = driver.find_element(By.CSS_SELECTOR, 'a.bookTitle')
    except NoSuchElementException:
        # Only "no such link" maps to the placeholder. The previous bare
        # except also hid Ctrl-C and genuine driver failures.
        return "N/A"
    return link.get_attribute('href')

def write_into_csv_file(vendor, path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Append one row to the output CSV.

    Args:
        vendor: Any iterable of cell values; it is materialised with
            ``list(vendor)`` and written as a single comma-delimited row.
        path: CSV file to append to. Defaults to the location the script
            has always written to.
    """
    # newline='' hands line-ending control to the csv module; without it
    # Windows output gets an extra carriage return (a blank line per row).
    with open(path, 'a', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file, delimiter=',').writerow(list(vendor))

# Script body: reset the output CSV, load the titles, start the browser,
# then look up each title and append its (title, url) row to the CSV.
create_csv_file()
titles = read_from_txt_file()
init_selenium()

try:
    for title in titles:
        search_for_title(title)
        # NOTE: this rebinds the module-level ``url``; init_selenium() has
        # already consumed the original value by this point.
        url = scrape_url()
        book = Book(title, url)
        write_into_csv_file(book)
finally:
    # Always close the browser, even if a lookup fails part-way through;
    # otherwise the Chrome and chromedriver processes are left running.
    driver.quit()

Tags: csv, from, import, url, home, title, def, driver