这是一个脚本,用于读取图书标题列表(BookTitles.txt),在 Goodreads 网站中搜索每个标题并取第一个搜索结果,然后将得到的 URL 列表写入 CSV 文件(GoodReadsBooksNew.csv)。
运行该脚本时,我得到以下错误:
iii@iii:~$ python /home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py
Traceback (most recent call last):
  File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 72, in <module>
    create_csv_file()
  File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 29, in create_csv_file
    with open('/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'w+', encoding='utf-8') as csv_file:
TypeError: 'encoding' is an invalid keyword argument for this function
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
from pyvirtualdisplay import Display
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common import keys
import csv
import time
import json
class Book:
    """A scraped search result: a book title paired with its URL."""

    def __init__(self, title, url):
        self.title, self.url = title, url

    def __iter__(self):
        # Yield the fields in CSV column order (Title, URL) so that
        # list(book) can be passed directly to csv.writer.writerow().
        yield self.title
        yield self.url
# Goodreads landing page; init_selenium() loads it right after starting the browser.
url = 'https://www.goodreads.com/'
def create_csv_file(
        path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Create (or truncate) the output CSV and write its header row.

    Args:
        path: Destination CSV file. Defaults to the scraper's output file.

    Note:
        The ``encoding`` and ``newline`` keywords require Python 3. Under
        Python 2 the built-in ``open(name[, mode[, buffering]])`` rejects
        them with ``TypeError: 'encoding' is an invalid keyword argument
        for this function``, so run this script with ``python3``.
    """
    header = ['Title', 'URL']
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' on write produce blank rows ('\r\r\n').
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file, delimiter=',').writerow(header)
def read_from_txt_file(
        path='/home/iii/AudioBookReviews/WebScraping/BookTitles.txt'):
    """Return the book titles stored one per line in a UTF-8 text file.

    Args:
        path: Text file to read. Defaults to the scraper's title list.

    Returns:
        A list with one string per line, trailing newline removed. Blank
        lines are kept as empty strings.
    """
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(path, encoding='utf-8') as txt_file:
        return [line.rstrip('\n') for line in txt_file]
def init_selenium():
    """Start Chrome through chromedriver and open the Goodreads search page.

    The started browser is bound to the module-level ``driver`` global, which
    the other scraping functions read.
    """
    global driver

    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # NOTE(review): the previous code also built a second Options object with
    # '--headless' but never passed it to the driver, so the browser was never
    # actually headless. That dead object is removed here; add '--headless' to
    # chrome_options if a display-less run is intended.

    driver = webdriver.Chrome(
        "/home/iii/AudioBookReviews/WebScraping/chromedriver",
        chrome_options=chrome_options,
    )
    driver.get(url)
    # Fixed 5-second pause after loading the landing page before moving on.
    time.sleep(5)
    driver.get('https://www.goodreads.com/search?q=')
def search_for_title(title):
    """Submit *title* to the Goodreads search form and print the first hit.

    Uses the module-level ``driver`` and leaves it on the results page.

    Args:
        title: Book title to type into the search box.
    """
    # Imported locally so this function carries its own dependency.
    from selenium.common.exceptions import NoSuchElementException

    driver.get('https://www.goodreads.com/search?q=')
    search_field = driver.find_element_by_name('q')
    search_field.clear()
    search_field.send_keys(title)
    # Pressing RETURN submits the form; without it the query is only typed.
    search_field.send_keys(keys.Keys.RETURN)

    # This absolute XPath only feeds the diagnostic print below. A search
    # with no results (or a changed page layout) must not abort the whole
    # run, so a missing element is reported instead of raised.
    try:
        first_hit = driver.find_element_by_xpath(
            '/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/a')
    except NoSuchElementException:
        print('No search result found for: {}'.format(title))
        return
    print(first_hit.get_attribute('href'))
def scrape_url():
    """Return the href of the first ``a.bookTitle`` link on the current page.

    Reads the page currently loaded in the module-level ``driver``.

    Returns:
        The link's ``href`` attribute, or the string ``"N/A"`` when the
        lookup fails with a WebDriver error (for example, no such element).
    """
    # Imported locally so this function carries its own dependency.
    from selenium.common.exceptions import WebDriverException

    try:
        return driver.find_element_by_css_selector(
            'a.bookTitle').get_attribute('href')
    except WebDriverException:
        # Catch only browser-side failures: a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and hide programming errors.
        return "N/A"
def write_into_csv_file(
        vendor,
        path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Append one record to the output CSV.

    Args:
        vendor: Any iterable yielding the row's cell values in column order
            (for example a ``Book``, which yields title then URL).
        path: CSV file to append to. Defaults to the scraper's output file.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' on write produce blank rows ('\r\r\n').
    with open(path, 'a', encoding='utf-8', newline='') as csv_file:
        csv.writer(csv_file, delimiter=',').writerow(list(vendor))
# Script driver: write the CSV header, load the titles, then look each title
# up on Goodreads and append one (title, url) row per title.
create_csv_file()
titles = read_from_txt_file()
init_selenium()
try:
    for title in titles:
        search_for_title(title)
        # Use a distinct name so the module-level ``url`` constant (the
        # Goodreads landing page) is not overwritten by a scraped link.
        book_url = scrape_url()
        book = Book(title, book_url)
        write_into_csv_file(book)
finally:
    # Always shut the browser down, even if a lookup fails part-way, so no
    # Chrome/chromedriver processes are left running.
    driver.quit()
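我认为您使用的是 Python 2.7 版本。Python 2.7 中的 open 函数具有以下签名:open(name[, mode[, buffering]]),它不接受 encoding 关键字参数,因此会抛出上述 TypeError。
另一方面,Python 3+ 中的 open 函数具有以下签名:open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None),支持 encoding 参数;请改用 Python 3 运行该脚本(例如 python3 GoodreadsScraper.py)。
相关问题 更多 >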
编程相关推荐