I have some code that uses pandas to build a DataFrame and then writes the result to a CSV file only once, at the very end. What I would like it to do is write each iteration of the loop to the CSV as it runs, so that if an error occurs, for example the connection drops, I still get partial results.
import requests
from googlesearch import search
import csv
import pandas
from bs4 import BeautifulSoup
import numpy as np
import os
from datetime import datetime
import time

start_time = time.time()

# Accumulators for the scraped data
emptyWebPageSet = []
emptySetTitle = []
emptysetGenre = []
infoSet = []
date = []

# Read the list of film titles to look up
colnames = ['title']
data = pandas.read_csv('D:/Desktop/imdbWebScrape/mediaDataForGenreScrape.csv', names=colnames, header=None)
my_list = list(data["title"])[1:]  # drop the header row
length = len(my_list)

for film in my_list:
    filmIndex = my_list.index(film) + 1
    query = film + " imdb"
    # Take the first Google result for "<title> imdb"
    for j in search(query, tld="co.in", num=10, stop=1, pause=2):
        page = requests.get(j)
        response = page.status_code
        if response == 200:
            soup = BeautifulSoup(page.content, "lxml")
            genreData = soup.find_all("div", {"class": "subtext"})
            summaryText = soup.find("div", {"class": "summary_text"})
            summaryText = summaryText.string
            infoSet.append(summaryText)
            filmtitle = soup.find("h1")
            filmtitle = filmtitle.contents[0].strip()
            emptySetTitle.append(filmtitle)
            links = []
            genres = []
            for h in genreData:
                a = h.find_all('a')
                aLength = len(a)
                a1 = a[0]
                for b in range(0, aLength - 1):
                    r = a[b].string
                    genres.append(r)
            print(str(filmIndex) + " " + str(filmtitle))
            emptysetGenre.append(genres)
            emptyWebPageSet.append(j)

# The CSV is only written here, after the whole loop has finished
lst1 = [item[0] for item in emptysetGenre]
lst2 = [i[1] if len(i) > 1 else '' for i in emptysetGenre]
df = pandas.DataFrame({"imdbPage": emptyWebPageSet,
                       "title": emptySetTitle,
                       "genre1": lst1,
                       "info": infoSet})
df.to_csv("movieDetails.csv", encoding='utf-8', index=False)