从列表中删除字符串的unicode标记，并使每个项位于单独的行上

def getActors(item_url):
    """Fetch a movie page and return its actor names wrapped in an outer list.

    Looks up the text node "Actors:", walks to the enclosing ``<tr>`` and
    collects every text node in that row except the first one.

    Returns a one-element list: either ``[[name, ...]]`` when the row is
    found, or ``["n/a"]`` when the lookup fails.
    """
    page = requests.get(item_url)
    # "html5lib" can be passed instead of "lxml" as the parser.
    soup = BeautifulSoup(page.content, "lxml")
    try:
        label = soup.find(text="Actors:")
        row = label.find_parent("tr")
        names = row.find_all(text=True)[1:]
    except AttributeError:
        # find() / find_parent() returned None, so there is no actors row.
        return ["n/a"]
    return [names]

[u'Jennifer Lawrence', u'Josh Hutcherson', u'Liam Hemsworth', u'Elizabeth Banks', u'Stanley Tucci', u'Woody Harrelson', u'Philip Seymour Hoffman', u'Jeffrey Wright', u'Jena Malone', u'Amanda Plummer', u'Sam Claflin', u'Donald Sutherland', u'Lenny Kravitz'] [u'Robert Downey, Jr.', u'Gwyneth Paltrow', u'Don Cheadle', u'Guy Pearce', u'Rebecca Hall', u'James Badge Dale', u'Jon Favreau', u'Ben Kingsley', u'Paul Bettany*', u' ', u'(Voice)', u'Mark Ruffalo*', u' ', u'(Cameo)']

def spider(max_pages):
    """Walk the 2013 yearly chart pages and collect the actors of each movie.

    Requests chart pages 1 through ``max_pages``, follows every movie link
    matched by the selector on each page, and appends the result of
    ``getActors`` for that movie to ``listOfActors`` (expected to be defined
    elsewhere at module level).
    """
    page = 1
    while page <= max_pages:
        chart_url = (
            'http://www.boxofficemojo.com/yearly/chart/?page='
            + str(page)
            + '&view=releasedate&view2=domestic&yr=2013&p=.htm'
        )
        html = requests.get(chart_url).text
        soup = BeautifulSoup(html)
        movie_links = soup.select('td > b > font > a[href^=/movies/?]')
        for anchor in movie_links:
            # hrefs on the chart are site-relative, so prepend the host.
            movie_url = 'http://www.boxofficemojo.com' + anchor.get('href')
            listOfActors.append(getActors(movie_url))
        page += 1

1条回答

网友

1楼 · 发布于 2024-09-29 23:24:11

首先，您应该将getActors的当前实现更改为下面的版本。当前实现返回的是一个列表的列表，而下面的版本返回单个列表。

def getActors(item_url):
    """Return the actor names found on a movie page as a single flat list.

    Downloads ``item_url``, locates the text node "Actors:", and returns every
    text node of the enclosing ``<tr>`` except the first one.

    Args:
        item_url: URL of the movie page to fetch.

    Returns:
        A list of strings taken from the actors row, or ``['n/a']`` when the
        "Actors:" label or its table row cannot be found.
    """
    response = requests.get(item_url)
    soup = BeautifulSoup(response.content, "lxml")  # or BeautifulSoup(response.content, "html5lib")
    try:
        return soup.find(text="Actors:").find_parent("tr").find_all(text=True)[1:]
    except AttributeError:
        # find() or find_parent() returned None: no actors row on this page.
        return ['n/a']

然后，在将getActors返回的多个列表收集到一个名为listOfActors的列表的列表之后，您可以像下面这样将它们全部写入csv文件

import csv

# Write one CSV row per movie, one actor name per column.
# csv.writer quotes fields that contain commas (e.g. 'Robert Downey, Jr.'),
# which plain string concatenation would split into two columns, and it does
# not leave a trailing comma at the end of each row.
# The `with` block closes the file even if a write fails part-way through.
with open('csv.csv', 'w') as out:
    # Keep '\n' as the row terminator, matching the original output.
    writer = csv.writer(out, lineterminator='\n')
    for actors in listOfActors:
        writer.writerow(actors)

这样各个值之间会用逗号分隔。另外，Python会自动处理unicode字符串。

相关问题更多 >

编程相关推荐

热门问题

热门文章