Tuple problem when trying to count items in a list?

Published 2024-09-30 00:42:09


I'm trying to count the number of contractions used by politicians in certain speeches. I have many speeches, but here are some example URLs:

every_link_test = ['http://www.millercenter.org/president/obama/speeches/speech-4427',
 'http://www.millercenter.org/president/obama/speeches/speech-4424',
 'http://www.millercenter.org/president/obama/speeches/speech-4453',
 'http://www.millercenter.org/president/obama/speeches/speech-4612',
 'http://www.millercenter.org/president/obama/speeches/speech-5502']

Right now I have a very rough counter: it only counts the total number of contractions used across all of these links. For example, the code below returns 79, 101, 101, 182, 224 for the five links above. However, I want to tie in filename, a variable I create below, so I'd have something like (speech_1, 79), (speech_2, 22), (speech_3, 0), (speech_4, 81), (speech_5, 42). That way, I can track the number of contractions used in each speech. My code raises the following error: AttributeError: 'tuple' object has no attribute 'split'
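For reference, this error occurs because the function returns a tuple, and tuples have no split method. A minimal sketch reproducing it (with a stand-in function and made-up placeholder values, not the real speech text):

```python
def process_url(link):
    # stand-in for processURL_short_2: returns a (text, filename) tuple
    text = "we're can't it's"
    filename = "obama_speech-4427"
    return text, filename

result = process_url("http://example.com")  # result is a tuple
try:
    result.split()  # tuples have no .split() method
except AttributeError as e:
    print(e)  # 'tuple' object has no attribute 'split'

# unpacking the tuple recovers the string, and .split() works on it
text, filename = process_url("http://example.com")
print(text.split())  # ["we're", "can't", "it's"]
```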

Here's my code:

import urllib2,sys,os
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
from multiprocessing import Pool
import re, nltk
import requests
reload(sys)

url = 'http://www.millercenter.org/president/speeches'
url2 = 'http://www.millercenter.org'

conn = urllib2.urlopen(url)
html = conn.read()

miller_center_soup = BeautifulSoup(html)
links = miller_center_soup.find_all('a')

linklist = [tag.get('href') for tag in links if tag.get('href') is not None]

# remove all items in list that don't contain 'speeches'
linkslist = [_ for _ in linklist if re.search('speeches',_)]
del linkslist[0:2]

# concatenate 'http://www.millercenter.org' with each speech's URL ending
every_link_dups = [url2 + end_link for end_link in linkslist]

# remove duplicates
seen = set()
every_link = [] # no duplicates array
for l in every_link_dups:
    if l not in seen:
        every_link.append(l)
        seen.add(l)

def processURL_short_2(l):
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)
    item_div = item_soup.find('div',{'id':'transcript'},{'class':'displaytext'})
    item_str = item_div.text.lower()

    splitlink = l.split("/")
    president = splitlink[4]
    speech_num = splitlink[-1]
    filename = "{0}_{1}".format(president, speech_num)
    return item_str, filename

every_link_test = every_link[0:5]
print every_link_test
count = 0
for l in every_link_test:
    content_1 = processURL_short_2(l)
    for word in content_1.split():
        word = word.strip(p)
        if word in contractions:
            count = count + 1        
    print count, filename

2 Answers

As the error message explains, you can't use split the way you're using it. split is for strings.

So you need to change this:

for word in content_1.split():

to this:

for word in content_1[0].split():

I picked [0] by running the code; I believe it gives the block of text you're searching. Note that indexing alone isn't enough: iterating a string directly yields single characters, so keep the .split() call to iterate over words.
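A quick, illustrative sketch of the difference between iterating a string directly and splitting it first (made-up sample text):

```python
text = "yes we can"

# iterating the string directly yields single characters
chars = [c for c in text]
print(chars[:3])  # ['y', 'e', 's']

# .split() yields whitespace-separated words
words = text.split()
print(words)  # ['yes', 'we', 'can']
```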

@TigerhawkT3 has a good suggestion, and you should follow their answer as well:

https://stackoverflow.com/a/32981533/1832539

You should save this data into a data structure, like a dictionary, instead of print count, filename. Since processURL_short_2 has been modified to return a tuple, you need to unpack it.

data = {} # initialize a dictionary
for l in every_link_test:
    count = 0 # reset the count for each speech
    content_1, filename = processURL_short_2(l) # unpack the content and filename
    for word in content_1.split():
        word = word.strip(p)
        if word in contractions:
            count = count + 1
    data[filename] = count # add this to the dictionary as filename:count

This will give you a dictionary like {'obama_4424': 79, 'obama_4453': 101, ...}, letting you easily store and access the parsed data.
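Once the counts are in a dictionary, they are easy to report. A small sketch, using made-up filenames and counts rather than real results:

```python
data = {"obama_4427": 79, "obama_4424": 101, "obama_4453": 42}

# sort by count, highest first, to see which speech used the most contractions
for filename, count in sorted(data.items(), key=lambda kv: kv[1], reverse=True):
    print("{0}: {1}".format(filename, count))
```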
