<p>我编写了一个模块，可以过滤掉重复的推文，并在此过程中删除其中的话题标签（hashtag）。</p>
<pre><code>__all__ = ['filterDuplicates']
import re
# Matches one hashtag: '#' followed by one or more ASCII letters or digits
# (case-insensitive). Underscores and non-ASCII characters end the match.
hashRegex = re.compile(r'#[a-z0-9]+', re.IGNORECASE)
# Leading / trailing whitespace patterns. filterDuplicates() now trims with
# str.strip(); the names are kept so existing references to them still resolve.
trunOne = re.compile(r'^\s+')
trunTwo = re.compile(r'\s+$')


def filterDuplicates(tweets):
    """Strip hashtags from every tweet and drop tweets whose text repeats.

    Each tweet's 'text' has all hashtags removed and surrounding whitespace
    trimmed. Two tweets count as duplicates when their cleaned texts are
    equal after collapsing runs of whitespace, so the gap left behind by a
    hashtag removed from the middle of a sentence does not hide a duplicate.

    Args:
        tweets: Iterable of dicts, each holding a string under the 'text' key.

    Returns:
        A new list with the first tweet seen for each distinct cleaned text,
        in input order. The list holds the same dict objects that were
        passed in.

    Note:
        The 'text' value of every input dict, including the ones that are
        dropped as duplicates, is overwritten in place with its cleaned form.
    """
    seen = set()  # set membership is O(1); a list made the loop quadratic
    unique = []
    for tweet in tweets:
        cleaned = hashRegex.sub('', tweet['text']).strip()
        tweet['text'] = cleaned
        # Compare on a whitespace-normalised key; the stored text itself is
        # left exactly as the hashtag removal and trim produced it.
        key = ' '.join(cleaned.split())
        if key in seen:
            continue
        seen.add(key)
        unique.append(tweet)
    return unique
if __name__ == '__main__':
    # Demo input: ids 1-2 share one text and ids 3-5 share another once the
    # hashtags have been stripped.
    sample_tweets = [
        {'text': '#yolo #swag something really annoying', 'id': 1},
        {'text': 'something really annoying', 'id': 2},
        {'text': 'thing thing thing haha', 'id': 3},
        {'text': '#RF thing thing thing haha', 'id': 4},
        {'text': 'thing thing thing haha', 'id': 5},
    ]

    # Dump the tweets as given, one dict per line.
    print(*sample_tweets, sep='\n')

    # Dump the tweets that survive filtering, one dict per line.
    print(*filterDuplicates(sample_tweets), sep='\n')
</code></pre>
<p>只需在您的脚本中导入这个模块并调用它，即可过滤推文！</p>