<p>下面是一个简单的实现:</p>
<pre><code>from collections import defaultdict
import nltk
def is_dialogue(line):
    """Return True when ``line`` looks like a line of spoken dialogue.

    A line counts as dialogue when it is non-empty and contains neither
    ``[`` nor ``]`` (presumably bracketed text marks stage directions;
    confirm against the script format in use).

    Args:
        line: A single line of the script, as a string.

    Returns:
        bool: True for a dialogue line, False otherwise.
    """
    # Add more rules here to refine what counts as dialogue.
    # Always return a real bool rather than falling through to None.
    return bool(line) and '[' not in line and ']' not in line
def get_dialogues(filename, people_list):
dialogues = defaultdict(list)
people_list = map(lambda x: x+':', people_list)
current_person = None
with open(filename) as fin:
for line in fin:
current_line = line.strip().replace('\n','')
if current_line in people_list:
current_person = current_line
if (current_person is not None) and (current_line != current_person) and is_dialogue(current_line):
dialogues[current_person].append(current_line)
return dialogues
def get_word_counts(dialogues):
    """Count how often each token occurs in every speaker's dialogue.

    Each dialogue line is split with ``nltk.tokenize.word_tokenize`` and
    the resulting tokens are tallied per speaker.

    Args:
        dialogues: Mapping of speaker to a list of dialogue strings, as
            returned by ``get_dialogues``.

    Returns:
        defaultdict: Maps each speaker to a ``collections.Counter`` of
        token frequencies. ``Counter`` is a ``dict`` subclass, so the
        result can be used wherever the plain per-speaker dict was. A
        speaker with no dialogue lines still gets an (empty) entry.
    """
    # Imported locally so this function does not rely on edits to the
    # module-level import block.
    from collections import Counter

    word_counts = defaultdict(Counter)
    for person, dialogue_list in dialogues.items():
        # Indexing creates the speaker's entry even when there is nothing
        # to count, so every speaker appears in the result.
        person_counts = word_counts[person]
        for dialogue in dialogue_list:
            person_counts.update(nltk.tokenize.word_tokenize(dialogue))
    return word_counts
if __name__ == '__main__':
    # Demo run: gather the lines spoken by three characters in script.txt
    # and print how often each of them uses every token.
    dialogues = get_dialogues('script.txt', ['Sampson', 'Gregory', 'Abraham'])
    word_counts = get_word_counts(dialogues)
    # Parenthesised single-argument form prints identically on Python 2
    # and is valid syntax on Python 3, unlike the bare print statement.
    print(word_counts)
</code></pre>