将语料库中的频率附加到词条（tokens）

# Build a nested lookup table  word -> {pos: frequency}  from the
# tab-separated frequency list, then attach a frequency to every token.
#
# `tokens` is not defined in this snippet; the loop below assumes it is a
# list of mutable two-item [word, pos] lists -- TODO confirm against caller.
# Frequencies are kept exactly as csv yields them (strings), whereas an
# unknown word gets the integer 0.
with open('ANC-all-count.txt', 'r', errors='ignore', newline='') as f:
    # Fix: the reader used to be bound to `freqs`, which was overwritten by
    # the empty dict on the next statement, while the loop iterated over
    # the undefined name `freq_list`.
    freq_list = csv.reader(f, delimiter='\t')
    freqs = {}
    # The loop variable is `count`, not `f`, so the open file handle is not
    # shadowed while it is still being read.
    # Assumes every row has exactly three columns -- TODO confirm.
    for word, pos, count in freq_list:
        freqs.setdefault(word, {})[pos] = count

for i, (word, pos) in enumerate(tokens):
    if word not in freqs:
        # The word never occurs in the frequency list.
        tokens[i].append(0)
        continue
    if pos not in freqs[word]:
        # The word is known, but not with this tag: wrap the original
        # [word, pos] pair in an outer list and attach every frequency
        # recorded for the word.  (The original re-checked that the word is
        # in `freqs` here, which is always true in this branch.)
        tokens[i] = [tokens[i][0:2]]
        tokens[i].append(freqs[word].values())
        continue
    tokens[i].append(freqs[word][pos])

1条回答

网友

1楼 · 发布于 2024-09-26 18:06:51

TL;DR

>>> from itertools import chain
>>> from collections import Counter

>>> from nltk.corpus import brown
>>> from nltk import pos_tag, word_tokenize

# Take the first hundred tokenized sentences from the Brown corpus
# and POS-tag each of them.
>>> tagged_sents = [pos_tag(tokenized_sent) for tokenized_sent in brown.sents()[:100]]

# Sanity check: flatten the tagged sentences into one sequence and look at
# the first ten (word, tag) pairs.
>>> list(chain(*tagged_sents))[:10]
[('The', 'DT'), ('Fulton', 'NNP'), ('County', 'NNP'), ('Grand', 'NNP'), ('Jury', 'NNP'), ('said', 'VBD'), ('Friday', 'NNP'), ('an', 'DT'), ('investigation', 'NN'), ('of', 'IN')]

# Use a collections.Counter to count every (word, tag) pair.
>>> freq = Counter(chain(*tagged_sents))

# The 20 most common (word, tag) pairs. The keys are case-sensitive:
# ('the', 'DT') and ('The', 'DT') are counted separately below.
>>> dict(freq.most_common(20))
{('the', 'DT'): 128, ('.', '.'): 89, (',', ','): 88, ('of', 'IN'): 67, ('to', 'TO'): 55, ('a', 'DT'): 50, ('and', 'CC'): 40, ('in', 'IN'): 39, ('``', '``'): 35, ("''", "''"): 34, ('The', 'DT'): 28, ('said', 'VBD'): 24, ('that', 'IN'): 24, ('for', 'IN'): 22, ('be', 'VB'): 21, ('was', 'VBD'): 18, ('jury', 'NN'): 17, ('Fulton', 'NNP'): 14, ('election', 'NN'): 14, ('will', 'MD'): 14}

# All the (word, tag) pairs, from most to least common.
>>> dict(freq.most_common())

# To print out the word, pos and counts to file.
>>> with open('freq-counts', 'w') as fout:
...     for (word,pos), count in freq.most_common(20):
...         print('\t'.join([word, pos, str(count)]))

TL;DR

相关问题更多 >

编程相关推荐

热门问题

热门文章