Counting word frequencies with a Python dictionary, excluding a set of "stop words" read from a second file

Published 2024-10-03 04:33:24


To be honest, I'm new to coding, but this is my last assignment of the semester and I'm completely stuck. Basically, I need to write a Python program that reads a text file containing English words and counts the word frequencies using a Python dictionary, excluding a set of "stop words" that will be read from a second file. Then, using matplotlib.pyplot, it should create a horizontal bar chart showing the 15 most common words in the input file and their counts.

I'm able to open and print out the words from the txt files, and I've converted everything to lowercase for easier comparison, removed punctuation, and split the lines into words.

What I really need help with is checking the words I pulled from the "usconst" file against the "stopwords" file. Honestly, I don't know how to do that with a dictionary. Any info on the histogram would also be great.

Here is what I have so far:

import string

def main():
    text = open("usconst.txt", "r")
    texts = open("stopwords.txt", "r")

    # loop through each line of the file for us const
    line_count = 1
    d = dict()

    # us const
    for line in text:
        print("line{} : is {}".format(line_count, line))
        line_count += 1
        line = line.translate(line.maketrans("", "", string.punctuation))
        line = line.lower()
        words = line.split()
        print("words =", words, "\n")

    # stop words
    for line in texts:
        line_count += 1
        line = line.lower()
        line = line.translate(line.maketrans("", "", string.punctuation))
        words = line.split()

    for word in words:
        if word in d:
            print("word--{}-- is already in dictionary, its value is {}".format(word, d[word]))
        else:
            d[word] = 42

2 Answers

Something like this (untested, since I don't have your files):

import string

def main():
    text = open("usconst.txt", "r")
    texts = open("stopwords.txt", "r")

    line_count = 1

    # us const
    uswords = []
    for line in text:
        print("line{} : is {}".format(line_count, line))
        line_count += 1
        line = line.translate(line.maketrans("", "", string.punctuation))
        line = line.lower()
        uswords.extend(line.split())
    print("uswords =", uswords, "\n")

    # stop words
    stopwords = []
    for line in texts:
        line_count += 1
        line = line.lower()
        line = line.translate(line.maketrans("", "", string.punctuation))
        stopwords.extend(line.split())

    counts = {}
    for word in uswords:
        if word in stopwords:
            continue
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    print(counts)

There are smarter ways to do this, but this keeps your basic approach.
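One of those smarter ways is `collections.Counter` from the standard library, which handles both the counting and the top-N selection. A rough sketch (the helper names `count_words` and `top_words` are just for illustration):

```python
import string
from collections import Counter

def count_words(lines):
    """Count lowercase, punctuation-stripped words from an iterable of lines."""
    counts = Counter()
    for line in lines:
        line = line.lower().translate(str.maketrans("", "", string.punctuation))
        counts.update(line.split())
    return counts

def top_words(text_lines, stop_lines, n=15):
    """Return the n most common words in text_lines, excluding stop words."""
    stopwords = set(count_words(stop_lines))
    counts = count_words(text_lines)
    filtered = Counter({w: c for w, c in counts.items() if w not in stopwords})
    return filtered.most_common(n)
```

With your files this would be called as `print(top_words(open("usconst.txt"), open("stopwords.txt")))`.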

First, build a set of stop words, then record the word counts from the text after normalizing them (removing punctuation, lowercasing, and so on).

Then you can tally dictionary counts only for the words that are not in the stop-word set.

I used parts of your code, but followed the approach detailed above:

import string
from collections import defaultdict

def normalize(line):
    line = line.lower()
    return line.translate(str.maketrans("", "", string.punctuation))

# create a normalized stop-word set
stop_words = set()
with open("stopwords.txt", "r") as f:
    for line in f:
        stop_words.update(normalize(line).split())

# create normalized-words count dictionary
words_count = defaultdict(int)
with open("usconst.txt", "r") as f:
    for line in f:
        for w in normalize(line).split():
            words_count[w] += 1

# list by most frequent words which are not stop-words
sorted([(k, v) for k, v in words_count.items() if k not in stop_words],
       reverse=True, key=lambda x: x[1])
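The assignment also asks for a horizontal bar chart of the 15 most common words, which neither answer covers. A minimal matplotlib sketch, assuming a `words_count` dict and `stop_words` set like the ones built above (`plot_top_words` is an illustrative name, not part of any library):

```python
import matplotlib
matplotlib.use("Agg")  # headless backend so this runs without a display; remove it to use plt.show()
import matplotlib.pyplot as plt

def plot_top_words(words_count, stop_words, n=15, outfile="top_words.png"):
    """Save a horizontal bar chart of the n most frequent non-stop words."""
    top = sorted(
        ((w, c) for w, c in words_count.items() if w not in stop_words),
        key=lambda wc: wc[1],
        reverse=True,
    )[:n]
    words = [w for w, _ in top]
    counts = [c for _, c in top]

    fig, ax = plt.subplots()
    ax.barh(words, counts)
    ax.invert_yaxis()  # most frequent word at the top
    ax.set_xlabel("count")
    ax.set_title("{} most common words".format(n))
    fig.savefig(outfile)
    return top
```

After running the code above, `plot_top_words(words_count, stop_words)` would write the chart to top_words.png.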
