如何让这段代码统计单词（而不是字母）的出现次数？

import os

# Word -> occurrence count; filled in by Freq().
dictionary = {}
# [count, word] pairs derived from `dictionary`; filled in by Freq().
dictionarylist = []


def make_a_listh():
    """Print the punctuation-stripped text of the files in data/training/Health/.

    After each file is read, its name, a separator and the text accumulated
    so far (this file plus every earlier one) are printed. Returns None.
    """
    path = 'data/training/'
    Heal = path + 'Health/'
    heal_files = os.listdir(Heal)
    # Raw string: the backslash is a literal character to strip, not an escape.
    punctuations = r'''!()-—[]{};:'"\,<>.|/?@#$%^&*_~'''
    no_puncth = ""
    line = "--------------------------------------------------- --------------------------"
    for j in heal_files:
        # `with` closes the handle as soon as the file has been read.
        with open(Heal + j, 'r') as handle:
            file2 = handle.read()
        # join() avoids rebuilding the string once per character.
        no_puncth += "".join(char for char in file2 if char not in punctuations)
        print(j + line, "\n", no_puncth)


def make_a_listm():
    """Return the punctuation-stripped text of the files in data/training/Minnesota/.

    Returns:
        A single str: the contents of every file, concatenated in
        os.listdir() order, with the characters in `punctuations` removed.
    """
    path = 'data/training/'
    Minn = path + 'Minnesota/'
    minn_files = os.listdir(Minn)
    # Raw string: the backslash is a literal character to strip, not an escape.
    punctuations = r'''!()—-—[]{};:’'"\,<>.|/?@“#$%^&*_~'''
    no_punctm = ""
    for i in minn_files:
        # Read the whole file so the filter below sees single characters.
        # Iterating the file object yields whole lines, and a whole line is
        # almost never a substring of `punctuations`, so nothing got stripped.
        with open(Minn + i, 'r') as handle:
            file1 = handle.read()
        no_punctm += "".join(char for char in file1 if char not in punctuations)
    return no_punctm


def Freq(file1):
    """Count whitespace-separated words and print the counts, highest first.

    Args:
        file1: An iterable of text lines, e.g. an open file or a list of
            strings. NOTE: a plain str iterates one character at a time,
            so passing one makes this count letters instead of words.

    Side effects:
        Updates the module-level `dictionary`, appends one [count, word]
        pair per dictionary entry to `dictionarylist`, and prints the
        sorted pairs one per line.
    """
    for line in file1:
        for eachword in line.split():
            # First sighting starts at 0, then every sighting adds 1.
            dictionary[eachword] = dictionary.get(eachword, 0) + 1
    # Count first so that sorting the pairs orders them by frequency.
    dictionarylist.extend([v, k] for k, v in dictionary.items())
    print("\n".join(map(str, sorted(dictionarylist, reverse=True))))


if __name__ == "__main__":
    # make_a_listm() returns one str, so Freq() iterates characters here.
    # This is the letters-instead-of-words behaviour the question asks about;
    # passing lines (e.g. the result of str.splitlines()) would count words.
    Freq(file1=make_a_listm())

1条回答

网友

1楼 · 发布于 2024-10-02 14:16:37

下面演示如何使用 collections 模块中的 Counter 类来统计词频，并用 re.sub() 更高效地处理标点：

from glob import glob
import re
from collections import Counter

# Raw string: in a normal string literal '\W' is an invalid escape sequence.
NON_WORD = re.compile(r'\W')


def extract_words(text):
    """Return the lower-cased words found in *text*.

    Each character matched by NON_WORD is replaced with a space, then the
    result is split on whitespace. Underscores count as word characters,
    and an apostrophe splits a contraction into two words.
    """
    # split() already drops empty strings and surrounding whitespace.
    return [word.lower() for word in NON_WORD.sub(' ', text).split()]


words = []

for path in glob("C:\\Users\\User\\Desktop\\Folder\\*.txt"):  # every .txt file in Folder
    with open(path, 'r') as handle:  # closed automatically after reading
        words += extract_words(handle.read())

print(Counter(words))  # maps each unique word to how often it occurs

相关问题更多 >

编程相关推荐

热门问题

热门文章