如何让此代码计算单词而不是字母?

2024-10-02 14:16:37 发布

您现在位置:Python中文网/ 问答频道 /正文

我对python相当陌生,我正在尝试让这段代码打开txt文件,去掉文件中的标点符号,读取这些文件,创建单词列表,然后计算每个单词的出现次数。它正在计算字母的出现次数。另外,如何在其他函数中正确调用函数

import os


# create the dictionary
dictionary = {}
# create dictionary list
dictionarylist = []


def make_a_listh():
   path = 'data/training/'
   Heal = path + 'Health/'
   heal_files = os.listdir(Heal)
   # print(heal_files)
   punctuations = '''!()-—[]{};:'"\,<>.|/?@#$%^&*_~'''
   no_puncth = ""
    line = "--------------------------------------------------- 
--------------------------"
   for j in heal_files:
      file2 = open(Heal + j, 'r').read()
      for char in file2:
          if char not in punctuations:
              no_puncth = no_puncth + char
      print(j + line, "\n", no_puncth)


def make_a_listm():
    path = 'data/training/'
    Minn = path + 'Minnesota/'
    minn_files = os.listdir(Minn)
    # print the filename and a new line
    punctuations = '''!()—-—[]{};:’'"\,<>.|/?@“#$%^&*_~'''
    no_punctm = ""
    line = "--------------------------------------------------- 
    -------------------------"

    for i in minn_files:
       file1 = open(Minn + i, 'r')
       for char in file1:
         if char not in punctuations:
            no_punctm = no_punctm + char
    # print(i + line, "\n", no_punctm.replace('"',''))
    return no_punctm


def Freq(file1):
    # as long as there is a line in file loop
    for line in file1:
        # create variable to hold each word from the file
        words = line.split()
        # as long as there is a word in words loop
        for eachword in words:
            # if there is an existing word in dictionary 
        increase occurrence count
            if eachword in dictionary:
                dictionary[eachword] = dictionary[eachword] + 1
         # if there is a word that is new set count to 1
            else:
                dictionary[eachword] = 1
             # for every item (k and v) in dictionary, loop
     for k, v in dictionary.items():
      # create temporary place holder for v and k values
          temporary = [v, k]
          # (add) temporary values to dictionaryList
          dictionarylist.append(temporary)
          # print out each value from dictionaryList in. 
          descending order on new lines
          print("\n".join(map(str, sorted(dictionarylist, 
          reverse=True))))

Freq(file1=make_a_listm())

Tags: pathnoinfordictionaryifiscreate
1条回答
网友
1楼 · 发布于 2024-10-02 14:16:37

下面是如何使用collections模块中的Counter()方法,以及如何使用re.sub()更有效地处理标点:

from glob import glob
import re
from collections import Counter

words = []

for file in glob("C:\\Users\\User\\Desktop\\Folder\\*.txt"): # For every file in Folder that ends with .txt
    with open(file, 'r') as r: # Open the file in read mode
        nopunc = re.sub('\W', ' ', r.read()) # Use re.sub to replace all punctuations with spaces
        words += [w.strip().lower() for w in nopunc.split() if w.strip()] # List all the words in lower case, and add the list to words

print(Counter(words)) # prints out a dictionary with each unique word as the keys, and the frequency of those words as values

相关问题 更多 >

    热门问题