使用目录作为带有python`textblob的tfidf的输入`

#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import division, unicode_literals import math from textblob import TextBlob as tb def tf(word, blob): return blob.words.count(word) / len(blob.words) def n_containing(word, bloblist): return sum(1 for blob in bloblist if word in blob) def idf(word, bloblist): return math.log(len(bloblist) / (1 + n_containing(word, bloblist))) def tfidf(word, blob, bloblist): return tf(word, blob) * idf(word, bloblist) document1 = tb("""Today, the weather is 30 degrees in Celcius. It is really hot""") document2 = tb("""I can't believe the traffic headed to the beach. It is really a circus out there.'""") document3 = tb("""There are so many tolls on this road. I recommend taking the interstate.""") bloblist = [document1, document2, document3] for i, blob in enumerate(bloblist): print("Document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) for word, score in sorted_words: score_weight = score * 100 print("\t{}, {}".format(word, round(score_weight, 5)))

file_names = glob.glob("/path/to/foo/*") files = map(open,file_names) documents = [file.read() for file in files] [file.close() for file in files] bloblist = [documents] for i, blob in enumerate(bloblist): print("Document {}".format(i + 1)) scores = {word: tfidf(word, blob, bloblist) for word in blob.words} sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True) for word, score in sorted_words: score_weight = score * 100 print("\t{}, {}".format(word, round(score_weight, 5)))

3条回答

网友

1楼 · 编辑于 2024-09-24 02:22:11

在第一个代码示例中，用tb()的结果填充bloblist，在第二个示例中，使用tb()的输入（仅字符串）。在

尝试将bloblist = [documents]替换为bloblist = map(tb, documents)。在

您也可以像这样对文件名列表进行排序file_names = sorted(glob.glob("/path/to/foo/*"))，以使两个版本的输出匹配。在

网友

2楼 · 编辑于 2024-09-24 02:22:11

我不知道你到底想达到什么目的。您可以有一个数组并将结果附加到该数组：

scores = []
bloblist = [documents]
for i, blob in enumerate(bloblist):
  ... do your evaluation ..
  scores.append(score_weight)

print scores

网友

3楼 · 编辑于 2024-09-24 02:22:11

@annabanazzi在这里提供了一个代码片段，https://gist.github.com/sloria/6407257

import os, glob
folder = "/path/to/folder/"
os.chdir(folder)
files = glob.glob("*.txt") # Makes a list of all files in folder
bloblist = []
for file1 in files:
    with open (file1, 'r') as f:
        data = f.read() # Reads document content into a string
        document = tb(data.decode("utf-8")) # Makes TextBlob object
        bloblist.append(document)

我修改了它以供我使用（Python3）：

^{pr2}$

更新1:

我个人在使用pythonglob模块时遇到了困难，因为我经常（I）使用没有扩展名的文件名（例如01），以及（ii）希望在嵌套目录上递归。在

乍一看，“glob”方法似乎是一个简单的解决方案。但是，当我试图遍历glob返回的文件时，我经常会遇到错误（例如）

IsADirectoryError: [Errno 21] Is a directory: ...

当循环遇到glob返回的目录名（不是文件名）时。在

在我看来，只要稍加努力，以下方法就更为有效：

import os
bloblist = []

def make_corpus(input_dir):
    for root, subdirs, files in os.walk(input_dir):
        for filename in files:
            f = os.path.join(root, filename)
            print('file:', f)
            with open(os.path.join(root, filename)) as f:
                for line in f:
                    # print(line, end='')
                    bloblist.append(line)
    # print('bloblist:\n', bloblist)
    print('len(bloblist):', len(bloblist), '\n')

make_corpus('input')       ## 'input' = input dir

更新2:

最后一种方法（Linux shellfind命令，适合在python3中使用）：

import sh     ## pip install sh

def make_corpus(input_dir):
    '''find (here) matches filenames, excludes directory names'''

    corpus = []
    file_list = []
    #FILES = sh.find(input_dir, '-type', 'f', '-iname', '*.txt')    ## find all .txt files
    FILES = sh.find(input_dir, '-type', 'f', '-iname', '*')         ## find any file
    print('FILES:', FILES)                                          ## caveat: files in FILES are '\n'-terminated ...
    for filename in FILES:
        #print(filename, end='')
        # file_list.append(filename)                                ## when printed, each filename ends with '\n'
        filename = filename.rstrip('\n')                            ## ... this addresses that issue
        file_list.append(filename)
        with open(filename) as f:
            #print('file:', filename)
            #                     
            # for general use:
            #for line in f:
                #print(line)
                #corpus.append(line)
            #                     
            # for this particular example (Question, above):
            data = f.read()
            document = tb(data)
            corpus.append(document)
    print('file_list:', file_list)
    print('corpus length (lines):', len(corpus))

    with open('output/corpus', 'w') as f:                           ## write to file
        for line in corpus:
            f.write(line)

相关问题更多 >

编程相关推荐

热门问题

热门文章