Python中单词列表的共现矩阵

3条回答

网友

1楼 · 编辑于 2024-09-28 19:02:01

from collections import OrderedDict

document = [['A', 'B'], ['C', 'B'], ['A', 'B', 'C', 'D']]
names = ['A', 'B', 'C', 'D']

occurrences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences:
for l in document:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            occurrences[l[i]][item] += 1

# Print the matrix:
print(' ', ' '.join(occurrences.keys()))
for name, values in occurrences.items():
    print(name, ' '.join(str(i) for i in values.values()))

输出

网友

2楼 · 编辑于 2024-09-28 19:02:01

另一种选择是使用构造函数 csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])来自scipy.sparse.csr_matrix，其中data、row_ind和col_ind满足关系a[row_ind[k], col_ind[k]] = data[k]。

诀窍是通过遍历文档并创建元组列表（doc-id，word-id）来生成row_ind和col_ind。data只是一个长度相同的向量。

将文档单词矩阵与其转置相乘将得到共现矩阵。

此外，这在运行时间和内存使用方面都是有效的，因此它还应该处理大的小体。

import numpy as np
import itertools
from scipy.sparse import csr_matrix


def create_co_occurences_matrix(allowed_words, documents):
    print(f"allowed_words:\n{allowed_words}")
    print(f"documents:\n{documents}")
    word_to_id = dict(zip(allowed_words, range(len(allowed_words))))
    documents_as_ids = [np.sort([word_to_id[w] for w in doc if w in word_to_id]).astype('uint32') for doc in documents]
    row_ind, col_ind = zip(*itertools.chain(*[[(i, w) for w in doc] for i, doc in enumerate(documents_as_ids)]))
    data = np.ones(len(row_ind), dtype='uint32')  # use unsigned int for better memory utilization
    max_word_id = max(itertools.chain(*documents_as_ids)) + 1
    docs_words_matrix = csr_matrix((data, (row_ind, col_ind)), shape=(len(documents_as_ids), max_word_id))  # efficient arithmetic operations with CSR * CSR
    words_cooc_matrix = docs_words_matrix.T * docs_words_matrix  # multiplying docs_words_matrix with its transpose matrix would generate the co-occurences matrix
    words_cooc_matrix.setdiag(0)
    print(f"words_cooc_matrix:\n{words_cooc_matrix.todense()}")
    return words_cooc_matrix, word_to_id

运行示例：

allowed_words = ['A', 'B', 'C', 'D']
documents = [['A', 'B'], ['C', 'B', 'K'],['A', 'B', 'C', 'D', 'Z']]
words_cooc_matrix, word_to_id = create_co_occurences_matrix(allowed_words, documents)

输出：

allowed_words:
['A', 'B', 'C', 'D']

documents:
[['A', 'B'], ['C', 'B', 'K'], ['A', 'B', 'C', 'D', 'Z']]

words_cooc_matrix:
[[0 2 1 1]
 [2 0 2 1]
 [1 2 0 1]
 [1 1 1 0]]

网友

3楼 · 编辑于 2024-09-28 19:02:01

显然，这可以扩展到您的目的，但它执行一般操作时要记住：

import math

for a in 'ABCD':
    for b in 'ABCD':
        count = 0

        for x in document:
            if a != b:
                if a in x and b in x:
                    count += 1

            else:
                n = x.count(a)
                if n >= 2:
                    count += math.factorial(n)/math.factorial(n - 2)/2

        print '{} x {} = {}'.format(a, b, count)

相关问题更多 >

编程相关推荐

热门问题

热门文章

Python中单词列表的共现矩阵

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >