如何拆分单词列表中的字符以统计二元组（bigram）频率

from collections import Counter

import pandas as pd

CMU_data = pd.read_csv("CMU.csv")  # opening the csv file
transcript = CMU_data["Transcription"]  # storing transcriptions column as a variable


def converter(x):
    """Return a Series as a tuple of its values; pass anything else through."""
    # converting dataframe column from series to tuple
    if isinstance(x, pd.Series):
        return tuple(x.values)
    return x


transcript2 = transcript.apply(converter).unique()
print(transcript2)

# finding bigrams
# NOTE: each element of `data` is one whole transcription, so this pairs
# ADJACENT TRANSCRIPTIONS (concatenated with +), not adjacent sounds inside
# a transcription.
data = transcript2
bigrams = Counter(x + y for x, y in zip(data, data[1:]))
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)

# P OY1 N T # # S L AE1 SH # = 1 # S L AE1 SH # # TH R IY1 D IY2 # = 1 # TH R IY1 D IY2 # # K OW1 L AH0 N # = 1 # K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N # = 1 # S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N # = 1 # S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K # = 1 # K W EH1 S CH AH0 N M AA1 R K # # AH0 # = 1 # AH0 # # EY1 # = 1 # EY1 # # EY1 Z # = 1 # EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T # = 1 (...)

1条回答

网友

1楼 · 发布于 2024-09-28 23:26:11

这里有一种方法：

from nltk.util import ngrams 
from collections import Counter
import pandas as pd


# Sample transcriptions: space-separated tokens, each entry starting and
# ending with '#' (note the trailing space inside every string).
inp = [
    '# P OY1 N T # ',
    '# S L AE1 SH # ',
    '# TH R IY1 D IY2 # ',
    '# L EH1 F T B R EY1 S # ',
    '# OW1 P EH0 N B R EY1 S # ',
    '# K L OW1 Z B R EY1 S # ',
]

def tokenise(s):
    """Split a '#'-delimited transcription into tokens, fusing the boundary marks.

    The first two whitespace-separated pieces are joined into one token and
    the last two are joined into one token, so '# P OY1 N T #' becomes
    ['# P', 'OY1', 'N', 'T #'].

    Args:
        s: Transcription string of whitespace-separated pieces.

    Returns:
        A new list of tokens. An empty or whitespace-only string gives [].
        A string with three or fewer pieces gives a single token holding all
        of them joined by single spaces (e.g. '# AH0 #' -> ['# AH0 #']).
    """
    # split() with no argument collapses runs of whitespace, so a doubled
    # space cannot produce an empty '' token the way split(' ') would.
    toks = s.split()
    if not toks:
        return []
    if len(toks) <= 3:
        # Too short to have separate first and last tokens: the leading and
        # trailing merges overlap, so everything ends up in one token.
        return [' '.join(toks)]
    # Join starting # with second element
    head = ' '.join(toks[:2])
    # Join penultimate element with end #
    tail = ' '.join(toks[-2:])
    return [head] + toks[2:-2] + [tail]

def count_ngrams(tups,n):
    """Tally n-grams into a frequency table.

    Args:
        tups: Iterable of hashable n-grams (e.g. tuples of tokens).
        n: N-gram order. Not used by the body; kept in the signature.

    Returns:
        A DataFrame with columns 'bigram' and 'count', ordered by descending
        count, with the index renumbered from 0.
    """
    tally = Counter(tups)
    table = pd.DataFrame(tally.items(), columns=['bigram', 'count'])
    ranked = table.sort_values(by='count', ascending=False)
    return ranked.reset_index(drop=True)

def counts(inp,n,unit='sound'):
    """Count n-grams over a list of transcription strings.

    Args:
        inp: Sequence of transcription strings.
        n: N-gram order (2 for bigrams).
        unit: 'sound' tokenises every string with tokenise() and builds
            n-grams inside each string separately, so no n-gram spans two
            strings. 'word' builds n-grams over the entries of inp
            themselves, i.e. over adjacent whole strings.

    Returns:
        The frequency table produced by count_ngrams().

    Raises:
        ValueError: If unit is neither 'sound' nor 'word'.
    """
    if unit == 'sound':
        # One flat list of n-gram tuples across all tokenised strings.
        tups = [gram for s in inp for gram in ngrams(tokenise(s), n)]
    elif unit == 'word':
        tups = list(ngrams(inp, n))
    else:
        # Without this branch `tups` stays unbound and the call fails later
        # with an unhelpful UnboundLocalError.
        raise ValueError("unit must be 'sound' or 'word', got %r" % (unit,))

    return count_ngrams(tups, n)

声音二元组（bigram）计数

# Bigram frequencies over the sound tokens inside each transcription.
counts(inp, 2, unit='sound')

#          bigram  count
# 0    (EY1, S #)      3
# 1      (R, EY1)      3
# 2        (B, R)      3
# 3    (# P, OY1)      1
# 4        (T, B)      1
# 5      (OW1, Z)      1
# 6      (L, OW1)      1
# 7      (# K, L)      1
# 8        (N, B)      1
# 9      (EH0, N)      1
# 10     (P, EH0)      1
# 11   (# OW1, P)      1
# 12       (F, T)      1
# 13     (OY1, N)      1
# 14     (EH1, F)      1
# 15   (# L, EH1)      1
# 16   (D, IY2 #)      1
# 17     (IY1, D)      1
# 18     (R, IY1)      1
# 19    (# TH, R)      1
# 20  (AE1, SH #)      1
# 21     (L, AE1)      1
# 22     (# S, L)      1
# 23     (N, T #)      1
# 24       (Z, B)      1

单词二元组（bigram）计数

# Bigram frequencies over adjacent whole transcriptions.
counts(inp, 2, unit='word')

#                                               bigram  count
# 0                  (# P OY1 N T # , # S L AE1 SH # )      1
# 1             (# S L AE1 SH # , # TH R IY1 D IY2 # )      1
# 2    (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # )      1
# 3  (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E...      1
# 4  (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E...      1

相关问题更多 >

编程相关推荐

热门问题

热门文章