如何用 Sastrawi 提取印尼语单词的词干

2024-10-03 23:23:23 发布

您现在位置:Python中文网/ 问答频道 /正文

我对存储在 vaksinsampel2.csv 文件中的推文(tweet)数据进行了预处理。我已经完成了文本清理、大小写折叠、分词、停用词去除和规范化这几个步骤,但词干提取这一步做不出来,请帮我解决。代码如下:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Load the raw tweets; the file is decoded with the 'unicode_escape' codec.
df = pd.read_csv(
    'vaksinsampel2.csv',
    encoding='unicode_escape',
)

def remove_punct(tweet):
    """Strip URLs, punctuation (including ``#``) and digits from a tweet.

    URLs are removed first, while they are still intact. Running the URL
    pattern after punctuation stripping (as the previous version did) only
    removed the bare ``http``/``https`` prefix, because ``://`` and ``.`` had
    already been turned into spaces, so URL fragments leaked into the text.

    Args:
        tweet: Raw tweet text. Non-string values are converted with ``str()``.

    Returns:
        str: The text reduced to ASCII letters and spaces. Every removed span
        is replaced by a space, so runs of spaces may remain.
    """
    tweet = str(tweet)
    # Must run before punctuation stripping so the whole URL is still one
    # whitespace-free span that ``\S+`` can consume.
    tweet = re.sub(r'http\S+', ' ', tweet)
    # Everything that is not an ASCII letter, digit or space becomes a space;
    # this also covers the '#' of hashtags, leaving the tag word itself.
    tweet = re.sub(r'[^a-zA-Z0-9 ]', ' ', tweet)
    tweet = re.sub(r'[0-9]+', ' ', tweet)
    return tweet

# Clean every raw tweet, then keep a lower-cased copy in its own column.
df['TEXT'] = df['full_text'].apply(remove_punct)

df['case_folding'] = df['TEXT'].str.lower()

def tokenization(tweet):
    """Split text into word tokens.

    The text is split on runs of non-word characters. Empty strings, which
    ``re.split`` produces when the text starts or ends with a separator, are
    dropped so that later steps never see ``''`` as a token.

    Args:
        tweet: Text to split.

    Returns:
        list[str]: The non-empty tokens, in their original order.
    """
    return [token for token in re.split(r'\W+', tweet) if token]
# Lower-case the cleaned text and split it into tokens, one list per tweet.
df['Tokenization'] = df['TEXT'].str.lower().apply(tokenization)
df.head(10)

# Indonesian stop-word list from the NLTK stopwords corpus.
stopword = stopwords.words('indonesian')

def remove_stopwords(tweet):
    """Return the tokens of ``tweet`` that are not in the ``stopword`` list.

    Args:
        tweet: Iterable of tokens for one tweet.

    Returns:
        list: The remaining tokens, in their original order.
    """
    kept = []
    for token in tweet:
        if token in stopword:
            continue
        kept.append(token)
    return kept

# Drop stop words from each tweet's token list.
df['Stopword_Removal'] = df['Tokenization'].apply(remove_stopwords)
df.head(10)

# Per-path cache of (slang dictionary, compiled pattern) so the file is read
# and the pattern built once instead of once per tweet.
_SLANG_RESOURCES = {}


def _load_slang_resources(slang_path):
    """Load the slang dictionary at ``slang_path`` and build its lookup pattern.

    Args:
        slang_path: Path of a text file containing a Python dict expression
            that maps slang words to their standard form (e.g. kpn -> kapan).

    Returns:
        tuple: ``(dictionary, pattern)``. ``pattern`` is ``None`` when the
        dictionary is empty. The result is cached per path, so later changes
        to the file are not picked up within the same process.
    """
    if slang_path not in _SLANG_RESOURCES:
        with open(slang_path) as handle:
            # NOTE(security): eval() executes whatever expression the file
            # contains. Only use a trusted dictionary file; if the file is a
            # plain dict literal, ast.literal_eval is the safer choice.
            kamus_slangword = eval(handle.read())
        pattern = None
        if kamus_slangword:
            # Keys are escaped so regex metacharacters in a slang word are
            # matched literally, and there is no stray space after '(' (the
            # old pattern had one, so its first key could never match a token).
            alternatives = '|'.join(re.escape(key) for key in kamus_slangword)
            pattern = re.compile(r'\b(' + alternatives + r')\b')
        _SLANG_RESOURCES[slang_path] = (kamus_slangword, pattern)
    return _SLANG_RESOURCES[slang_path]


def normalisasi(tweet, slang_path="slang_indonesia.txt"):
    """Replace slang words in a token list with their standard form.

    Args:
        tweet: Iterable of token strings for one tweet.
        slang_path: Path of the slang dictionary file. Defaults to the file
            name the script has always used.

    Returns:
        list[str]: The tokens with whole-word slang matches replaced, each
        lower-cased after replacement.
    """
    kamus_slangword, pattern = _load_slang_resources(slang_path)
    if pattern is None:
        # Empty dictionary: nothing to replace, only lower-case the tokens.
        return [kata.lower() for kata in tweet]
    return [
        pattern.sub(lambda match: kamus_slangword[match.group()], kata).lower()
        for kata in tweet
    ]
# Normalise slang in each tweet's remaining tokens.
df['Normalization'] = df['Stopword_Removal'].apply(normalisasi)
df.head(10)

factory = StemmerFactory()
stemming = factory.create_stemmer()


def stem_list(tweet):
    """Stem every token of one tweet with the Sastrawi stemmer.

    The previous version ignored its argument and passed the whole
    ``df['Normalization']`` column to ``stemming.stem`` for every row. The
    stemmer works on a single string, so each token is stemmed individually.

    Args:
        tweet: Iterable of token strings for a single tweet.

    Returns:
        list: One stemmer result per input token, in the original order.
    """
    return [stemming.stem(token) for token in tweet]


# Apply on the token column itself so ``stem_list`` receives one token list
# per tweet (not a whole DataFrame row).
df['Stemming'] = df['Normalization'].apply(stem_list)
df.head(50)

Tags: csv lambda from import re df return def