Why are some English words removed after using stop words or the nltk corpus?

Posted 2024-09-28 21:33:40


I am working with PySpark DataFrames and need to clean the data in one of the columns, shown below:

df.select('words').show(10, truncate = 100)

+----------------------------------------------------------------------------------------------------+
|                                                                                               words|
+----------------------------------------------------------------------------------------------------+
|[you, are, hereby, ordered, to, cease, and, desist, all, furthe, r, emails, to, this, address, im...|
|[content, type, text, plain, charset, utf, 8, content, transfer, encoding, quoted, printable, x, ...|
|[you, are, hereby, ordered, to, cease, and, desist, all, furthe, r, emails, to, this, address, im...|
|[, original, message, return, path, bounce, 19853e, 6fb54, visyak, 3djuno, com, cysticacneonchin,...|
|[, forwarded, message, return, pat, h, bounce, 19853e, 6fb54, visyak, 3djuno, com, cysticacneonch...|
|[, original, message, from, 248, 623, 1653, mailto, lisa, lahlahsales, com, 20, sent, tuesday, fe...|
|[2018, horse, trailer, closeouts, free, delivery, cash, back, click, here, to, view, it, online, ...|
|[, original, message, from, paypal, us, mailto, scottkahndmd, nc, rr, com, sent, 27, february, 20...|
|[2col, 1, 2, 09, client, specific, styles, outlook, a, padding, 0, force, outlook, to, provide, a...|
|[you, are, hereby, ordered, to, cease, and, desist, all, furthe, r, emails, to, this, address, im...|
+----------------------------------------------------------------------------------------------------+
only showing top 10 rows

I perform the following data cleaning steps:

from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import functions as F
import nltk
from nltk.stem import WordNetLemmatizer

remover = StopWordsRemover(inputCol='words', outputCol='words_clean')  # remove stop words
df = remover.transform(df)

# remove words with fewer than 3 characters, and drop rows left empty
df = df.withColumn("words_filtered", F.expr("filter(words_clean, x -> not(length(x) < 3))")).where(F.size(F.col("words_filtered")) > 0)

wnl = WordNetLemmatizer()

@F.udf('array<string>')
def remove_words(words):
    # keep only words whose lemma appears in the nltk words corpus
    return [word for word in words if wnl.lemmatize(word) in nltk.corpus.words.words()]

df = df.withColumn('words_final', remove_words('words_filtered'))
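
Note: nltk.corpus.words.words() returns a plain Python list, so the membership test above scans it once per token. Building a set up front keeps the same behavior but makes the lookup O(1); a minimal sketch:

import nltk
from nltk.stem import WordNetLemmatizer
from pyspark.sql import functions as F

wnl = WordNetLemmatizer()
english_vocab = set(nltk.corpus.words.words())  # build the lookup set once

@F.udf('array<string>')
def remove_words(words):
    # same filter as above, but with an O(1) set membership test
    return [word for word in words if wnl.lemmatize(word) in english_vocab]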
     

I get the output shown below:

df.select('words_final').show(10, truncate = 100)

+----------------------------------------------------------------------------------------------------+
|                                                                                         words_final|
+----------------------------------------------------------------------------------------------------+
|[hereby, ordered, cease, desist, address, immediately, authorities, provider, continued, failure,...|
|[content, type, text, plain, content, transfer, printable, apparently, yahoo, tue, return, path, ...|
|[hereby, ordered, cease, desist, address, immediately, authorities, provider, continued, failure,...|
|[original, message, return, path, bounce, received, sender, bounce, tue, pst, results, received, ...|
|[message, return, pat, bounce, received, sender, bounce, tue, pst, results, received, ass, receiv...|
|                                                       [original, message, sent, ball, subject, get]|
|[horse, trailer, free, delivery, cash, back, click, view, horse, magazine, index, option, archive...|
|[original, message, sent, subject, notification, payment, number, hello, payment, amount, payment...|
|[client, specific, styles, outlook, padding, force, outlook, provide, view, browser, button, body...|
|[hereby, ordered, cease, desist, address, immediately, authorities, provider, continued, failure,...|
+----------------------------------------------------------------------------------------------------+
only showing top 10 rows

I see that stop words (are, the, in, etc.) and many junk words (like scottkahndmd) or incomplete words (like furthe) are removed. But a few English words like emails, tuesday, february, encoding, quoted, online are also removed, and there may be more such English words being dropped.

Is there a reason for this?
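
For reference, here is a quick way to check how each of these words fares against the lemmatizer and the corpus lookup used above (assuming the nltk words and wordnet data have been downloaded):

import nltk
from nltk.stem import WordNetLemmatizer

# one-time downloads: nltk.download('words'); nltk.download('wordnet')
vocab = set(nltk.corpus.words.words())  # note: the corpus entries are case-sensitive
wnl = WordNetLemmatizer()

for w in ['emails', 'tuesday', 'february', 'encoding', 'quoted', 'online']:
    lemma = wnl.lemmatize(w)  # defaults to the noun part of speech
    print(f"{w!r} -> lemma {lemma!r}, in corpus: {lemma in vocab}")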


Tags: to, message, df, return, address, are, words, bounce
1 Answer
User
#1 · Posted 2024-09-28 21:33:40

In your case, the filtering seems to happen in a few places:

  1. StopWordsRemover removes common words such as he, she, myself, etc. Usually these words are not very useful in text models, but that depends on the task you are trying to solve
  2. Another layer of filtering is your WordNetLemmatizer - it is probably the main reason that email, encoding, etc. get removed. Try tuning it so it is less aggressive about dropping words; see the sketch after this list
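
A minimal sketch of both knobs, staying with pyspark + nltk (the keep_word helper is hypothetical, just to illustrate the idea):

import nltk
from nltk.stem import WordNetLemmatizer
from pyspark.ml.feature import StopWordsRemover

# knob 1: inspect or extend the default stop word list instead of taking it as-is
english_stops = StopWordsRemover.loadDefaultStopWords('english')
remover = StopWordsRemover(inputCol='words', outputCol='words_clean',
                           stopWords=english_stops)

# knob 2: loosen the vocabulary check - the nltk words corpus is case-sensitive
# (proper nouns such as 'Tuesday' appear capitalized), and lemmatize() defaults
# to nouns, so verb forms like 'quoted' or 'encoding' are not reduced to 'quote'/'encode'
vocab = {w.lower() for w in nltk.corpus.words.words()}
wnl = WordNetLemmatizer()

def keep_word(word):
    # accept a word if its noun or verb lemma is in the lowercased corpus
    return (wnl.lemmatize(word, pos='n') in vocab
            or wnl.lemmatize(word, pos='v') in vocab)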

Also, if you are doing NLP on Spark, I would suggest taking a look at the Spark NLP package. It offers higher performance, more functionality, and so on.
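
For example, the tokenize-and-clean part of the job could look roughly like this in Spark NLP (a sketch, assuming spark-nlp is installed and registered with the Spark session; column names are illustrative):

from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, StopWordsCleaner

document = DocumentAssembler().setInputCol('text').setOutputCol('document')
tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('token')
cleaner = StopWordsCleaner().setInputCols(['token']).setOutputCol('clean_tokens')

pipeline = Pipeline(stages=[document, tokenizer, cleaner])
# model = pipeline.fit(df); cleaned = model.transform(df)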
