在这段代码中，我首先解析 .sgm 文件，然后对文本进行分词（tokenize），再进行词干提取（stemming），最后希望能够创建一个倒排索引（inverted index）。我还需要补充什么？
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
# SGML shards to process: reut2-000.sgm through reut2-021.sgm (22 files).
files = ["reut2-{:03d}.sgm".format(n) for n in range(22)]

# Loop-invariant helpers, built once instead of once per document.
# A set gives constant-time stop-word membership tests; the list returned
# by NLTK would be scanned linearly for every token.
stop = set(stopwords.words('english'))
ps = PorterStemmer()

for path in files:
    # The context manager closes the file even if reading or parsing raises.
    # latin-1 maps every possible byte to a character, so decoding cannot
    # fail on a stray non-UTF-8 byte.
    # NOTE(review): presumably the corpus is ASCII/latin-1 -- confirm.
    with open(path, encoding="latin-1") as content:
        soup = BeautifulSoup(content.read(), "html.parser")

    # One entry per <body> element found in the current file.
    documents = soup.find_all('body')
    for document in documents:
        # get_text() yields only the element's text; str(document) would
        # also include the literal <body>...</body> markup, which the
        # tokenizer would then emit as spurious tokens.
        tokens = nltk.word_tokenize(document.get_text())

        # Lower-case first so the comparison against the (lower-case)
        # stop-word set is case-insensitive.
        doc = [word for word in (tok.lower() for tok in tokens)
               if word not in stop]

        # Emit the Porter stem of every remaining token, one per line.
        for w in doc:
            print(ps.stem(w))
目前没有回答
相关问题 更多 >
编程相关推荐