TypeError: expected string or bytes-like object

Posted 2024-04-18 22:40:12


I am trying to clean up text data from a set of text files, but I keep running into this error: TypeError: expected string or bytes-like object

import os

filenames = os.listdir("/input")
raw_files = []

# read every file in the input folder; each entry becomes a list of words
for filename in filenames:
    with open(os.path.join("/input", filename)) as myfile:
        raw_files.append(myfile.read().split())

import re
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('stopwords')

stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")


def clean_sentences(text):
    tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)

    sent_list = []
    for sent in tokens:
        sent_str = ''
        for i, word in enumerate(nltk.word_tokenize(sent)):
            # nltk doesn't handle apostrophes correctly
            if word[0] == "'":
                sent_str = sent_str[:-1]

            # only adds words and digits
            if re.sub('[a-zA-Z0-9]',"", str(word)):
                sent_str += str(word.lower() + ' ')
                sent_list.append(sent_str.strip()).apply(str)

    return str(sent_list)

# takes a list of clean sentences and converts it to a list of tokens
def tokens_only(text):
    tokens = []

    for sentence in text:
        tokens.extend(sentence.split(" "))

    return tokens

# takes in text, cleans it, and returns the stem of each token
def lemma_tokens(text):
    tokens = tokens_only(str(clean_sentences(text)))
    return [stemmer.stem(token) for token in tokens]

all_lemma = []
all_tokens = []
all_sentences = []
all_sentences_label = []

for i, doc in enumerate(raw_files):

    # clean sentences    
    tmp_list= str(clean_sentences(doc))
    all_sentences.extend(tmp_list)
    for j in range(len(tmp_list)):
        all_sentences_label.append(filenames[i])

    # convert list of clean sentences to tokens
    tmp_list = tokens_only(tmp_list)
    all_tokens.extend(tmp_list)

    # gets root word for tokens in document
    all_lemma.extend(lemma_tokens(doc))

I am getting the errors shown below.

Traceback:

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\exception.py" in inner
  34.             response = get_response(request)

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
  115.                 response = self.process_exception_by_middleware(e, request)

File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
  113.                 response = wrapped_callback(request, *callback_args, **callback_kwargs)

File "C:\Users\User\waqaf\waqaf\views.py" in output4
  572.      tmp_list= str(clean_sentences(doc))

File "C:\Users\User\waqaf\waqaf\views.py" in clean_sentences
  531.      tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py" in sent_tokenize
  106.     return tokenizer.tokenize(text)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in tokenize
  1277.         return list(self.sentences_from_text(text, realign_boundaries))

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in sentences_from_text
  1331.         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in <listcomp>
  1331.         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in span_tokenize
  1321.         for sl in slices:

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _realign_boundaries
  1362.         for sl1, sl2 in _pair_iter(slices):

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _pair_iter
  318.         prev = next(it)

File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _slices_from_text
  1335.         for match in self._lang_vars.period_context_re().finditer(text):

Exception Type: TypeError at /output4
Exception Value: expected string or bytes-like object

I have looked at many similar posts, but none of them solved my problem. I have already used str() and apply(str), but it still doesn't work; I keep getting the same error.
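As far as I can tell, the same error shows up whenever nltk.sent_tokenize is given something other than a string, which makes me suspect the doc values coming out of raw_files (each one is a list of words because of read().split()). A minimal sketch of what I mean, with placeholder sample text (assuming the punkt data is downloaded):

import nltk
nltk.download('punkt')

# a plain string tokenizes fine
print(nltk.sent_tokenize("First sentence. Second sentence."))
# -> ['First sentence.', 'Second sentence.']

# a list of words, like each entry of raw_files, raises the same error
nltk.sent_tokenize(["first", "sentence.", "second", "sentence."])
# -> TypeError: expected string or bytes-like object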

