Modeling data with a DataFrame

Published 2024-09-28 03:15:56


I am trying to train on a dataset to predict whether input text comes from a science-fiction novel. I am fairly new to Python, so I don't know exactly what I'm doing wrong.

Code:

#class17.py
"""
Created on Fri Nov 17 14:07:36 2017

@author: twaters

Read three science fiction novels
Predict a sentence or paragraph
see whether sentence/phrase/book is from a science fiction novel or not
"""

import nltk
import pandas as pd
import csv
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

#nltk.download()


irobot = "C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/irobot.txt"
enders_game = "C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/endersgame.txt"
space_odyssey ="C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/spaceodyssey.txt"
to_kill_a_mockingbird = "C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/tokillamockingbird.txt"

sr = set(stopwords.words('english'))
freq = {}

def main():
    #read_novels()
    model_novels()


def read_novel(b, is_scifi):

    read_file = open(b)

    text = read_file.read()
    words = text.split()
    clean_tokens = words[:]
    filtered_list = []

    for word in clean_tokens:
        word = word.lower()
        if word not in sr:
            filtered_list.append(word)

    freq = nltk.FreqDist(clean_tokens)
    #print(filtered_list)
    for word in clean_tokens:
       count = freq.get(word,0)
       freq[word] = count + 1



    frequency_list = freq.keys()

    with open('C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/novels_data.txt', 'w', encoding='utf-8') as csvfile:
        fieldnames = ['word','frequency','is_scifi']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator = '\n')
        writer.writeheader()

        for words in frequency_list:
            writer.writerow({'word': words,'frequency': freq[words],'is_scifi':is_scifi})

    print("List compiled.")

def read_novels(): 

    read_novel(enders_game, 0)
    read_novel(space_odyssey, 0)
    read_novel(irobot, 0)
    read_novel(to_kill_a_mockingbird, 1)

def model_novels():

    df = pd.read_csv('C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/novels_data.txt', 'rb', delimiter='\t', encoding='utf-8')
    print(df)

    #for index in range(2, df.shape[0], 100):
    df_subset = df.loc[1:]
    #print(df_subset)
    X = df_subset.loc[:, 'frequency':'is_scifi']
    Y = df_subset.loc[:, 'frequency':'is_scifi']
    testing_size = 0.2
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=testing_size, random_state=seed)

    selectedModel = LogisticRegression()
    selectedModel.fit(X_train, Y_train)  
    predictions = selectedModel.predict(X_validation)

#%%
#print("Accuracy Score:\n", accuracy_score(Y_validation, predictions))
#print("Confusion Matrix:\n",confusion_matrix(predictions, Y_validation))
#print("Class report:\n", classification_report(Y_validation, predictions))
#df_test = pd.read_csv('C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/novels_data.txt', delimiter='\t')
#predictions_test = selectedModel.predict(df_test)
#test_frame = pd.DataFrame(predictions_test)
#test_frame.to_csv('C:/Users/twaters/Desktop/Assignments/SQL/Python/DA Project/novels_data_result.txt', sep='\t')

Error:

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
    main()
  File "C:/Users/user/Desktop/Assignments/SQL/Python/DA Project/class17.py", line 36, in main
    model_novels()
  File "C:/Users/user/Desktop/Assignments/SQL/Python/DA Project/class17.py", line 95, in model_novels
    selectedModel.fit(X_train, Y_train)
  File "D:\Program Files (x86)\Anaconda\lib\site-packages\sklearn\linear_model\logistic.py", line 1216, in fit
    order="C")
  File "D:\Program Files (x86)\Anaconda\lib\site-packages\sklearn\utils\validation.py", line 573, in check_X_y
    ensure_min_features, warn_on_dtype, estimator)
  File "D:\Program Files (x86)\Anaconda\lib\site-packages\sklearn\utils\validation.py", line 453, in check_array
    _assert_all_finite(array)
  File "D:\Program Files (x86)\Anaconda\lib\site-packages\sklearn\utils\validation.py", line 44, in _assert_all_finite
    " or a value too large for %r." % X.dtype)
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

If you need access to the files I'm reading, I can link them.

Thanks for your help!


1 Answer

Here are the key lines in the stack trace that you should pay attention to:

File "C:/Users/user/Desktop/Assignments/SQL/Python/DA Project/class17.py", line 95, in model_novels: selectedModel.fit(X_train, Y_train)

File "D:\Program Files (x86)\Anaconda\lib\site-packages\sklearn\utils\validation.py", line 44, in _assert_all_finite: " or a value too large for %r." % X.dtype)

These lines say that there is something in X that logistic regression cannot accept.
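A minimal reproduction (with made-up numbers, not your novel data) shows that scikit-learn raises exactly this error as soon as a NaN reaches fit:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Tiny made-up feature matrix with one NaN, standing in for X_train
X = np.array([[1.0], [np.nan], [2.0]])
y = np.array([0, 1, 0])

try:
    LogisticRegression().fit(X, y)
except ValueError as e:
    print(e)  # the same "Input contains NaN..." ValueError as in the question
```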

You should check X_train and X to see whether they contain bad values.

This answer gives you some pointers on how to do that:

Python pandas: check if any value is NaN in DataFrame
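Following that pointer, a quick check could look like this (df here is a small made-up frame standing in for your df_subset):

```python
import numpy as np
import pandas as pd

# Hypothetical frame with a NaN in 'frequency', as your read_csv call may produce
df = pd.DataFrame({'frequency': [3.0, np.nan, 7.0], 'is_scifi': [0, 1, 0]})

# Is there any NaN anywhere in the frame?
print(df.isnull().values.any())     # True

# Which rows are the offenders?
print(df[df.isnull().any(axis=1)])
```

If this prints True on your real data, trace back to how novels_data.txt is written and read (for example, whether the writer and pd.read_csv agree on the delimiter) before calling fit.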
