以下是我的代码:
def dataPreprocessing(data):
    """Turn raw backtick-delimited records into ``[label, tokens]`` pairs.

    Args:
        data: An object exposing ``map`` (presumably a Spark RDD of text
            lines -- confirm against the caller). Each line is split on
            the backtick character.

    Returns:
        The result of mapping every record to a two-element list:
        ``labelMap[fields[2]]`` and the list of tokens produced by
        ``jieba.cut`` over the concatenation of fields 6 and 13.

    Note:
        ``labelMap`` and ``jieba`` are resolved from the enclosing module.
        A record with fewer than 14 fields, or whose field 2 is missing
        from ``labelMap``, raises when the mapping is evaluated.
    """
    # Fields inside each raw line are separated by backticks.
    splitData = data.map(lambda line: line.split('`'))
    # jieba.cut yields tokens lazily; materialise them into a list so each
    # record carries a concrete token sequence.
    getData = splitData.map(
        lambda fields: [labelMap[fields[2]],
                        list(jieba.cut(fields[6] + fields[13]))])
    return getData
# Build DataFrames with columns ('label', 'raw') for the train and test sets.
trainSql = sqlContext.createDataFrame(dataPreprocessing(trainData)).toDF('label', 'raw')
testSql = sqlContext.createDataFrame(dataPreprocessing(testData)).toDF('label', 'raw')

# Remove the custom stop words from the 'raw' token lists.
remover = StopWordsRemover(inputCol="raw", outputCol="filtered").setStopWords(stopWord)
filteredTrain = remover.transform(trainSql)
filteredTest = remover.transform(testSql)
# Mark both filtered frames for caching before the downstream stages run.
filteredTrain.cache()
filteredTest.cache()

# Hash the filtered tokens into 20000-dimensional term-frequency vectors.
tf = HashingTF(numFeatures=20000, inputCol='filtered', outputCol='rawFeatures')
trainTf = tf.transform(filteredTrain)
testTf = tf.transform(filteredTest)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(trainTf.select('rawFeatures').take(5))

# Fit the IDF model on the training set only, then apply it to both sets.
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(trainTf)
trainIdf = idfModel.transform(trainTf)
testIdf = idfModel.transform(testTf)
trainIdf.show()
因为我处理的是中文文本,所以我使用 Python 包 jieba 来分词。“print trainTf.select('rawFeatures').take(5)”这一行的输出是正确的,但 IDF 这一步报错,错误如下:
(此处原有的错误堆栈信息在转载时丢失。)我该怎么办?
目前没有回答
相关问题 更多 >
编程相关推荐