我正在学习 O'Reilly 出版的《Advanced Analytics with Spark》(Spark 高级数据分析)第 4 章。书中的示例是用 Scala 编写的,我在把这些代码转换成 Python 时遇到了困难。
Scala代码
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._
// Read the raw Covtype file from HDFS; every line is one comma-separated record.
val rawData = sc.textFile("hdfs:///user/ds/covtype.data")

// Parse each line into a LabeledPoint: all columns but the last are the
// features, the last column is the target.
val data = rawData.map { csvLine =>
  val fields = csvLine.split(',').map(_.toDouble)
  val (featureFields, target) = (fields.init, fields.last)
  // The target is shifted down by one (presumably the file uses 1-based
  // class ids while the classifier below expects 0-based ones — confirm).
  LabeledPoint(target - 1, Vectors.dense(featureFields))
}

// Randomly partition the records with weights 0.8 / 0.1 / 0.1 into
// training, cross-validation and test sets.
val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))

// Request caching on each of the three subsets.
Seq(trainData, cvData, testData).foreach(_.cache())
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._
import org.apache.spark.rdd._
/**
 * Scores `model` against `data` and wraps the result in a [[MulticlassMetrics]].
 *
 * Each example's feature vector is run through `model.predict`, and the
 * prediction is paired with the example's own label.
 *
 * @param model the trained decision tree used to produce predictions
 * @param data  labeled examples to evaluate the model on
 * @return metrics built from the (prediction, label) pairs
 */
def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
  val scored = data.map { point =>
    val predicted = model.predict(point.features)
    (predicted, point.label)
  }
  new MulticlassMetrics(scored)
}
// Train a decision-tree classifier on the training split: 7 target classes,
// no features declared categorical, Gini impurity, depth limit 4, 100 bins.
val model = DecisionTree.trainClassifier(
  input = trainData,
  numClasses = 7,
  categoricalFeaturesInfo = Map.empty[Int, Int],
  impurity = "gini",
  maxDepth = 4,
  maxBins = 100
)

// Evaluate on the cross-validation split and show the confusion matrix.
val metrics = getMetrics(model, cvData)
metrics.confusionMatrix
我的Python代码
(原文此处的 Python 代码块在页面提取时丢失。)
当我运行此程序时,`def help_lam(model)` 内部的方法 `def _help_lam(dataline)` 会报错,也就是在我试图通过 map 把每一行数据隐式传入该方法时:
AttributeError: 'Py4JError' object has no attribute 'message'
我认为问题出在 pyspark 的 mllib/tree.py 中的 `model.predict` 函数。
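你可以像下面这样,直接把特征向量的 RDD 传给 `model.predict`(在 PySpark 中,`predict` 不能在 RDD 的转换或动作内部调用,例如不能写在传给 map 的函数里),例如:
predictions = model.predict(data.map(lambda x: x.features))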
编辑:
对 `getMetrics` 的更新可以是:
(原文此处的代码块在页面提取时丢失。)
相关问题 更多 >
编程相关推荐