我对 TensorFlow/机器学习比较陌生,因此遇到了一些困难。我有一个 CSV 格式的数据集(见此处),想按照这个示例(见此处)的方式用 pandas 读取它。这段代码原本在另一个数据集上可以正常运行,我对它做了修改和扩展,但我觉得自己在这里遗漏了一些重要的东西。基本上,我想做的就是预测给定数据集的“overall”(总体)评分。下面是我的代码和回溯:
import pandas as pd
import tensorflow as tf
import tempfile
# Column names assigned to the CSV data, in file order. The files' own header
# row is discarded (skiprows=1) and replaced by these names.
COLUMNS = [
    "reviewerID", "asin", "reviewerName", "helpful_0", "helpful_1",
    "reviewText", "overall", "summary", "unixReviewTime",
]
# Columns that input_fn wraps in a tf.SparseTensor.
CATEGORICAL_COLUMNS = ["reviewerID", "reviewerName", "reviewText", "summary"]
# Columns that input_fn wraps in a dense tf.constant.
CONTINUOUS_COLUMNS = ["helpful_0", "helpful_1", "unixReviewTime"]

df_train = pd.read_csv(
    'Digital_Music_5.csv', names=COLUMNS, skipinitialspace=True,
    low_memory=False, skiprows=1)
df_test = pd.read_csv(
    'Digital_Music_5_test.csv', names=COLUMNS,
    skipinitialspace=True, skiprows=1)

# The "overall" rating is the prediction target; copy it into a dedicated
# label column on each frame.
LABEL_COLUMN = "label"
df_train[LABEL_COLUMN] = df_train["overall"]
# Each frame must take its label from its OWN "overall" column. Assigning
# df_train["overall"] here would align by index and attach training ratings
# (or NaN, where the index has no match) to the test rows.
df_test[LABEL_COLUMN] = df_test["overall"]
print(df_train)
def input_fn(df):
    """Build the ``(features, label)`` tensors for one DataFrame.

    Args:
        df: DataFrame holding every column named in CONTINUOUS_COLUMNS and
            CATEGORICAL_COLUMNS, plus the LABEL_COLUMN column.

    Returns:
        A ``(feature_cols, label)`` tuple. ``feature_cols`` maps each column
        name to a constant Tensor (continuous columns) or to a
        tf.SparseTensor of shape ``[num_rows, 1]`` (categorical columns);
        ``label`` is a constant Tensor of the LABEL_COLUMN values.
    """
    # Continuous features: one dense constant Tensor per column.
    continuous_cols = {
        k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS
    }

    # Categorical features: one single-column SparseTensor per column, with
    # exactly one value per row.
    num_rows = len(df.index)
    row_indices = [[i, 0] for i in range(num_rows)]
    categorical_cols = {}
    for k in CATEGORICAL_COLUMNS:
        # pd.read_csv maps empty cells to NaN (a float), which cannot be
        # converted to a string tensor. Replace missing values with "" and
        # coerce everything to str so the column is uniformly string-typed.
        values = df[k].fillna("").astype(str).values
        categorical_cols[k] = tf.SparseTensor(
            indices=row_indices,
            values=values,
            dense_shape=[num_rows, 1],
        )

    # Merge the two dictionaries into one feature mapping.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)

    # The label column becomes a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    return feature_cols, label
def train_input_fn():
    """Return the ``(features, label)`` pair built from the training frame."""
    features, label = input_fn(df_train)
    return features, label
def eval_input_fn():
    """Return the ``(features, label)`` pair built from the test frame."""
    features, label = input_fn(df_test)
    return features, label
# String-valued columns are hashed into a fixed number of buckets.
reviewText = tf.contrib.layers.sparse_column_with_hash_bucket(
    "reviewText", hash_bucket_size=100000)
reviewerID = tf.contrib.layers.sparse_column_with_hash_bucket(
    "reviewerID", hash_bucket_size=100000)
reviewerName = tf.contrib.layers.sparse_column_with_hash_bucket(
    "reviewerName", hash_bucket_size=100000)
summary = tf.contrib.layers.sparse_column_with_hash_bucket(
    "summary", hash_bucket_size=100000)

# Numeric columns are used as-is.
# NOTE: `asin` stays defined but is deliberately NOT passed to the model
# below: input_fn only emits the columns listed in CATEGORICAL_COLUMNS and
# CONTINUOUS_COLUMNS, and "asin" is in neither, so the model would be asked
# for a feature it never receives.
# NOTE(review): asin looks like a product identifier rather than a number;
# to use it, it probably needs a hashed sparse column and an entry in
# CATEGORICAL_COLUMNS -- confirm against the data.
asin = tf.contrib.layers.real_valued_column("asin")
helpful_0 = tf.contrib.layers.real_valued_column("helpful_0")
helpful_1 = tf.contrib.layers.real_valued_column("helpful_1")
unixReviewTime = tf.contrib.layers.real_valued_column("unixReviewTime")

# reviewText_x_summary = tf.contrib.layers.crossed_column([reviewText, summary], hash_bucket_size=100000)
# reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewerID, reviewerName], hash_bucket_size=100000)
# reviewText_x_reviewerID_x_reviewerName = tf.contrib.layers.crossed_column([reviewText, reviewerID, reviewerName], hash_bucket_size=100000)

# Checkpoints and summaries go to a fresh temporary directory.
model_dir = tempfile.mkdtemp()

# NOTE(review): LinearClassifier defaults to n_classes=2 and expects integer
# class indices in [0, n_classes). The "overall" ratings presumably span
# 1-5, which would need n_classes=6 (or shifted labels) together with
# integer-typed labels -- confirm and adjust alongside the label pipeline.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        reviewText, reviewerName, summary,
        helpful_0, helpful_1, unixReviewTime,
    ],
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0),
    model_dir=model_dir)
m.fit(input_fn=train_input_fn, steps=200)
# results = m.evaluate(input_fn=eval_input_fn, steps=1)
# for key in sorted(results):
# print("{}: {}".format(key, results[key]))
回溯:
^{pr2}$
您的输入 DataFrame 中包含空的评论者姓名(reviewerName)和评论文本(reviewText),`pd.read_csv()` 会把这些空单元格映射为 NaN,而 TensorFlow 需要的是字符串而不是 NaN。可以使用以下命令检查空单元格:
您可以使用以下命令将这些 NaN 转换为空字符串:
^{pr2}$
或者让 `pd.read_csv()` 直接使用 `na_values=[]`(并同时设置 `keep_default_na=False`,否则默认的 NaN 识别规则仍然生效)来生成空字符串而不是 NaN。
相关问题 更多 >
编程相关推荐