<p>我建议将代码修改如下:</p>
<pre><code>import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def predict(l, ratings=None):
    """Predict a score for every movie as a similarity-weighted average.

    Parameters
    ----------
    l : pandas.Series
        Similarity weights indexed by userId. The caller decides which
        neighbours to pass in.
    ratings : pandas.DataFrame, optional
        Rating matrix with one column per userId; it must contain every
        label in ``l.index``. Defaults to the module-level ``df``, looked
        up when the function is called.

    Returns
    -------
    pandas.Series
        One predicted score per row of ``ratings``.

    Notes
    -----
    No guard is applied when the weights sum to zero.
    """
    if ratings is None:
        # Fall back to the module-level rating matrix.
        ratings = df
    # Scale each neighbour's column by its similarity, add the columns up
    # row by row (``sum`` skips missing ratings), then normalise by the
    # total weight.
    return (ratings[l.index] * l).sum(axis=1) / l.sum()
# Use userId as columns for convenience when interpreting the formula.
df = pd.read_csv('ratings.csv').pivot(columns='userId',
                                      index='movieId',
                                      values='rating')
# Centre every user's ratings on that user's own mean (column-wise).
df = df - df.mean()
# User-by-user cosine similarity; missing ratings count as 0 after centring.
similarity = pd.DataFrame(cosine_similarity(df.T.fillna(0)),
                          index=df.columns, columns=df.columns)

# Build one row of recommendations per user explicitly. Relying on
# ``DataFrame.apply`` to return a Series of lists is fragile: newer pandas
# versions assemble equal-length list results into a DataFrame instead,
# which transposes and mislabels the final table.
top_movies = {}
for user_id, user_ratings in df.items():
    # The five most similar *other* users. Drop the user by label rather
    # than assuming their self-similarity always sorts first.
    neighbours = similarity[user_id].drop(user_id).nlargest(5)
    # Rated movies become 0, unrated movies take the predicted score.
    # NOTE(review): a rated movie (score 0) can still be selected when
    # fewer than five unrated movies have a positive prediction.
    scores = (0 * user_ratings).fillna(predict(neighbours))
    top_movies[user_id] = scores.nlargest(5).index.tolist()

res = pd.DataFrame.from_dict(top_movies, orient='index')
res.columns = ['movie-id{}'.format(rank + 1) for rank in res.columns]
res = res.rename_axis('customer_id').reset_index()
# convert to csv
res.to_csv('filepath.txt', sep=' ', index=False)
</code></pre>
<p><code>res.head()</code></p>
<pre><code>In [2]: res.head()
Out[2]:
customer_id movie-id1 movie-id2 movie-id3 movie-id4 movie-id5
0 1 3072 1196 838 2278 1259
1 2 648 475 1 151 1035
2 3 457 150 300 21 339
3 4 1035 7153 953 4993 2571
4 5 260 671 1210 2628 7153
</code></pre>
<p>查看输出文件的前几行:</p>
<pre><code> In [3]: ! head -5 filepath.txt
customer_id movie-id1 movie-id2 movie-id3 movie-id4 movie-id5
1 3072 1196 838 2278 1259
2 648 475 1 151 1035
3 457 150 300 21 339
4 1035 7153 953 4993 2571
</code></pre>