为了学习推荐系统,我根据一些Twitter数据制作了一个非常基本的推荐系统。它工作正常,唯一的问题是运行需要15-20分钟。我敢肯定,我犯了很多初学者的错误,导致了这一点,因为它应该在大约半分钟内运行
我所知道的是,这个问题不是由load_data()函数引起的,因为这个函数在没有剩余代码的情况下大约需要10秒钟。你知道我哪里出了错吗
#!/usr/bin/env python3
import sys
from operator import itemgetter
from collections import defaultdict
def load_data():
n = 50
with open(sys.argv[1], 'r') as f_user_sim:
#build dictionary of all users and similarities
sim_dict = dict()
for line in f_user_sim:
line = line.strip().split()
user1, user2, sim = line[0], line[1], line[2]
if not user1 in sim_dict:
sim_dict[user1] = []
sim_dict[user1].append((user2, sim))
#for each user, sort its list of other users by similarity, resize list according to n
for u in sim_dict:
newlist = sorted(sim_dict[u], key=itemgetter(1), reverse=True)[:n]
sim_dict[u] = newlist
return sim_dict
def VIP(user):
with open(sys.argv[3], 'r') as f_training_matrix:
vip_dict = list()
for line in f_training_matrix:
if line.split()[0] == user:
vip_dict.append(line.split()[1])
return vip_dict
def VIP_counter(user,simusers):
# list of vips the user already follows
uservips = VIP(user)
# makes list of lists of vips that users follow
viplist = []
for u in simusers:
viplist.append(VIP(u))
# convert list of lists to counted dict
counts = dict()
for i in viplist:
for j in i:
if j not in uservips and j != user:
counts[j] = counts.get(j,0) +1
# sort dict
sorteddict = sorted(counts.items(), key=lambda x: x[1], reverse=True)
topten = sorteddict[:10]
#convert list of tuples to str of keys with spaces
return ' '.join([str(elem[0]) for elem in topten])
def main():
simdict = load_data()
# open input and output files
with open(sys.argv[4], 'r') as f_input:
with open('recommender_s2922916.dev.output', 'w+') as f_output:
for line in f_input:
user = line.strip()
# make list of N similar users
simusers = []
for i in simdict[user][:10]:
simusers.append(i[0])
# for all similar users, list every VIP they follow in list and enumerate them, output the the ten highest in spaced out string form
counts = VIP_counter(user,simusers)
# write the output
f_output.write(counts)
f_output.write('\n')
if __name__ == "__main__":
main()
使用数据帧读取/写入/处理代码的操作。在这种情况下,它将获得更多收益
例如:
您可以改进代码中的所有方法,而无需循环、比较、拆分和创建新的数据元素。不建议像处理大型数据集那样执行普通操作
相关问题 更多 >
编程相关推荐