我对我的推荐人有这个问题。对于一些啤酒,我希望它的工作非常完美的建议,但有时它返回一个键错误。我不知道为什么会这样?你知道吗
在尝试为同一项推荐时总是发生,因此它可能与hashmap代码或反向hashmap有关。你知道吗
Picture of KeyError while running script
**照片**
代码
import os
import time
import gc
import argparse
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
class KnnRecommender:
"""
This is an item based collaborative filtering recommender with KNN implemented by sklearn
"""
def __init__(self, path_beers, path_tastingprofiles):
"""
Recommender requires path to data: movies data and ratings data
Parameters
----------
path_movies: str, movies data file path
path_ratings: str, ratings data file path
"""
self.path_beers = path_beers
self.path_tastingprofiles = path_tastingprofiles
self.model = NearestNeighbors()
def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
"""
set model params for sklearn.neighbors.NearestNeighbors
Parameters
----------
n_neighbors: int, optional (default = 5)
algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
metric: string or callable, default 'minkowski', or one of
['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
n_jobs: int or None, optional (default=None)
"""
if n_jobs and (n_jobs > 1 or n_jobs == -1):
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
self.model.set_params(**{
'n_neighbors': n_neighbors,
'algorithm': algorithm,
'metric': metric,
'n_jobs': n_jobs})
def _prep_data(self):
"""
prepare data for recommender
1. beer-tastingprofile scipy sparse matrix
2. hashmap of beer to row index in beer-tastingprofile scipy sparse matrix
"""
# read data
df_beers = pd.read_csv(
os.path.join(self.path_beers),
usecols=['beerID', 'name', 'beertypeID'],
dtype={'beerID': 'int32', 'name': 'str', 'beerID': 'int32'})
df_tastingprofiles = pd.read_csv(
os.path.join(self.path_tastingprofiles),
usecols=['beerID', 'malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'],
dtype={'beerID': 'int32', 'malty': 'float32', 'sweet': 'float32', 'sour': 'float32', 'hoppy': 'float32', 'bitter': 'float32', 'fruity': 'float32'})
#filtering beers/removing unprofiled beers
df_beers_merged = pd.merge(df_tastingprofiles, df_beers, on='beerID')
df_beers = df_beers_merged.drop(['malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'], axis=1)
# pivot and create tastingprofile matrix
df_tastingprofile_features = df_tastingprofiles.set_index('beerID')
# create mapper from beer name to index
hashmap = {
beer: i for i, beer in
enumerate(list(df_beers.set_index('beerID').loc[df_tastingprofile_features.index].name)) # noqa
}
#converting tastingprofile features to scipy sparse matrix
mat_tastingprofile_features = csr_matrix(df_tastingprofile_features.values)
# clean up
del df_beers, df_beers_merged
del df_tastingprofiles, df_tastingprofile_features
return mat_tastingprofile_features, hashmap
def _fuzzy_matching(self, hashmap, fav_beer):
"""
return the closest match via fuzzy ratio.
If no match found, return None
Parameters
----------
hashmap: dict, map beer name to index of the beer in data
fav_beer: str, name of user input beer
Return
------
index of the closest match
"""
match_tuple = []
# get match
for name, idx in hashmap.items():
ratio = fuzz.ratio(name.lower(), fav_beer.lower())
if ratio >= 60:
match_tuple.append((name, idx, ratio))
# sort
match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
if not match_tuple:
print('Oops! No match is found')
else:
print('Found possible matches in our database: '
'{0}\n'.format([x[0] for x in match_tuple]))
return match_tuple[0][1]
def _inference(self, model, data, hashmap,
fav_beer, n_recommendations):
"""
return top n similar beer recommendations based on user's input movie
Parameters
----------
model: sklearn model, knn model
data: beer-tastingprofile matrix
hashmap: dict, map beer name to index of the mobeervie in data
fav_beer: str, name of user input beer
n_recommendations: int, top n recommendations
Return
------
list of top n similar beer recommendations
"""
# fit
model.fit(data)
# get input movie index
print('You have input movie:', fav_beer)
idx = self._fuzzy_matching(hashmap, fav_beer)
# inference
print('Recommendation system start to make inference')
print('......\n')
t0 = time.time()
distances, indices = model.kneighbors(
data[idx],
n_neighbors=n_recommendations+1)
# get list of raw idx of recommendations
raw_recommends = \
sorted(
list(
zip(
indices.squeeze().tolist(),
distances.squeeze().tolist()
)
),
key=lambda x: x[1]
)[:0:-1]
print('It took my system {:.2f}s to make inference \n\
'.format(time.time() - t0))
# return recommendation (movieId, distance)
return raw_recommends
def make_recommendations(self, fav_beer, n_recommendations):
"""
make top n beer recommendations
Parameters
----------
fav_beer: str, name of user input beer
n_recommendations: int, top n recommendations
"""
# get data
mat_tastingprofile_features, hashmap = self._prep_data()
# get recommendations
raw_recommends = self._inference(
self.model, mat_tastingprofile_features, hashmap,
fav_beer, n_recommendations)
# print results
reverse_hashmap = {v: k for k, v in hashmap.items()}
print('Recommendations for {}:'.format(fav_beer))
for i, (idx, dist) in enumerate(raw_recommends):
#reverse_hashmap[idx]
print('{0}: {1}, with distance of {2}'.format(i+1,reverse_hashmap[idx], dist))
def parse_args():
parser = argparse.ArgumentParser(
prog="Beer Recommender",
description="Run KNN Beer Recommender")
parser.add_argument('--path', nargs='?', default='',
help='input data path')
parser.add_argument('--beer_filename', nargs='?', default='beer.csv',
help='provide beer filename')
parser.add_argument('--tastingprofile_filename', nargs='?', default='tastingprofile.csv',
help='provide tastingprofile filename')
parser.add_argument('--beer_name', nargs='?', default='',
help='provide your favorite beer name')
parser.add_argument('--top_n', type=int, default=10,
help='top n beer recommendations')
return parser.parse_args()
if __name__ == '__main__':
# get args
args = parse_args()
data_path = args.path
beer_filename = args.beer_filename
tastingprofile_filename = args.tastingprofile_filename
beer_name = args.beer_name
top_n = args.top_n
# initial recommender system
recommender = KnnRecommender(
os.path.join(data_path, beer_filename),
os.path.join(data_path, tastingprofile_filename))
recommender.set_model_params(20, 'brute', 'cosine', -1)
# make recommendations
recommender.make_recommendations(beer_name, top_n)
我修好了。 我发现当我在Hashmap中使用名称作为值时,重复项会自动被删除。所以hashmap比完整的db列表小。在推荐算法中使用数据集之前,我通过删除数据集中的重复项来解决这个问题。你知道吗
我将向您展示我对熊猫数据帧合并和删除重复项的简单修复。你知道吗
相关问题 更多 >
编程相关推荐