我正在制作一个表,其中记录了类似产品的索引,然后将这些行中的数据粘贴到一行中,从而聚合所有数据。在此之后,该行将被删除。代码如下:
matchedproducts_df = pd.read_sql_query("SELECT * from matchedproducts", conn)
print(len(matchedproducts_df.index))
def mergeduplicates(df, similarity_field='', databasetable='', similar_level=85):
print(len(df.index))
def check_simi(d):
global dupl_indexes, dupl_originals
dupl_originals = []
dupl_indexes = []
for i in range(len(d.values) - 1):
for j in range(i + 1, len(d.values)):
if fuzz.token_sort_ratio(d.values[i], d.values[j]) >= similar_level:
dupl_indexes.append(d.index[j])
dupl_originals.append(d.index[i])
indexes = df.groupby([True]*len(df))[similarity_field].apply(check_simi)
a = 0
for i in dupl_indexes:
if df.iloc[i, 5] == 'harveynichols':
df.at[dupl_originals[a], 'pricehn'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilityhn'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storehn'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinkhn'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'houseoffraser':
df.at[dupl_originals[a], 'pricehof'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilityhof'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storehof'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinkhof'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'selfridges':
df.at[dupl_originals[a], 'pricesf'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilitysf'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storesf'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinksf'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'lookfantastic':
df.at[dupl_originals[a], 'pricelf'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilitylf'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storelf'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinklf'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'superdrug':
df.at[dupl_originals[a], 'pricesd'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilitysd'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storesd'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinksd'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'boots':
df.at[dupl_originals[a], 'priceboots'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilityboots'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storeboots'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinkboots'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'allbeauty':
df.at[dupl_originals[a], 'priceab'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilityab'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storeab'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinkab'] = df.loc[i, 'hyperlink']
df.drop([i])
elif df.iloc[i, 5] == 'asos':
df.at[dupl_originals[a], 'pricea'] = df.loc[i, 'price']
df.at[dupl_originals[a], 'availabilitya'] = df.loc[i, 'availability']
df.at[dupl_originals[a], 'storea'] = df.loc[i, 'store']
df.at[dupl_originals[a], 'hyperlinka'] = df.loc[i, 'hyperlink']
df.drop([i])
# for index_list in indexes:
# df.drop(index_list, inplace=True)
a += 1
print(len(df.index))
df.to_csv('C:/Users/Judoo/Desktop/matchedproducts.csv')
df.to_sql(databasetable, conn, if_exists="replace")
conn.commit()
conn.close()
mergeduplicates(matchedproducts_df, similarity_field='name', databasetable='matchedproducts')
该代码可以很好地查找重复项并将其数据复制到相关的原始行中,但不会在检查后删除该行。脚本前后的表长度相同。我不确定为什么会发生这种情况,任何帮助都将不胜感激
drop()方法通常删除要删除特定行的行。您是否尝试过添加
inplace=True
?因为它可能会保留旧的索引。或者,您也可以使用新的df,比如df = df.drop([i])
来实现相等See here如果这些都不能解决您的问题,那么您必须使用
df.drop([i])
的结构相关问题 更多 >
编程相关推荐