我现在做了很多尝试,过去几天也读了很多书,但我找不到解决问题的正确方法。也许有人能帮我一把
我必须创建CSV文件,其中wdw_clip_db_2018-01-17_2(4720行)应包含所有数据,而wdw_content_complete(2752行)仅包含一个子集
wdw_clip_db_2018-01-17_2.csv:
11,0_7cjgob0v,9000301_AzubiGlueckT1.mxf,0_7cjgob0v.mpg
43,0_heor15yl,,0_heor15yl.mpg
1616,0_dfopff5t,578_Bier.MXF,0_dfopff5t.mpg
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
1931,0_cbx3zgw6,9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6.mpg
wdw_content_complete.csv:
1737,9000301_AzubiGlueckT1.mxf,0_7cjgob0v
1451,578_Bier.MXF,0_dfopff5t
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
1762,9070201_KeinGeldFuerGeschen.mxf,NULL
我需要想出的是以下可由Excel读取的csv文件:
wdw_clean_assets.csv:
9000301_AzubiGlueckT1.mxf,0_7cjgob0v
578_Bier.MXF,0_dfopff5t
其中,wdw_clean_资产保存与文件和外部参考(例如0_7cjgob0v)匹配的每一行
wdw_to_add_ext_refs.csv:
9070201_KeinGeldFuerGeschen.mxf,0_cbx3zgw6
其中wdw_to_add_ext_refs保存与文件匹配但在外部_引用字段中有空值的每一行。NULL替换为wdw_clip_db_2018-01-17_2.csv中的外部引用
当我比较行数时,wdw_content_complete.csv中的行不在wdw_clip_db_2018-01-17_2.csv中。老实说,这不应该是,所以我需要找出这些线的错误。因此,我需要将wdw_content_complete.csv的其余部分放入一个新的csv文件中
wdw_to_clean_assets.csv:
1433,445_Holzverarbeitung.MXF,NULL
1461,601_Sinnestäuschungen.MXF,NULL
最后,我需要两个单独的csv中的其余csv wdw_clip_db_2018-01-17_2.csv和wdw_content_complete.csv。因此,我试图从一个列表中减去另一个列表,不幸的是,这也不正确
wdw_hansi_assets_rest.csv:
1500,0_9fpl1ozv,601_SinnestŠuschungen.MXF,0_9fpl1ozv.mpg
wdw_mediahub_assets_rest.csv:
1433,445_Holzverarbeitung.MXF,NULL
到目前为止,我得到的是这个Python脚本:
导入csv
# CSV Files
# wdw_clip_db_2018-01-17_2.csv
# wdw_content_complete.csv
# Reading the CSV Files
hansi_assets = []
with open('wdw_clip_db_2018-01-17_2.csv') as hansi_db:
reader = csv.reader(hansi_db)
for row in reader:
hansi_assets.append(row)
hansi_db.close()
mediahub_assets = []
with open('wdw_content_complete.csv') as mediahub_db:
reader = csv.reader(mediahub_db)
for row in reader:
mediahub_assets.append(row)
mediahub_db.close()
clean_asset = []
clean_assets = []
to_add_ext_ref = []
to_add_ext_refs = []
to_clean_assets = []
hansi_assets_rest = []
mediahub_assets_rest = []
hansi_assets_rm = []
mediahub_assets_rm = []
num_clean_rwos = 0
num_to_clean_rows = 0
num_to_add_ext_refs = 0
num_dirty_rows = 0
num_hansi_iterations = 0
num_mediahub_iterations = 0
num_mediahub_null = 0
num_hansi_mediahub_matches = 0
# Looping over the CSV Files
for hansi_asset in hansi_assets:
num_hansi_iterations += 1
for mediahub_asset in mediahub_assets:
num_mediahub_iterations += 1
# Checking if there are similar, clean entries
if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and hansi_asset[1] == mediahub_asset[2]:
clean_assets.append(mediahub_asset)
# Counting for evaluation reasons
num_clean_rwos += 1
mediahub_assets_rm.append(mediahub_asset)
hansi_assets_rm.append(hansi_asset)
# Checking if there are entries which miss the Ext_Ref field and replacing the NULL by the corresponding Ext_Ref in the hansi_asset
elif hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1] and mediahub_asset[2] == "NULL":
to_add_ext_ref = [mediahub_asset[1], hansi_asset[1]]
to_add_ext_refs.append(to_add_ext_ref)
# Counting for evaluation reasons
num_to_add_ext_refs += 1
mediahub_assets_rm.append(mediahub_asset)
hansi_assets_rm.append(hansi_asset)
# Checking if there are entries that don't match
elif hansi_asset[2] != mediahub_asset[1] or hansi_asset[3] != mediahub_asset[1]:
to_clean_assets.append([mediahub_asset[1], mediahub_asset[2]])
# Counting for evaluation reasons
num_to_clean_rows += 1
# Creating a list to substract from its origin to get the Rest
mediahub_assets_rm.append(mediahub_asset)
hansi_assets_rm.append(hansi_asset)
# Just counting the Matches
for hansi_asset in hansi_assets:
for mediahub_asset in mediahub_assets:
if hansi_asset[2] == mediahub_asset[1] or hansi_asset[3] == mediahub_asset[1]:
num_hansi_mediahub_matches += 1
# Just counting the NULLs
for mediahub_asset in mediahub_assets:
num_mediahub_iterations += 1
if mediahub_asset[2] == "NULL":
num_mediahub_null += 1
# for mediahub_asset_rm in mediahub_assets_rm:
# if mediahub_asset[1] != mediahub_asset_rm[1]:
# mediahub_assets_rest = Diff(mediahub_assets, mediahub_assets_rm)
# Trying to substract medihub_assets_rm from mediahub_assets to get the Rest
mediahub_assets_rest = [item for item in mediahub_assets_rm if item not in mediahub_assets]
hansi_assets_rest = [item for item in hansi_assets_rm if item not in hansi_assets]
# Printing some lines for evaluation
print hansi_assets[1]
print mediahub_assets[1]
print clean_assets[1]
print to_clean_assets[1]
print to_add_ext_refs[1]
print hansi_assets_rest[1]
print mediahub_assets_rest[1]
print hansi_assets_rm[1]
print mediahub_assets_rm[1]
print "Num Hansi Assets: " + str(len(hansi_assets))
print "Num Mediahub Assets: " + str(len(mediahub_assets))
print "Num Clean Assets: " + str(len(clean_assets))
print "Num Hansi Assets to remove: " + str(len(hansi_assets_rm))
print "Num Mediahub Assets to remove: " + str(len(mediahub_assets_rm))
print "Num Hansi Rest Assets: " + str(len(hansi_assets_rest))
print "Num Mediahub Rest Assets: " + str(len(mediahub_assets_rest))
print "Num Mediahub NULLs: " + str(num_mediahub_null)
print "Num Hansi Mediahub Matches: " + str(num_hansi_mediahub_matches)
print "Num Clean Rows: " + str(num_clean_rwos)
print "Num To Clean Rows: " + str(num_to_clean_rows)
print "Num To Add Ext_Ref: " + str(num_to_add_ext_refs)
print "Num Dirty Rows: " + str(num_dirty_rows)
print "Num Hansi Iterations: " + str(num_hansi_iterations)
print "Num Mediahub Iterations: " + str(num_mediahub_iterations / num_hansi_iterations)
# Writing clean_assets to a file
wdw_clean_assets = []
with open('wdw_clean_assets.csv', 'w') as wdw_clean_assets:
writer = csv.writer(wdw_clean_assets)
for row in clean_assets:
writer.writerow([row])
wdw_clean_assets.close()
wdw_to_add_ext_refs =[]
with open('wdw_to_add_ext_refs.csv', 'w') as wdw_to_add_ext_refs:
writer = csv.writer(wdw_to_add_ext_refs)
for row in to_add_ext_refs:
writer.writerow([row])
wdw_to_clean_assets = []
with open('wdw_to_clean_assets.csv', 'w') as wdw_to_clean_assets:
writer = csv.writer(wdw_to_clean_assets)
for row in to_clean_assets:
writer.writerow([row])
wdw_to_clean_assets.close()
wdw_hansi_assets_rest = []
with open('wdw_hansi_assets_rest.csv', 'w') as wdw_hansi_assets_rest:
writer = csv.writer(wdw_hansi_assets_rest)
for row in hansi_assets_rest:
writer.writerow([row])
wdw_hansi_assets_rest.close()
wdw_mediahub_assets_rest = []
with open('wdw_mediahub_assets_rest.csv', 'w') as wdw_mediahub_assets_rest:
writer = csv.writer(wdw_mediahub_assets_rest)
for row in mediahub_assets_rest:
writer.writerow([row])
wdw_mediahub_assets_rest.close()
感谢您的帮助
曼努埃尔
目前没有回答
相关问题 更多 >
编程相关推荐