所以我有以下代码:
cols = ['Col001','Col002','Col003','Col004','Col005','Col006','Col007','Col008','Col009',]
import pandas as pd
dataA = [
['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002,],
['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004,],
['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003,],
['AB4', 'V', 221, 'DE', 0.05, 67, 'ABC', 2345, 0.023,],
['AB5', 'B', 342, 'CT', 0.04, 89, 'MNO', 3457, 0.023,],
['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032,],
['AB7', 'F', 342, 'PA', 0.03, 56, 'ABC', 5767, 0.067,],
['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.045,],
['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 3465, 0.034,],
['AB10','C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087,],]
dataB = [
['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002,],
['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004,],
['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003,],
['AB4', 'V', 221, 'DE', 0.08, 67, 'ABC', 2345, 0.023,],
['AB5', 'B', 342, 'NJ', 0.04, 89, 'MNO', 3457, 0.023,],
['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032,],
['AB7', 'F', 342, 'PA', 0.03, 56, 'MNO', 5767, 0.067,],
['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.048,],
['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 2353, 0.034,],
['AB10','C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087,],]
def getDataFrame(source,sourceName):
df = pd.DataFrame(source,columns=cols)
df['DataSource'] = sourceName
return df
def compareDataFrames(sourceDataFrame,newDataFrame):
targetDF = pd.concat([sourceDataFrame, newDataFrame])
targetDF = targetDF.reset_index(drop=True)
columnsGroup = list(targetDF.columns)
columnsGroup.remove('DataSource')
targetDF_GroupBy = targetDF.groupby(columnsGroup)
idx = [x[0] for x in targetDF_GroupBy.groups.values() if len(x) == 1]
targetDF = targetDF.reindex(idx)
targetDF = targetDF.sort_values(by=['Col001'], ascending=[True])
return targetDF
def getDiff():
sourceData = getDataFrame(dataA,'Legacy')
newData = getDataFrame(dataB,'New')
comparedData = compareDataFrames(sourceData,newData)
return comparedData
它和预期的完全一样。输出是
^{pr2}$到目前为止,还不错。但我不喜欢输出。即使它找到了diff,它也会显示每个diff行的所有列。在
所以,我写了另一种方法,只给我一个区别:
def createDiffDataFrame(diffDataframe,ignoreCols):
diffData = []
compareCols = diffDataframe.columns
for eachContract in (diffDataframe.Col001.unique()):
legacyRow = diffDataframe[(diffDataframe['DataSource']=='Legacy') & (diffDataframe['Col001']==eachContract)]
newRow = diffDataframe[(diffDataframe['DataSource']=='New') & (diffDataframe['Col001']==eachContract)]
if len(legacyRow) == 0:
diffRow = [eachContract,'MISSING','MISSING','New']
elif len(newRow) == 0:
diffRow = [eachContract,'MISSING','Legacy','MISSING']
else:
for eachCol in compareCols:
if eachCol not in ignoreCols and legacyRow.iloc[0][eachCol] != newRow.iloc[0][eachCol]:
diffRow = [eachContract,eachCol,legacyRow.iloc[0][eachCol],newRow.iloc[0][eachCol]]
diffData.append(diffRow)
diffDF = pd.DataFrame(diffData,columns=['Col001','ColumnName','LegacyValue','NewValue'])
diffDF = diffDF.set_index('Col001')
return diffDF
现在,我的输出非常完美:
x = getDiff()
print (createDiffDataFrame(x,['DataSource']))
ColumnName LegacyValue NewValue
Col001
AB4 Col005 0.05 0.08
AB5 Col004 CT NJ
AB7 Col007 ABC MNO
AB8 Col009 0.045 0.048
AB9 Col008 3465 2353
Press any key to continue . . .
我的问题是: 尽管我得到了预期的输出,但在实际世界中有114列和超过50K行,运行createDiffDataFrame()模块需要花费很多时间。有没有更好的方法来实现createDiffDataFrame?熊猫应该被用来做这个吗?在
为什么不尝试只使用
pandas
第二次输出(使用前面的结果)
^{pr2}$更新了不要使用
.ix
使用.loc
相关问题 更多 >
编程相关推荐