数据帧差异

cols = ['Col001','Col002','Col003','Col004','Col005','Col006','Col007','Col008','Col009',] import pandas as pd dataA = [ ['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002,], ['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004,], ['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003,], ['AB4', 'V', 221, 'DE', 0.05, 67, 'ABC', 2345, 0.023,], ['AB5', 'B', 342, 'CT', 0.04, 89, 'MNO', 3457, 0.023,], ['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032,], ['AB7', 'F', 342, 'PA', 0.03, 56, 'ABC', 5767, 0.067,], ['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.045,], ['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 3465, 0.034,], ['AB10','C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087,],] dataB = [ ['AB1', 'A', 100, 'NY', 0.01, 23, 'PQR', 1003, 0.002,], ['AB2', 'B', 201, 'NY', 0.03, 13, 'MNO', 1232, 0.004,], ['AB3', 'A', 234, 'NJ', 0.05, 54, 'ABC', 3443, 0.003,], ['AB4', 'V', 221, 'DE', 0.08, 67, 'ABC', 2345, 0.023,], ['AB5', 'B', 342, 'NJ', 0.04, 89, 'MNO', 3457, 0.023,], ['AB6', 'N', 222, 'NY', 0.02, 67, 'PQR', 7665, 0.032,], ['AB7', 'F', 342, 'PA', 0.03, 56, 'MNO', 5767, 0.067,], ['AB8', 'C', 453, 'CA', 0.04, 34, 'PQR', 7563, 0.048,], ['AB9', 'B', 123, 'CT', 0.03, 65, 'PQR', 2353, 0.034,], ['AB10','C', 443, 'NJ', 0.03, 66, 'MNO', 3433, 0.087,],] def getDataFrame(source,sourceName): df = pd.DataFrame(source,columns=cols) df['DataSource'] = sourceName return df def compareDataFrames(sourceDataFrame,newDataFrame): targetDF = pd.concat([sourceDataFrame, newDataFrame]) targetDF = targetDF.reset_index(drop=True) columnsGroup = list(targetDF.columns) columnsGroup.remove('DataSource') targetDF_GroupBy = targetDF.groupby(columnsGroup) idx = [x[0] for x in targetDF_GroupBy.groups.values() if len(x) == 1] targetDF = targetDF.reindex(idx) targetDF = targetDF.sort_values(by=['Col001'], ascending=[True]) return targetDF def getDiff(): sourceData = getDataFrame(dataA,'Legacy') newData = getDataFrame(dataB,'New') comparedData = compareDataFrames(sourceData,newData) return comparedData

def createDiffDataFrame(diffDataframe,ignoreCols): diffData = [] compareCols = diffDataframe.columns for eachContract in (diffDataframe.Col001.unique()): legacyRow = diffDataframe[(diffDataframe['DataSource']=='Legacy') & (diffDataframe['Col001']==eachContract)] newRow = diffDataframe[(diffDataframe['DataSource']=='New') & (diffDataframe['Col001']==eachContract)] if len(legacyRow) == 0: diffRow = [eachContract,'MISSING','MISSING','New'] elif len(newRow) == 0: diffRow = [eachContract,'MISSING','Legacy','MISSING'] else: for eachCol in compareCols: if eachCol not in ignoreCols and legacyRow.iloc[0][eachCol] != newRow.iloc[0][eachCol]: diffRow = [eachContract,eachCol,legacyRow.iloc[0][eachCol],newRow.iloc[0][eachCol]] diffData.append(diffRow) diffDF = pd.DataFrame(diffData,columns=['Col001','ColumnName','LegacyValue','NewValue']) diffDF = diffDF.set_index('Col001') return diffDF

x = getDiff() print (createDiffDataFrame(x,['DataSource'])) ColumnName LegacyValue NewValue Col001 AB4 Col005 0.05 0.08 AB5 Col004 CT NJ AB7 Col007 ABC MNO AB8 Col009 0.045 0.048 AB9 Col008 3465 2353 Press any key to continue . . .

1条回答

网友

1楼 · 发布于 2024-09-30 03:25:55

为什么不尝试只使用pandas

dataA['New']=dataA.apply(lambda x :','.join(x.astype(str)),axis=1)
dataB['New']=dataB.apply(lambda x :','.join(x.astype(str)),axis=1)
DF=pd.concat([dataA,dataB],axis=0)
DF['DIFF']=DF.groupby('New')['New'].transform('count')
DF=DF[DF.DIFF==1]
DF.sort_values('Col001')


Out[1927]: 
  Col001 Col002  Col003 Col004  Col005  Col006 Col007  Col008  Col009                                                New  DIFF
3    AB4      V     221     DE    0.05      67    ABC    2345   0.023  AB4,V,221,DE,0.05,67,ABC,2345,0.023,AB4,V,221,...     1
3    AB4      V     221     DE    0.08      67    ABC    2345   0.023  AB4,V,221,DE,0.08,67,ABC,2345,0.023,AB4,V,221,...     1
4    AB5      B     342     CT    0.04      89    MNO    3457   0.023  AB5,B,342,CT,0.04,89,MNO,3457,0.023,AB5,B,342,...     1
4    AB5      B     342     NJ    0.04      89    MNO    3457   0.023  AB5,B,342,NJ,0.04,89,MNO,3457,0.023,AB5,B,342,...     1
6    AB7      F     342     PA    0.03      56    ABC    5767   0.067  AB7,F,342,PA,0.03,56,ABC,5767,0.067,AB7,F,342,...     1
6    AB7      F     342     PA    0.03      56    MNO    5767   0.067  AB7,F,342,PA,0.03,56,MNO,5767,0.067,AB7,F,342,...     1
7    AB8      C     453     CA    0.04      34    PQR    7563   0.045  AB8,C,453,CA,0.04,34,PQR,7563,0.045,AB8,C,453,...     1
7    AB8      C     453     CA    0.04      34    PQR    7563   0.048  AB8,C,453,CA,0.04,34,PQR,7563,0.048,AB8,C,453,...     1
8    AB9      B     123     CT    0.03      65    PQR    3465   0.034  AB9,B,123,CT,0.03,65,PQR,3465,0.034,AB9,B,123,...     1
8    AB9      B     123     CT    0.03      65    PQR    2353   0.034  AB9,B,123,CT,0.03,65,PQR,2353,0.034,AB9,B,123,...     1

第二次输出（使用前面的结果）

^{pr2}$

更新了不要使用.ix使用.loc

DF1=DF1.set_index('Col001')
cols = ['Col001','Col002','Col003','Col004','Col005','Col006','Col007','Col008','Col009',]
DF2=DF1.loc[:,DF1.columns.isin(cols)].reset_index()
DF3=pd.melt(DF2,id_vars=['Col001'])
DF3['select']=DF3.value.apply(lambda x : len(x))
DF4=DF3.loc[(DF3['select']>1)]
pd.concat([DF4.reset_index(drop=True),pd.DataFrame(DF4.value.values.tolist())],axis=1).\
    rename(columns={0:'LegacyValue',1:'NewValue'}).drop(['value','select'],axis=1)

相关问题更多 >

编程相关推荐

热门问题

热门文章