交叉检查合并数据帧(在这种情况下,pd.concat、pd.merge似乎不起作用)

2024-09-28 23:42:01 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试将两个数据集(在某些情况下,这两个数据集相互补充)合并到一个新的数据帧中,而无需重复列。换句话说,我有两个数据帧。在某些列(在两个数据帧中具有相同的名称)中,我需要的信息将在其中一个列中,但不是在两个列中

例如,请参见下面我创建的反映当前问题的虚构数据帧。这些数据帧包含同3个人的信息。注意“性别”一栏。当一个中缺少该值时,另一个中会找到该值,反之亦然。两列值的组合为我们提供了一个完整的性别列。理想情况下,我会在下面找到您需要的df_

(实际数据集有相当多的列,如性别)

df_have1 = pd.DataFrame({'age':[7,34,19], 'gender':['F',np.nan,'M'], 'profession':['student', 'CEO', 'artist']})
df_have1

df_have2 = pd.DataFrame({'age':[7,34,19], 'gender':['np.nan','F',np.nan], 'interests':['acting', 'cars', 'gardening']})
df_have2

df_need = pd.DataFrame({'age':[7,34,19], 'gender':['F','F','M'], 'profession':['student', 'CEO', 'artist'], 'interests':['acting', 'cars', 'gardening']})
df_need

我试过pd.concat,不幸的是它复制了性别栏。pd.merge和join也是如此

pd.concat([df_have1, df_have2], axis=1)

Tags: 数据信息dataframedfagenp情况nan
3条回答
  • merge()join()将与列后缀一起使用
  • 使用数据来fillna()
  • 完成后删除不需要的列
df_have1 = pd.DataFrame({'age':[7,34,19], 'gender':['F',np.nan,'M'], 'profession':['student', 'CEO', 'artist']})

df_have2 = pd.DataFrame({'age':[7,34,19], 'gender':['np.nan','F',np.nan], 'interests':['acting', 'cars', 'gardening']})

df_need = (df_have1.join(df_have2, rsuffix="_r")
 .assign(gender=lambda dfa: dfa.gender.fillna(dfa.gender_r))
 .drop(columns=["age_r","gender_r"])
)

^{tb1}$

到目前为止,@Rob Raymond方法更好

但是,如果两个数据帧具有相同的行数,则可以使用字典和for循环获得类似的结果(在框架中的实践不佳)

df_have1 = pd.DataFrame({
    'age':[7,34,19], 
    'gender':['F',np.nan,'M'], 
    'profession':['student', 'CEO', 'artist']})
df_have2 = pd.DataFrame({
    'age':[7,34,19], 
    'gender':['np.nan','F',np.nan], 
    'interests':['acting', 'cars', 'gardening']})
df_need = pd.DataFrame({
    'age':[7,34,19],
    'gender':['F','F','M'],
    'profession':['student', 'CEO', 'artist'],
    'interests':['acting', 'cars', 'gardening']})

dct = {k:{} for k in (list(df_have1.columns) + list(df_have2.columns))}
for col in dct.keys():
    if col in list(df_have1.columns):
        for row in df_have1.index:
            if col in list(df_have2.columns):  # intersection
                if df_have1[col].iloc[row] not in ['NaN', np.nan]:
                    dct[col][row] = df_have1[col].iloc[row]
                elif df_have2[col].iloc[row] not in ['NaN', np.nan]:
                    dct[col][row] = df_have2[col].iloc[row]
                else:  # without NaN values in the entry
                    dct[col][row] = np.nan
            else:  # data only in df_have1
                dct[col][row] = df_have1[col].iloc[row]
    else:  # data only in df_have2
        for row in df_have2.index:
            dct[col][row] = df_have2[col].iloc[row]

df_get = pd.DataFrame(dct)

assert df_get.equals(df_need)  # assures the both df are the same

一个更通用的代码改编自@Rob Raymond sugestion

def replace_str_nan_by_np_nan(df_str_nan):
    """
        dealing with nan strings, since fillna handles only np.nan
        
        Args: df with string nan
        
        Return: df with np.nan
    
    Ex: 
        import pandas as pd
        import numpy as np
                
        df_str_nan = pd.DataFrame({
            'age':['np.nan',34,19], 
            'gender':['Nan',np.nan,'M'], 
            'profession':['student', 'nan', 'artist']})
        df_np_nan = replace_str_nan_by_np_nan(df_str_nan)              
        print(df_np_nan.isna())
            age     gender  profession
        0   True    True    False
        1   False   True    True
        2   False   False   False
    """
    import numpy as np
    
    df_np_nan = df_str_nan.copy()
    for nan in ['np.nan', 'NaN', 'Nan', 'nan']:  
        df_np_nan = df_np_nan.replace(nan, np.nan, regex=True)
        
    return df_np_nan


def join_df1_df2_repeated_col(df1, df2):
    """
        join two dataframes keeping values within repeated columns 
        dealing with nan strings, since fillna handles only np.nan
        
        Args: df1, df2 two dataframes
        
        Return: df_join joined dataframe
    
    Ex: 
        import pandas as pd
        import numpy as np
        

        df1 = pd.DataFrame({
            'age':[7,34,19], 
            'gender':['F',np.nan,'M'], 
            'profession':['student', 'CEO', 'artist']})
        df2 = pd.DataFrame({
            'age':[7,34,19], 
            'gender':['np.nan','F',np.nan], 
            'interests':['acting', 'cars', 'gardening']})

        print(join_df1_df2_repeated_col(df1, df2))
        
            age gender  profession  interests
        0   7   F       student     acting
        1   34  F       CEO         cars
        2   19  M       artist      gardening
    """
    import pandas as pd
    import numpy as np
    
    
    # dealing with nan strings, since fillna handles only np.nan
    df1 = replace_str_nan_by_np_nan(df1)
    df2 = replace_str_nan_by_np_nan(df2)
    
    rsuffix = "_r"
    df_join = df1.join(df2, rsuffix=rsuffix)
    
    # dealing with repeated columns
    mask = df_join.columns.str.endswith(rsuffix)
    lst_col_r = list(df_join.loc[:,mask].columns)
    for col_r in lst_col_r:
        col = col_r[:-len(rsuffix)]
        df_join[col] = df_join[col].fillna(df_join[col_r])   
    
    return df_join.drop(columns=lst_col_r)


import pandas as pd
import numpy as np

df1 = pd.DataFrame({
    'age':[7,34,19], 
    'gender':['F',np.nan,'M'], 
    'profession':['student', 'CEO', 'artist']})
df2 = pd.DataFrame({
    'age':[7,34,19], 
    'gender':['np.nan','F',np.nan], 
    'interests':['acting', 'cars', 'gardening']})

join_df1_df2_repeated_col(df1, df2)

相关问题 更多 >