<p>一段更通用的代码，改编自 @Rob Raymond 的建议（suggestion）。</p>
<pre><code>def replace_str_nan_by_np_nan(df_str_nan):
"""
dealing with nan strings, since fillna handles only np.nan
Args: df with string nan
Return: df with np.nan
Ex:
import pandas as pd
import numpy as np
df_str_nan = pd.DataFrame({
'age':['np.nan',34,19],
'gender':['Nan',np.nan,'M'],
'profession':['student', 'nan', 'artist']})
df_np_nan = replace_str_nan_by_np_nan(df_str_nan)
print(df_np_nan.isna())
age gender profession
0 True True False
1 False True True
2 False False False
"""
import numpy as np
df_np_nan = df_str_nan.copy()
for nan in ['np.nan', 'NaN', 'Nan', 'nan']:
df_np_nan = df_np_nan.replace(nan, np.nan, regex=True)
return df_np_nan
def join_df1_df2_repeated_col(df1, df2):
    """
    Join two dataframes on their index, merging the columns they share.

    For every column present in both frames, the value from df1 is kept
    and only its missing entries are filled with the value from df2.
    Textual NaN placeholders are converted to np.nan beforehand, since
    fillna handles only np.nan.

    Args:
        df1, df2: the two dataframes to join (df1 is the left side)
    Return:
        df_join: joined dataframe with one column per distinct name
    Ex:
        import pandas as pd
        import numpy as np
        df1 = pd.DataFrame({
            'age':[7,34,19],
            'gender':['F',np.nan,'M'],
            'profession':['student', 'CEO', 'artist']})
        df2 = pd.DataFrame({
            'age':[7,34,19],
            'gender':['np.nan','F',np.nan],
            'interests':['acting', 'cars', 'gardening']})
        print(join_df1_df2_repeated_col(df1, df2))
           age gender profession  interests
        0    7      F    student     acting
        1   34      F        CEO       cars
        2   19      M     artist  gardening
    """
    # dealing with nan strings, since fillna handles only np.nan
    df1 = replace_str_nan_by_np_nan(df1)
    df2 = replace_str_nan_by_np_nan(df2)

    rsuffix = "_r"
    # Work out the repeated columns from the inputs themselves rather than
    # from the suffix, so that a genuine df1 column whose name happens to
    # end in "_r" is not mistaken for a duplicate.
    shared_cols = [col for col in df1.columns if col in df2.columns]

    # join() keeps df1's name for a shared column and renames df2's copy
    # to the name followed by rsuffix.
    df_join = df1.join(df2, rsuffix=rsuffix)

    suffixed_cols = []
    for col in shared_cols:
        col_r = f"{col}{rsuffix}"
        df_join[col] = df_join[col].fillna(df_join[col_r])
        suffixed_cols.append(col_r)
    return df_join.drop(columns=suffixed_cols)
import pandas as pd
import numpy as np
# Sample frames sharing the 'age' and 'gender' columns; each one has gaps
# in 'gender' that the other frame can fill.
df1 = pd.DataFrame(dict(
    age=[7, 34, 19],
    gender=['F', np.nan, 'M'],
    profession=['student', 'CEO', 'artist'],
))
df2 = pd.DataFrame(dict(
    age=[7, 34, 19],
    gender=['np.nan', 'F', np.nan],
    interests=['acting', 'cars', 'gardening'],
))
join_df1_df2_repeated_col(df1, df2)
</code></pre>