根据另一个数据帧中的最近位置填充数据帧中的缺失值

import pandas as pd import numpy as np date = pd.date_range(start='2020-01-01', freq='H', periods=4) locations = ["AA3", "AB1", "AD1", "AC0"] x = [5.5, 10.2, np.nan, 2.3, 11.2, np.nan, 2.1, 4.0, 6.1, np.nan, 20.3, 11.3, 4.9, 15.2, 21.3, np.nan] df = pd.DataFrame({'x': x}) df.index = pd.MultiIndex.from_product([locations, date], names=['location', 'date']) df = df.sort_index() df

x location date AA3 2020-01-01 00:00:00 5.5 2020-01-01 01:00:00 10.2 2020-01-01 02:00:00 NaN 2020-01-01 03:00:00 2.3 AB1 2020-01-01 00:00:00 11.2 2020-01-01 01:00:00 NaN 2020-01-01 02:00:00 2.1 2020-01-01 03:00:00 4.0 AC0 2020-01-01 00:00:00 4.9 2020-01-01 01:00:00 15.2 2020-01-01 02:00:00 21.3 2020-01-01 03:00:00 NaN AD1 2020-01-01 00:00:00 6.1 2020-01-01 01:00:00 NaN 2020-01-01 02:00:00 20.3 2020-01-01 03:00:00 11.3

nearest = pd.DataFrame({"AA3": ["AA3", "AB1", "AD1", "AC0"], "AB1": ["AB1", "AA3", "AC0", "AD1"], "AD1": ["AD1", "AC0", "AB1", "AA3"], "AC0": ["AC0", "AD1", "AA3", "AB1"]}) nearest

x location date AA3 2020-01-01 00:00:00 5.5 2020-01-01 01:00:00 10.2 2020-01-01 02:00:00 2.1 2020-01-01 03:00:00 2.3 AB1 2020-01-01 00:00:00 11.2 2020-01-01 01:00:00 10.2 2020-01-01 02:00:00 2.1 2020-01-01 03:00:00 4.0 AC0 2020-01-01 00:00:00 4.9 2020-01-01 01:00:00 15.2 2020-01-01 02:00:00 21.3 2020-01-01 03:00:00 11.3 AD1 2020-01-01 00:00:00 6.1 2020-01-01 01:00:00 15.2 2020-01-01 02:00:00 20.3 2020-01-01 03:00:00 11.3

def fillna_by_nearest(x: pd.Series, nn_data: pd.DataFrame): out = x.copy() for index, value in x.iteritems(): if np.isnan(value) and (index[0] in nn_data.columns): location, date = index for near_location in nn_data[location]: if ((near_location, date) in x.index) and pd.notna(x.loc[near_location, date]): out.loc[index] = x.loc[near_location, date] break return out fillna_by_nearest(df['x'], nearest)

2条回答

网友

1楼 · 编辑于 2024-09-29 01:35:51

我同意Serial Lazer的说法，没有对熊猫/裸体进行更整洁的处决。要求有点复杂。在这种情况下，您应该创建自己的函数。下面是一个例子

nearest = pd.DataFrame({"AA3": ["AA3", "AB1", "AD1", "AC0"],
                        "AB1": ["AB1", "AA3", "AC0", "AD1"],
                        "AD1": ["AD1", "AC0", "AB1", "AA3"],
                        "AC0": ["AC0", "AD1", "AA3", "AB1"]})


def fill_by_nearest(sr: pd.Series):
    if not np.isnan(sr['x']):
        return sr

    location = sr.name[0]
    date = sr.name[1]
    for near_location in nearest[location]:
        if not np.isnan(df.loc[near_location, date]['x']):
            sr['x'] = df.loc[near_location, date]['x']
            return sr
    return sr

df = df.apply(fill_by_nearest, axis=1)

网友

2楼 · 编辑于 2024-09-29 01:35:51

您可以使用应用功能：

def find_nearest(row): 
    for item in list(nearest[row['location']]):
        if len(df[lambda x: (x['location']==item) & (x['date']==row['date']) &(~pd.isnull(x['x']))]):
            return df[lambda x: (x['location']==item) & (x['date']==row['date']) &(~pd.isnull(x['x']))].x.values[0]
    
df = df.reset_index()        
df = df.assign(x = lambda x: x.apply(find_nearest,axis=1))

输出：

   location                date     x
0       AA3 2020-01-01 00:00:00   5.5
1       AA3 2020-01-01 01:00:00  10.2
2       AA3 2020-01-01 02:00:00   2.1
3       AA3 2020-01-01 03:00:00   2.3
4       AB1 2020-01-01 00:00:00  11.2
5       AB1 2020-01-01 01:00:00  10.2
6       AB1 2020-01-01 02:00:00   2.1
7       AB1 2020-01-01 03:00:00   4.0
8       AC0 2020-01-01 00:00:00   4.9
9       AC0 2020-01-01 01:00:00  15.2
10      AC0 2020-01-01 02:00:00  21.3
11      AC0 2020-01-01 03:00:00  11.3
12      AD1 2020-01-01 00:00:00   6.1
13      AD1 2020-01-01 01:00:00  15.2
14      AD1 2020-01-01 02:00:00  20.3
15      AD1 2020-01-01 03:00:00  11.3

输出：

相关问题更多 >

编程相关推荐

热门问题

热门文章