如何使用矢量化对一个数据帧的结果进行分组、剪切、转置和合并

df_size = 1000000
# np.random.random_integers is deprecated in favour of randint; randint's
# upper bound is exclusive, so 1001 and 11 keep the original inclusive
# ranges 0..1000 and 0..10.
df_random = pd.DataFrame({
    'boat_id': np.random.choice(range(300), df_size),
    'X': np.random.randint(0, 1001, df_size),
    'target_Y': np.random.randint(0, 11, df_size),
})

# Sample of the resulting frame:
#      X  boat_id  target_Y
# 0  482      275         6
# 1  705      245         4
# 2  328      102         6
# 3  631      227         6
# 4  234      236         8
# ...

X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 target_Y boat_id 40055 684.0 692.0 950.0 572.0 442.0 850.0 75.0 140.0 382.0 576.0 0.0 1 40056 178.0 949.0 490.0 777.0 335.0 559.0 397.0 729.0 701.0 44.0 4.0 1 40057 21.0 818.0 341.0 577.0 612.0 57.0 303.0 183.0 519.0 357.0 0.0 1 40058 501.0 1000.0 999.0 532.0 765.0 913.0 964.0 922.0 772.0 534.0 1.0 2 40059 305.0 906.0 724.0 996.0 237.0 197.0 414.0 171.0 369.0 299.0 8.0 2 40060 408.0 796.0 815.0 638.0 691.0 598.0 913.0 579.0 650.0 955.0 2.0 3 40061 298.0 512.0 247.0 824.0 764.0 414.0 71.0 440.0 135.0 707.0 9.0 4 40062 535.0 687.0 945.0 859.0 718.0 580.0 427.0 284.0 122.0 777.0 2.0 4 40063 352.0 115.0 228.0 69.0 497.0 387.0 552.0 473.0 574.0 759.0 3.0 4 40064 179.0 870.0 862.0 186.0 25.0 125.0 925.0 310.0 335.0 739.0 7.0 4 ...

start_time = time.time()
N = 10  # number of consecutive X values gathered into one output line
# A real list (not a lazy map object) so the names can be reused on every
# iteration of the loops below.
col_names = ['X' + str(x) for x in range(N)]
compil = pd.DataFrame(columns=col_names)
i = 0  # label of the next line written to compil

# I group by boat ID
for boat_id, df_boat in df_random.groupby('boat_id'):
    # then I take a cut every 5 lines (the step used by the condition below)
    for line_number, (index, row) in enumerate(df_boat.iterrows()):
        if line_number % 5 == 0:
            # the X values of the N lines just before the current one
            compil_new_line_X = list(df_boat.iloc[line_number - N:line_number, :]["X"])
            # filter to avoid issues at the start and end of the columns
            if len(compil_new_line_X) == N:
                compil.loc[i, col_names] = compil_new_line_X
                compil.loc[i, 'target_Y'] = row['target_Y']
                compil.loc[i, 'boat_id'] = row['boat_id']
                i += 1

print("Total %s seconds" % (time.time() - start_time))

1条回答

网友

1楼 · 发布于 2024-10-01 07:45:20

这是一个将计算时间缩短约 35% 的解决方案。它先用 `groupby` 按 `boat_id` 分组，然后在每个组内再用 `groupby(...).apply` 把该组切分成固定长度的小块，最后一次 `apply` 为每个小块生成一行新数据。这个方案也许还能进一步改进。

df_size = 1000000
# np.random.random_integers is deprecated in favour of randint; randint's
# upper bound is exclusive, so 1001 and 11 keep the original inclusive
# ranges 0..1000 and 0..10.
df_random = pd.DataFrame({'boat_id': np.random.choice(range(300), df_size),
                          'X': np.random.randint(0, 1001, df_size),
                          'target_Y': np.random.randint(0, 11, df_size)})

start_time = time.time()
len_of_chunks = 10
# One 'X<i>' column per position inside a chunk, followed by the two
# identifying columns. Built from len_of_chunks (the snippet defines no N)
# and as a plain list so the concatenation also works on Python 3.
col_names = ['X' + str(x) for x in range(len_of_chunks)] + ['boat_id', 'target_Y']


def prepare_data(group):
    """Build one output line from a chunk of rows.

    The line holds every value of the chunk's "X" column, followed by the
    'boat_id' and 'target_Y' values read from the chunk's last row.
    """
    kept_columns = ['boat_id', 'target_Y']
    # tail(1) yields a one-row frame; its .values is 2-D with a single row.
    last_row_values = group.tail(1)[kept_columns].values

    x_values = group["X"].values
    # [0] picks that single row so both pieces are 1-D before joining.
    return np.hstack([x_values, last_row_values[0]])


def _chunk_and_prepare(boat_rows):
    """Split one boat's rows into consecutive chunks and prepare each chunk."""
    # Integer division of the row position gives the same label to each run
    # of len_of_chunks consecutive rows.
    chunk_labels = np.arange(len(boat_rows)) // len_of_chunks
    return boat_rows.groupby(chunk_labels).apply(prepare_data)


# Outer grouping is per boat; the helper handles the per-chunk grouping.
groups = df_random.groupby('boat_id').apply(_chunk_and_prepare)

# After resetting the index, the prepared lines are read from the column
# labelled 0 and turned into a plain list of rows.
prepared_lines = groups.reset_index()[0].values.tolist()

# Build the 'compil' frame; dropna() removes the incomplete lines that come
# from chunks shorter than len_of_chunks.
compil = pd.DataFrame(prepared_lines, columns=col_names).dropna()


print("Total  %s seconds" % (time.time() - start_time))

总计153.781999826秒

相关问题更多 >

编程相关推荐

热门问题

热门文章