在应用groupby之后，获得特定数量组的最快和最佳方法是什么？

#These are the codes from different answers import pandas as pd import numpy as np import time import sys df = pd.DataFrame({ 'index':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884], 'id' : ['veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192','veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192'], 'veh_x' :[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884], 'veh_y':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884] } ) data=['veh0', 'veh1', 'veh2', 'veh3'] # print(df.groupby(['id']).head(1)) #first part start = time.clock() for i in range(0,20): g=df.groupby(['id']).nth([i]).reset_index() for x in data: for idx, row in g.iterrows(): if x==row['id']: print("code1 group",i,"=",row['id']) end = time.clock() print ("%.2gs" % (end-start) ) #second part #This is what I need but it is running slowly when I add it to my whole dataset start = time.clock() for i in range(0,20): for x in data: #these are the selected groups g = df[df['id'].isin([x])].groupby(['id']).nth([i]).reset_index() for x, row in g.iterrows(): print("code2 group",i,"=",row['id']) end = time.clock() print ("%.2gs" % (end-start) ) #Third part start = time.clock() for i in range(0,20): g=df[df['id'].isin(data)].groupby('id').nth([i]).reset_index() for x, row in g.iterrows(): print("code3 group",i,"=",row['id']) end = time.clock() print ("%.2gs" % (end-start)) #fourth part start = time.clock() df2 = df[df['id'].isin(data)] for i in range(0,20): for x in data: row = df2.groupby('id').nth(i) if(x in row.index): print("code4 group",i, " = ", x) end = time.clock() print ("%.2gs" % (end-start)) #fifth part def printf(text): print text start = time.clock() tmp = df.loc[df.id.isin(data)].groupby(['id']).apply(lambda x: x.reset_index(drop=True)).reset_index(level=1) # cleanup and rename index tmp = tmp.rename(columns={'level_1': 'group'}) # print 20 first groups for i in range(20): lst= tmp.loc[tmp.group == i].apply(lambda x:x, axis=1) for x, row in lst.iterrows(): print("code5 group",i,"=",row['id']) end = time.clock() print ("%.2gs" % (end-start))

3条回答

网友

1楼 · 编辑于 2024-07-01 08:12:20

我想你不需要这样的for循环：

data = ['veh0', 'veh1', 'veh2', 'veh3']

# insert group index for each id group
tmp = (df.loc[df.id.isin(data)]
       .groupby(['id'])
       .apply(lambda x: x.reset_index(drop=True))
       .reset_index(level=1))
# cleanup and rename index
tmp = tmp.reset_index(drop=True)
tmp = tmp.rename(columns={'level_1': 'group'})

# fast print 20 first groups
print(tmp.loc[tmp.group.isin(list(range(20)))]
  .sort_values('group')[['group', 'id']]
  .to_string(formatters={'group':'group {} ='.format}, 
             index=False, 
             header=None))

# slow print with loop
for i in range(20): 
    lst = tmp.loc[tmp.group == i]
    for x, row in lst.iterrows():
               print("code5 group",i,"=",row['id'])

网友

2楼 · 编辑于 2024-07-01 08:12:20

据我所知，你的问题：

>>> import pandas as pd
>>> df = \
pd.DataFrame(
    {
        'id': [i for i in range (1000)]*10,
        'col1': ['col1 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)],
        'col2': ['col2 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)]
    }
)
>>> df.head()

   id                       col1                       col2
0   0  col1 occurence 0 for id 0  col2 occurence 0 for id 0
1   1  col1 occurence 0 for id 1  col2 occurence 0 for id 1
2   2  col1 occurence 0 for id 2  col2 occurence 0 for id 2
3   3  col1 occurence 0 for id 3  col2 occurence 0 for id 3
4   4  col1 occurence 0 for id 4  col2 occurence 0 for id 4

这将为每个id精确地提供第0、第5和第9个数据行（根据您的情况修改列表[0,5,9]：

^{pr2}$

编辑： 也许这会对您有所帮助（根据您的情况修改列表[1300]：

>>> df[df['id'].isin([1,300])].groupby(['id']).nth([0]).reset_index()
    id                         col1                         col2
0    1    col1 occurence 0 for id 1    col2 occurence 0 for id 1
1  300  col1 occurence 0 for id 300  col2 occurence 0 for id 300

网友

3楼 · 编辑于 2024-07-01 08:12:20

如果我没弄错你的意思，你希望这样做：

data = ['veh0', 'veh1', 'veh2', 'veh3']
n_groups = 3
df[df['id'].isin(data)].groupby('id').head(n_groups)['id']

与预期输出不同的唯一方法是组没有编号（在示例代码中是i）。但是你没有明确说明这是一个要求？在

如果我错了，请纠正我。在

相关问题更多 >

编程相关推荐

热门问题

热门文章