在应用groupby之后,获得特定数量组的最快和最佳方法是什么?

2024-07-01 08:12:20 发布

您现在位置:Python中文网/ 问答频道 /正文

我有超过1000个不同id的分组,我只需要选择一个分组的特定数目,然后读取每个分组的nth个数。Here我需要的示例:

 #These are the codes from different answers 
 import pandas as pd
import numpy as np
import time
import sys
df = pd.DataFrame({
  'index':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'id'    :   ['veh0',    'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192','veh0',    'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192',  'veh1192'],
'veh_x' :[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'veh_y':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884]

}
)


data=['veh0',   'veh1', 'veh2', 'veh3'] 
# print(df.groupby(['id']).head(1))
#first  part
start = time.clock()
for i in range(0,20):
  g=df.groupby(['id']).nth([i]).reset_index()
  for x in data:
      for idx, row in g.iterrows():
          if x==row['id']:
             print("code1 group",i,"=",row['id'])

end = time.clock()
print ("%.2gs" % (end-start) )      


#second part
#This is what I need but it is running slowly when I add it to my whole dataset


start = time.clock()


for i in range(0,20):
 for x in data: #these are the selected groups
            g = df[df['id'].isin([x])].groupby(['id']).nth([i]).reset_index()
            for x, row in g.iterrows():
               print("code2 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start) ) 


#Third part

start = time.clock()               

for i in range(0,20):
    g=df[df['id'].isin(data)].groupby('id').nth([i]).reset_index()

    for x, row in g.iterrows():
               print("code3 group",i,"=",row['id'])

end = time.clock()
print ("%.2gs" % (end-start))


#fourth part

start = time.clock()               
df2 = df[df['id'].isin(data)] 
for i in range(0,20): 
  for x in data: 
      row = df2.groupby('id').nth(i) 
      if(x in row.index): 
            print("code4 group",i, " = ", x) 

end = time.clock()
print ("%.2gs" % (end-start))

#fifth  part
def printf(text):
   print text
start = time.clock()               

tmp = df.loc[df.id.isin(data)].groupby(['id']).apply(lambda x: x.reset_index(drop=True)).reset_index(level=1)

# cleanup and rename index

tmp = tmp.rename(columns={'level_1': 'group'})
# print 20 first groups
for i in range(20): 

    lst= tmp.loc[tmp.group == i].apply(lambda x:x, axis=1)
    for x, row in lst.iterrows():
               print("code5 group",i,"=",row['id'])


end = time.clock()
print ("%.2gs" % (end-start))

代码的第一部分读取所有组并返回每个组的nth编号,但我只需要5个或6个或更多个。问题是我不知道这个团体的任何信息。我可以使用counter,然后我可以使用break,但是代码运行得太慢了,因为每次迭代都需要加载30000多条记录。这里我添加了data=['veh0', 'veh1', 'veh2', 'veh3']作为示例,但它可以随机选择。在

第二部分是我想要的,但是代码仍然运行缓慢。第二部分取0.43s,第一部分取0.14s,第三部分取0.077s,最好的方法是什么?在

谢谢你的帮助?在


Tags: iniddfforindextimestartend
3条回答

我想你不需要这样的for循环:

data = ['veh0', 'veh1', 'veh2', 'veh3']

# insert group index for each id group
tmp = (df.loc[df.id.isin(data)]
       .groupby(['id'])
       .apply(lambda x: x.reset_index(drop=True))
       .reset_index(level=1))
# cleanup and rename index
tmp = tmp.reset_index(drop=True)
tmp = tmp.rename(columns={'level_1': 'group'})

# fast print 20 first groups
print(tmp.loc[tmp.group.isin(list(range(20)))]
  .sort_values('group')[['group', 'id']]
  .to_string(formatters={'group':'group {} ='.format}, 
             index=False, 
             header=None))

# slow print with loop
for i in range(20): 
    lst = tmp.loc[tmp.group == i]
    for x, row in lst.iterrows():
               print("code5 group",i,"=",row['id'])

据我所知,你的问题:

>>> import pandas as pd
>>> df = \
pd.DataFrame(
    {
        'id': [i for i in range (1000)]*10,
        'col1': ['col1 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)],
        'col2': ['col2 occurence {} for id {}'.format(j, i) for j in range(10) for i in range (1000)]
    }
)
>>> df.head()

   id                       col1                       col2
0   0  col1 occurence 0 for id 0  col2 occurence 0 for id 0
1   1  col1 occurence 0 for id 1  col2 occurence 0 for id 1
2   2  col1 occurence 0 for id 2  col2 occurence 0 for id 2
3   3  col1 occurence 0 for id 3  col2 occurence 0 for id 3
4   4  col1 occurence 0 for id 4  col2 occurence 0 for id 4

这将为每个id精确地提供第0、第5和第9个数据行(根据您的情况修改列表[0,5,9]:

^{pr2}$

编辑: 也许这会对您有所帮助(根据您的情况修改列表[1300]:

>>> df[df['id'].isin([1,300])].groupby(['id']).nth([0]).reset_index()
    id                         col1                         col2
0    1    col1 occurence 0 for id 1    col2 occurence 0 for id 1
1  300  col1 occurence 0 for id 300  col2 occurence 0 for id 300

如果我没弄错你的意思,你希望这样做:

data = ['veh0', 'veh1', 'veh2', 'veh3']
n_groups = 3
df[df['id'].isin(data)].groupby('id').head(n_groups)['id']

与预期输出不同的唯一方法是组没有编号(在示例代码中是i)。但是你没有明确说明这是一个要求?在

如果我错了,请纠正我。在

相关问题 更多 >

    热门问题