我有超过1000个不同id
的分组,我只需要选择一个分组的特定数目,然后读取每个分组的nth
个数。Here我需要的示例:
#These are the codes from different answers
import pandas as pd
import numpy as np
import time
import sys
df = pd.DataFrame({
'index':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'id' : ['veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192','veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192', 'veh1192'],
'veh_x' :[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884],
'veh_y':[0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884,0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3863, 3864, 3865, 3866, 3867, 3868, 3869, 3870, 3871, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3881, 3882, 3883, 3884]
}
)
data=['veh0', 'veh1', 'veh2', 'veh3']
# print(df.groupby(['id']).head(1))
#first part
start = time.clock()
for i in range(0,20):
g=df.groupby(['id']).nth([i]).reset_index()
for x in data:
for idx, row in g.iterrows():
if x==row['id']:
print("code1 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start) )
#second part
#This is what I need but it is running slowly when I add it to my whole dataset
start = time.clock()
for i in range(0,20):
for x in data: #these are the selected groups
g = df[df['id'].isin([x])].groupby(['id']).nth([i]).reset_index()
for x, row in g.iterrows():
print("code2 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start) )
#Third part
start = time.clock()
for i in range(0,20):
g=df[df['id'].isin(data)].groupby('id').nth([i]).reset_index()
for x, row in g.iterrows():
print("code3 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start))
#fourth part
start = time.clock()
df2 = df[df['id'].isin(data)]
for i in range(0,20):
for x in data:
row = df2.groupby('id').nth(i)
if(x in row.index):
print("code4 group",i, " = ", x)
end = time.clock()
print ("%.2gs" % (end-start))
#fifth part
def printf(text):
print text
start = time.clock()
tmp = df.loc[df.id.isin(data)].groupby(['id']).apply(lambda x: x.reset_index(drop=True)).reset_index(level=1)
# cleanup and rename index
tmp = tmp.rename(columns={'level_1': 'group'})
# print 20 first groups
for i in range(20):
lst= tmp.loc[tmp.group == i].apply(lambda x:x, axis=1)
for x, row in lst.iterrows():
print("code5 group",i,"=",row['id'])
end = time.clock()
print ("%.2gs" % (end-start))
代码的第一部分读取所有组并返回每个组的nth
编号,但我只需要5个或6个或更多个。问题是我不知道这个团体的任何信息。我可以使用counter
,然后我可以使用break
,但是代码运行得太慢了,因为每次迭代都需要加载30000多条记录。这里我添加了data=['veh0', 'veh1', 'veh2', 'veh3']
作为示例,但它可以随机选择。在
第二部分是我想要的,但是代码仍然运行缓慢。第二部分取0.43s,第一部分取0.14s,第三部分取0.077s,最好的方法是什么?在
谢谢你的帮助?在
我想你不需要这样的for循环:
据我所知,你的问题:
这将为每个id精确地提供第0、第5和第9个数据行(根据您的情况修改列表[0,5,9]:
^{pr2}$编辑: 也许这会对您有所帮助(根据您的情况修改列表[1300]:
如果我没弄错你的意思,你希望这样做:
与预期输出不同的唯一方法是组没有编号(在示例代码中是
i
)。但是你没有明确说明这是一个要求?在如果我错了,请纠正我。在
相关问题 更多 >
编程相关推荐