回答此问题可获得 20 贡献值,回答如果被采纳可获得 50 分。
<p>我有 1000 多个 <code>id</code> 各不相同的分组,但我只需要从中选出特定数量的分组,然后读取每个所选分组的第 <code>n</code> 行(<code>nth</code>)。<a href="http://tpcg.io/XyNS8UNP" rel="nofollow noreferrer">这里</a>是我所需功能的示例:</p>
<pre><code> # The snippets below are the approaches suggested in different answers
import pandas as pd
import numpy as np
import time
import sys
# Sample data. The three numeric columns ('index', 'veh_x', 'veh_y') all hold
# the same sequence, and the whole 60-row pattern is repeated twice, so the
# frame is assembled from its parts instead of four 120-item literals.
_early_steps = [
    0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
    8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11,
]
# One id per entry of _early_steps, in the same order.
_early_ids = [
    'veh0', 'veh0', 'veh0', 'veh1', 'veh0', 'veh1', 'veh0', 'veh1',
    'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2', 'veh0', 'veh1', 'veh2',
    'veh0', 'veh1', 'veh2', 'veh3', 'veh0', 'veh1', 'veh2', 'veh3',
    'veh0', 'veh1', 'veh2', 'veh3', 'veh0',
]
# 3855..3884 inclusive, every row belonging to 'veh1192'.
_late_steps = list(range(3855, 3885))
_late_ids = ['veh1192'] * len(_late_steps)

_steps = (_early_steps + _late_steps) * 2
_ids = (_early_ids + _late_ids) * 2

df = pd.DataFrame({
    'index': _steps,
    'id': _ids,
    'veh_x': _steps,
    'veh_y': _steps,
})
# Ids of the groups the benchmarks below select from df.
data = ['veh0', 'veh1', 'veh2', 'veh3']
# print(df.groupby(['id']).head(1))
#first part
# Candidate 1: take the i-th row of every group in df, then print only the
# rows whose id is listed in `data`.
# time.clock() no longer exists on Python 3.8 and later, so use
# time.perf_counter() whenever the interpreter provides it.
timer = getattr(time, 'perf_counter', None) or time.clock
start = timer()
for i in range(0, 20):
    # reset_index() so that 'id' can be read as a column of each row.
    g = df.groupby(['id']).nth([i]).reset_index()
    for x in data:
        for idx, row in g.iterrows():
            if x == row['id']:
                print("code1 group", i, "=", row['id'])
end = timer()
print("%.2gs" % (end - start))
#second part
#This is what I need but it is running slowly when I add it to my whole dataset
# Candidate 2: for every selected id, filter df down to that single id and
# print its i-th row (nothing is printed when the group has no i-th row).
# time.clock() no longer exists on Python 3.8 and later, so use
# time.perf_counter() whenever the interpreter provides it.
timer = getattr(time, 'perf_counter', None) or time.clock
start = timer()
for i in range(0, 20):
    for x in data:  # these are the selected groups
        g = df[df['id'].isin([x])].groupby(['id']).nth([i]).reset_index()
        # The row label is unused; `_` avoids shadowing the outer `x`.
        for _, row in g.iterrows():
            print("code2 group", i, "=", row['id'])
end = timer()
print("%.2gs" % (end - start))
#Third part
# Candidate 3: keep only the rows whose id is in `data`, then print the i-th
# row of each remaining group.
# time.clock() no longer exists on Python 3.8 and later, so use
# time.perf_counter() whenever the interpreter provides it.
timer = getattr(time, 'perf_counter', None) or time.clock
start = timer()
for i in range(0, 20):
    g = df[df['id'].isin(data)].groupby('id').nth([i]).reset_index()
    # The row label is unused; only the 'id' value of each row is printed.
    for _, row in g.iterrows():
        print("code3 group", i, "=", row['id'])
end = timer()
print("%.2gs" % (end - start))
#fourth part
# Candidate 4: filter df once to the selected ids, then for each i print the
# selected ids whose group has an i-th row.
# time.clock() no longer exists on Python 3.8 and later, so use
# time.perf_counter() whenever the interpreter provides it.
timer = getattr(time, 'perf_counter', None) or time.clock
start = timer()
df2 = df[df['id'].isin(data)]
for i in range(0, 20):
    # Computed once per i because the result does not depend on x.
    # reset_index() exposes 'id' as a column both when nth() returns the
    # group keys as the index (older pandas) and when it returns the original
    # row labels (pandas 2.0 and later, where nth() acts as a filter), so the
    # membership test below works on either version.
    ids_with_row = set(df2.groupby('id').nth(i).reset_index()['id'])
    for x in data:
        if x in ids_with_row:
            print("code4 group", i, " = ", x)
end = timer()
print("%.2gs" % (end - start))
#fifth part
def printf(text):
    """Print `text` to standard output."""
    # Call form of print: the statement form is a syntax error on Python 3.
    print(text)
# Candidate 5: number the rows inside each selected group once, then select
# rows by that number.
# time.clock() no longer exists on Python 3.8 and later, so use
# time.perf_counter() whenever the interpreter provides it.
timer = getattr(time, 'perf_counter', None) or time.clock
start = timer()
# Within every group the index is reset to 0, 1, 2, ...; reset_index(level=1)
# then turns that per-group position into a column.
tmp = df.loc[df.id.isin(data)].groupby(['id']).apply(lambda x: x.reset_index(drop=True)).reset_index(level=1)
# cleanup and rename index
tmp = tmp.rename(columns={'level_1': 'group'})
# print 20 first groups
for i in range(20):
    # The row-wise apply uses an identity function; it is kept so the timing
    # still measures this approach as written.
    lst = tmp.loc[tmp.group == i].apply(lambda x: x, axis=1)
    for _, row in lst.iterrows():
        print("code5 group", i, "=", row['id'])
end = timer()
print("%.2gs" % (end - start))
</code></pre>
<p>代码的第一部分会读取所有分组并返回每个分组的第 <code>n</code> 行(<code>nth</code>),但我只需要其中 5 个、6 个或更多个分组。问题是我事先并不知道这些分组的任何信息。我可以使用计数器 <code>counter</code> 再配合 <code>break</code>,但代码运行得太慢,因为每次迭代都要加载 30000 多条记录。这里我添加了 <code>data=['veh0', 'veh1', 'veh2', 'veh3']</code> 作为示例,但实际选择的分组可以是随机的。</p>
<p>第二部分正是我想要的结果,但代码仍然运行缓慢。第二部分耗时 0.43s,第一部分耗时 0.14s,第三部分耗时 0.077s。最好的方法是什么?</p>
<p>谢谢你的帮助!</p>