<p>下面是另一个<code>numpy</code>解决方案。首先给出与 @piRSquared 的方案的计时对比,以供参考:在结果完全相同的前提下,我的代码在一个大型示例上快了约 14 倍。</p>
<pre><code># both methods give the expected result on small OP example
result
id1 2
id2 1
id3 3
id4 6
id5 1
id6 3
id7 0
id8 3
id9 1
id10 2
result
id1 2
id2 1
id3 3
id4 6
id5 1
id6 3
id7 0
id8 3
id9 1
id10 2
# timings on 50,000 rows random example
pp 12.89263810031116
pi 189.0821446024347
# comparison of results
result True
dtype: bool
</code></pre>
<p>代码:</p>
<pre><code>import pandas as pd
import numpy as np
# --- Small example from the original question: ten ids, seven days ----------
idx = [f'id{n}' for n in range(1, 11)]
data = {
    'Day1': [0, 0, 1, 0, 1, 1, 0, 0, 1, 1],
    'Day2': [0, 1, 1, 1, 2, 1, 0, 1, 1, 2],
    'Day3': [1, 3, 1, 1, 1, 0, 0, 1, 3, 2],
    'Day4': [1, 2, 0, 1, 1, 0, 0, 2, 1, 1],
    'Day5': [0, 2, 1, 1, 1, 1, 0, 2, 1, 1],
    'Day6': [1, 0, 1, 1, 2, 1, 0, 2, 1, 1],
    'Day7': [0, 0, 0, 1, 1, 1, 0, 0, 3, 1],
}
df = pd.DataFrame(data, index=idx)
# Every id uses the full window: first day 1, last day 7 (1-based, inclusive).
startday = pd.DataFrame({'start': [1] * len(idx)}, index=idx)
endday = pd.DataFrame({'end': [7] * len(idx)}, index=idx)
# Sentinel constants; not referenced by the code visible in this snippet.
Neg99, Neg90 = -999, -900

# --- Large random example: 50,000 ids, seven days ---------------------------
IDX = [f'id{n}' for n in range(1, 50_001)]
# Draw two day numbers in 1..7 per id and sort each pair so start <= end.
_window = np.sort(np.random.randint(1, 8, (2, 50_000)), axis=0)
STARTDAY = pd.DataFrame({'start': _window[0]}, index=IDX)
ENDDAY = pd.DataFrame({'end': _window[1]}, index=IDX)
del _window
# One column of random values in 0..3 per day.
DF = pd.DataFrame(
    dict(zip((f'Day{d}' for d in range(1, 8)),
             np.random.randint(0, 4, (7, 50_000)))),
    index=IDX,
)
def pp():
    """Return, per row, the longest run of consecutive days equal to the row maximum.

    Reads these module-level globals:

    * ``df`` -- one row per id, one column per day (column k is day k + 1).
    * ``startday`` / ``endday`` -- single-column frames holding each row's
      1-based, inclusive first and last day.
    * ``restrict_max`` -- if truthy, the maximum is taken over the days inside
      the ``[start, end]`` window only (days outside count as 0); if falsy,
      the maximum is taken over the whole row and only in-window days may
      contribute to a run.

    A row that is entirely 0 (after windowing when ``restrict_max`` is
    truthy, otherwise as stored) yields 0.

    Returns:
        DataFrame with a single integer ``result`` column, indexed like ``df``.
    """
    values = df.values
    # 1-based day number of every column, derived from the frame so the
    # function is not tied to exactly seven day columns.
    days = np.arange(1, values.shape[1] + 1)
    # (n, 1) bounds broadcast against (n_days,) -> (n, n_days) window mask.
    in_window = (startday.values <= days) & (endday.values >= days)

    if restrict_max:
        data = np.where(in_window, values, 0)
        # An all-zero row gets target 1 (True), which none of its cells
        # equals, so such a row produces no run at all.
        target = np.maximum((data == 0).all(1), data.max(1))
        mask = data == target[:, None]
    else:
        target = np.maximum((values == 0).all(1), values.max(1))
        mask = (values == target[:, None]) & in_window

    # np.diff on booleans marks positions where neighbours differ; with a
    # False pad on both sides every run contributes exactly one start and one
    # end, which np.where lists row by row in alternating order.
    rows, cols = np.where(np.diff(mask, axis=1, prepend=False, append=False))
    run_rows = rows[::2]
    run_len = cols[1::2] - cols[::2]

    res = np.zeros(values.shape[0], int)
    if run_rows.size:
        # Position of the first run belonging to each row that has any run;
        # reduceat then takes the maximum run length within each such group.
        first = np.flatnonzero(np.diff(run_rows, prepend=-1))
        res[run_rows[first]] = np.maximum.reduceat(run_len, first)
    return pd.DataFrame({'result': res}, index=df.index)
def pi():
    """Reference solution that ``pp`` is timed and checked against.

    Reads the module-level globals ``df`` (one row per id, one column per
    day), ``startday`` (column ``start``) and ``endday`` (column ``end``),
    where start/end are 1-based, inclusive day bounds per row.

    For every row it takes the maximum over the in-window days and returns the
    length of the longest stretch of consecutive in-window days equal to that
    maximum. A row whose in-window maximum is 0 yields 0.

    Returns:
        DataFrame with a single ``result`` column, indexed like ``df``.
    """
    sd = startday.start.values
    ed = endday.end.values
    # Number of in-window days per row.
    span = ed - sd + 1
    # (row_ix, col_ix): coordinates of every in-window cell, row by row,
    # with 0-based column positions.
    row_ix = np.arange(len(df)).repeat(span)
    col_ix = np.concatenate([np.arange(s - 1, e) for s, e in zip(sd, ed)])
    v = df.values

    # Per-row maximum over the window, seeded with the global minimum so that
    # any in-window value can only raise it.
    mx = np.empty(len(v), dtype=v.dtype)
    mx.fill(v.min())
    np.maximum.at(mx, row_ix, v[row_ix, col_ix])

    # "Break" mask with one sentinel column on each side. Everything starts as
    # a break; in-window cells stay a break unless they equal a non-zero row
    # maximum.
    breaks = np.ones((v.shape[0], v.shape[1] + 2), bool)
    breaks[row_ix, col_ix + 1] = (v[row_ix, col_ix] != mx[row_ix]) | (mx[row_ix] == 0)

    # Column distance between consecutive breaks. Stepping from one row's last
    # sentinel to the next row's first sentinel gives a negative difference,
    # so keeping only positive gaps keeps only same-row pairs.
    b_rows, b_cols = np.where(breaks)
    gaps = np.diff(b_cols)
    same_row = gaps > 0
    row_gaps = gaps[same_row]
    gap_rows = b_rows[1:][same_row]

    # Largest gap per row; a gap of g leaves g - 1 cells between two breaks.
    widest = np.empty(len(v), int)
    widest.fill(row_gaps.min())
    np.maximum.at(widest, gap_rows, row_gaps)
    return pd.DataFrame({'result': widest - 1}, index=df.index)
from timeit import timeit

# pp() consults this module-level flag to decide how the row maximum is taken.
restrict_max = True

# Show both solvers' output on the small example.
print(pp(), pi(), sep='\n')

# Re-point the globals that both solvers read at the large random data set.
df, startday, endday = DF, STARTDAY, ENDDAY

# timeit returns the total seconds for the 10 calls, so multiplying by 100
# reports milliseconds per call.
print('pp', 100 * timeit(pp, number=10))
print('pi', 100 * timeit(pi, number=10))

# Element-wise comparison of the two results, reduced per column.
print((pp() == pi()).all())
</code></pre>