import numpy as np
import matplotlib.pyplot as plt
max_num = 10 # maximum number of pageviews we want to consider
# replicate the experiment ntrials times and find the probability for selection of any pageview
pageview_indices = []
ntrials = 10000
for _ in range(ntrials):
pageview_index = None # index of the single pageview to be kept
i = 0
while True: # streaming pageviews
i += 1 # next pageview
if i > max_num:
break
# keep first pageview and from next pageview onwards discard the old one kept with probability 1 - 1/i
pageview_index = 1 if i == 1 else np.random.choice([pageview_index, i], 1, p=[1-1./i, 1./i])[0]
#print 'pageview chosen:', pageview_index
print 'Final pageview chosen:', pageview_index
pageview_indices.append(pageview_index)
plt.hist(pageview_indices, max_num, normed=1, facecolor='green', alpha=0.75)
plt.xlabel('Pageview Index')
plt.ylabel('Probability Chosen')
plt.title('Reservoir Sampling')
plt.axis([0, max_num+1, 0, 0.15])
plt.xticks(range(1, max_num+1))
plt.grid(True)
根据水库采样算法(可在此处找到:https://en.wikipedia.org/wiki/Reservoir_sampling),其中我们只存储一个页面视图(水库大小=1),以下实现说明了从流式页面视图进行概率选择的策略如何导致统一的选择概率:
从上面可以看出,选择页面视图索引的概率几乎是一致的(10个页面视图中每个页面的概率是1/10),从数学上也可以证明它是一致的。在
相关问题 更多 >
编程相关推荐