Given an infinite stream (or a finite sequence of size n), choose k numbers randomly.
// n choose k
// keep a reservoir of size k
// Choose 1, 2, 3, ..., k first and put them into the reservoir.
// For k+i, pick it with a probability of k/(k+i), and randomly replace a number in the reservoir.
// Another easier way of calculating the probability is P(X was in the reservoir last time) * P(X is not replaced this time) = P(X was in the reservoir last time) * (1- P(X is replaced this time)) = k/(k+i-1) * (1 - k/(k+i) * 1/k) = k/(k+i).
// So for any element X, it was kept when i reaches n:
// 1. if X is the first k elements: P(X picked) * P(X not replaced in k+1) * P(not replaced in k+2) * ... = 1 * (1 - k/k+1 * 1/k) * (1 - k+1/k+2 * 1/k+1) = 1 * k/k+1 * k+1/k+2 * ... = k/n
// 2. else: P(X picked) * p(not replaced) = k/k+i * (1 - k+i/k+i+1 * 1/k+i) * ... = k/n
vector<int> ReservoirSampling(vector<int> v, int n, int k) {
assert(v.size() == n && k <= n);
// init: fill the first k elems into reservoir
vector<int> reservoirArray(v.begin(), v.begin() + k);
int i = 0;
int j = 0;
// start from the (k+1)th element to replace
for (i = k; i < n; ++i) {
j = rand() % (i + 1); // inclusive range [0, i]
if (j < k) {
reservoirArray[j] = v[i];
}
}
return reservoirArray;
}