Add space_saving_most_frequent util

This commit is contained in:
Guillaume Chatelet
2025-09-24 15:14:58 +00:00
committed by Corentin Le Molgat
parent 076468db33
commit 81cf14db67
3 changed files with 982 additions and 0 deletions

View File

@@ -279,6 +279,33 @@ cc_test(
],
)
# Header-only library (no `srcs`) exposing space_saving_most_frequent.h.
# The deps mirror the Abseil headers included by that header.
cc_library(
name = "space_saving_most_frequent",
hdrs = ["space_saving_most_frequent.h"],
deps = [
"@abseil-cpp//absl/base:core_headers",
"@abseil-cpp//absl/base:nullability",
"@abseil-cpp//absl/container:flat_hash_map",
"@abseil-cpp//absl/hash",
"@abseil-cpp//absl/log:check",
],
)
# Unit tests and benchmarks for :space_saving_most_frequent; the single source
# file contains both gtest cases and google_benchmark registrations.
cc_test(
name = "space_saving_most_frequent_test",
srcs = ["space_saving_most_frequent_test.cc"],
deps = [
":space_saving_most_frequent",
"//ortools/base:gmock_main",
"@abseil-cpp//absl/algorithm:container",
"@abseil-cpp//absl/base:nullability",
"@abseil-cpp//absl/log:check",
"@abseil-cpp//absl/random",
"@abseil-cpp//absl/random:distributions",
"@google_benchmark//:benchmark",
],
)
cc_library(
name = "sparse_permutation",
srcs = ["sparse_permutation.cc"],

View File

@@ -0,0 +1,480 @@
// Copyright 2010-2025 Google LLC
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
#define OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>
#include "absl/base/attributes.h"
#include "absl/base/nullability.h"
#include "absl/container/flat_hash_map.h"
#include "absl/hash/hash.h"
#include "absl/log/check.h"
namespace operations_research {
namespace ssmf_internal {
// Forward declarations of the implementation helpers. Both templates are
// defined at the end of this header; declaring them here lets
// SpaceSavingMostFrequent name them in its member types before that point.
// Fixed-capacity object pool handing out and taking back objects of type T.
template <typename T>
class BoundedAllocator;
// Intrusive doubly-linked list over nodes that carry `next`/`prev` pointers.
template <typename T>
class DoubleLinkedList;
} // namespace ssmf_internal
// Space-Saving is an approximate algorithm for finding the most frequent items
// in a data stream. It is conceptually very simple: we maintain a list of at
// most `storage_size` elements and the number of times each of them has been
// seen. When a new element is added and the list is full, we remove the least
// frequent item (the one with the lowest count). If there is a tie, we remove
// the oldest one. See space_saving_most_frequent_test.cc for a trivial
// implementation that yields identical results to this class but is much
// slower.
//
// The implementation is based on [1], which describes a way of storing the
// items so all the operations are O(1). The elements that have the same count
// (a "bucket") are stored in a doubly-linked list, ordered by the time of
// insertion. The buckets are also stored in a doubly-linked list, ordered by
// number of counts. Thus, to increment the count of an element we need to
// remove it from its bucket and add it to the next one, which is a removal and
// an inclusion in linked lists and thus takes O(1) time.
//
// [1] Graham Cormode, Marios Hadjieleftheriou. Methods for finding frequent
// items in data streams. The VLDB Journal (2010) 19: 3.
// http://dimacs.rutgers.edu/~graham/pubs/papers/freqvldbj.pdf
//
// This class is thread-compatible.
//
// TODO(user): Support move-only types.
template <typename T, typename Hash = absl::Hash<T>,
typename Eq = std::equal_to<T>>
class SpaceSavingMostFrequent {
public:
// Creates a data structure holding at most `storage_size` distinct elements
// in memory. `storage_size` must be strictly positive (CHECK-ed by the
// constructor). That means that elements that are added less frequently than
// `1/storage_size` may be evicted and thus ignored.
explicit SpaceSavingMostFrequent(int storage_size);
~SpaceSavingMostFrequent();
// Adds `value` to the data structure: increments its count if it is already
// tracked, otherwise inserts it with a count of one, evicting the oldest
// element of the lowest-count bucket first when the storage is full.
// Complexity: O(1).
void Add(T value);
// Removes all occurrences of `value` from the data structure. Does nothing if
// the element is not in the data structure.
// Complexity: O(1).
void FullyRemove(const T& value);
// Returns up to `num_samples` (element, count) pairs ordered by decreasing
// count. Elements sharing the same count are listed from the one that reached
// that count most recently to the one that reached it first.
// TODO(user): Replace this by an iterator with a begin() and end().
std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) const;
// Equivalent to calling GetMostFrequent(1) and popping the first element.
// CHECK-fails if the data structure is empty.
T PopMostFrequent();
// Equivalent of GetMostFrequent(1).second. Returns zero if the data structure
// is empty.
int64_t CountOfMostFrequent() const;
private:
struct Bucket;
// The nodes of the doubly-linked list of elements for a given bucket (ie.,
// sharing the same count).
struct Item {
// The tracked element.
T value;
// The bucket currently holding this item; it carries the item's count.
Bucket* absl_nonnull bucket;
// Intrusive links used by ItemList.
Item* absl_nullable next = nullptr;
Item* absl_nullable prev = nullptr;
};
using ItemList = ssmf_internal::DoubleLinkedList<Item>;
// A bucket of elements with the same count. They are stored in a
// doubly-linked list ordered by the time of insertion.
struct Bucket {
int64_t count; // The count of this bucket.
ItemList items; // front (oldest), back (newest).
Bucket* absl_nullable next = nullptr; // Bucket with lower count.
Bucket* absl_nullable prev = nullptr; // Bucket with higher count.
};
using BucketList = ssmf_internal::DoubleLinkedList<Bucket>;
// Unlinks `bucket` from `buckets_` and hands it back to `bucket_alloc_` when
// it no longer holds any item; otherwise does nothing.
void RemoveIfEmpty(Bucket* absl_nonnull bucket) {
if (bucket->items.empty()) {
bucket_alloc_.Return(buckets_.erase(bucket));
}
}
// Returns the bucket whose count is one, creating it at the back of
// `buckets_` (the lowest-count end) if it does not exist yet.
Bucket* absl_nonnull GetBucketForCountOne() {
if (!buckets_.empty() && buckets_.back()->count == 1) {
return buckets_.back();
}
// We need to create a new empty bucket, which will be the last one.
Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
bucket->count = 1;
return bucket;
}
// Maximum number of distinct elements tracked at any time.
const int storage_size_;
// Pools backing the items and buckets. They are declared before `buckets_`
// and are therefore constructed before it and destroyed after it.
ssmf_internal::BoundedAllocator<Item> item_alloc_;
ssmf_internal::BoundedAllocator<Bucket> bucket_alloc_;
BucketList buckets_; // front with highest count.
// Maps each tracked element to its node for O(1) lookup.
absl::flat_hash_map<T, Item* absl_nonnull, Hash, Eq> elem_to_item_;
};
template <typename T, typename Hash, typename Eq>
SpaceSavingMostFrequent<T, Hash, Eq>::SpaceSavingMostFrequent(int storage_size)
: storage_size_(storage_size),
item_alloc_(storage_size),
bucket_alloc_(storage_size + 1) {
CHECK_GT(storage_size, 0);
elem_to_item_.reserve(storage_size + 1);
}
// Tears the structure down. Builds without NDEBUG hand every item and every
// bucket back to its allocator one by one, so that the allocators and lists
// see everything returned. NDEBUG builds skip that walk and simply drop the
// storage of both allocators at once.
template <typename T, typename Hash, typename Eq>
SpaceSavingMostFrequent<T, Hash, Eq>::~SpaceSavingMostFrequent() {
#ifndef NDEBUG
  while (!buckets_.empty()) {
    ItemList& bucket_items = buckets_.front()->items;
    while (!bucket_items.empty()) {
      item_alloc_.Return(bucket_items.pop_front());
    }
    // The bucket is empty now and can go back to its pool.
    bucket_alloc_.Return(buckets_.pop_front());
  }
#else
  bucket_alloc_.DisposeAll();
  item_alloc_.DisposeAll();
#endif
}
// Records one occurrence of `value`.
//
// Three cases are handled:
//  * the structure is empty: create the count-one bucket and its first item;
//  * `value` is new: evict the oldest item of the lowest-count bucket if the
//    storage is full, then append `value` to the count-one bucket;
//  * `value` is already tracked: move its item to the bucket for count + 1.
//
// `value` is taken by value and moved on its last use, so inserting a new
// element costs one copy of `T` (the map key) plus one move.
template <typename T, typename Hash, typename Eq>
void SpaceSavingMostFrequent<T, Hash, Eq>::Add(T value) {
  if (buckets_.empty()) {
    // We are adding an element to an empty data structure.
    DCHECK(item_alloc_.empty());
    DCHECK(elem_to_item_.empty());
    Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
    Item* absl_nonnull const item =
        bucket->items.insert_front(item_alloc_.New());
    item->bucket = bucket;
    item->value = value;
    bucket->count = 1;
    // Last use of `value`: move it into the map key.
    elem_to_item_.emplace(std::move(value), item);
    return;
  }
  DCHECK(!buckets_.empty());
  auto [it, inserted] = elem_to_item_.try_emplace(value);
  if (inserted) {
    // We are adding a new element. First, check if we are full, and if so,
    // remove the least frequent element.
    if (item_alloc_.full()) {
      // Remove an entry from the last bucket where the `count` is lowest.
      Bucket* absl_nonnull const last_bucket = buckets_.back();
      // We want to remove the oldest one, with the idea that it is potentially
      // the real least frequent of the bucket since it was unseen for longer.
      Item* absl_nonnull recycled_item = last_bucket->items.front();
      // Reclaim its storage for the newly added element.
      // NOTE(review): `it` is used after this erase; this presumably relies on
      // flat_hash_map::erase leaving iterators to other elements valid.
      elem_to_item_.erase(recycled_item->value);
      item_alloc_.Return(last_bucket->items.pop_front());
      RemoveIfEmpty(last_bucket);
    }
    Bucket* absl_nonnull bucket = GetBucketForCountOne();
    DCHECK_EQ(bucket->count, 1);
    Item* absl_nonnull item = bucket->items.insert_back(item_alloc_.New());
    // The map already owns its copy of the key; this is the last use.
    item->value = std::move(value);
    item->bucket = bucket;
    it->second = item;  // set item pointer back in map.
    return;
  }

  // The element is already tracked: bump its count by one.
  Item* absl_nonnull item = it->second;
  Bucket* absl_nonnull bucket = item->bucket;
  ItemList& current_bucket_items = bucket->items;
  const int64_t new_count = bucket->count + 1;
  const bool no_bucket_for_new_count =
      (bucket->prev == nullptr) || (bucket->prev->count > new_count);
  if (no_bucket_for_new_count && current_bucket_items.single()) {
    // Small optimization for very common elements: if the element is alone in
    // a bucket and there is no bucket for count + 1, we can just increment
    // the count of the bucket.
    bucket->count = new_count;
    return;
  }
  // Extract item from this bucket.
  auto dangling_item = current_bucket_items.erase(item);
  // Fetch the bucket with the correct count.
  Bucket* new_bucket = nullptr;
  if (bucket->prev && bucket->prev->count == new_count) {
    new_bucket = bucket->prev;
  } else {
    // We create a new empty bucket, which will be before the current bucket.
    new_bucket = buckets_.insert_before(bucket, bucket_alloc_.New());
    new_bucket->count = new_count;
  }
  // Insert the item in the new bucket at the end (newest).
  dangling_item->bucket = new_bucket;
  new_bucket->items.insert_back(std::move(dangling_item));
  // Reclaim old bucket if it is empty.
  RemoveIfEmpty(bucket);
}
// Forgets `value` entirely, whatever its current count. Unknown values are
// ignored.
template <typename T, typename Hash, typename Eq>
void SpaceSavingMostFrequent<T, Hash, Eq>::FullyRemove(const T& value) {
  const auto found = elem_to_item_.find(value);
  if (found == elem_to_item_.end()) {
    return;
  }
  Item* absl_nonnull const node = found->second;
  Bucket* absl_nonnull const owner = node->bucket;
  // Unlink the node from its bucket and give it back to the item pool.
  auto detached = owner->items.erase(node);
  item_alloc_.Return(std::move(detached));
  // The bucket may have just lost its last item.
  RemoveIfEmpty(owner);
  elem_to_item_.erase(found);
}
// Returns up to `num_samples` (element, count) pairs, highest count first.
// Within a bucket, items are visited from back (newest) to front (oldest).
// A non-positive `num_samples` yields an empty vector.
//
// The reservation is capped by the number of tracked elements, so asking for
// far more samples than are stored does not allocate needlessly.
template <typename T, typename Hash, typename Eq>
std::vector<std::pair<T, int64_t>>
SpaceSavingMostFrequent<T, Hash, Eq>::GetMostFrequent(int num_samples) const {
  std::vector<std::pair<T, int64_t>> result;
  if (num_samples <= 0) return result;
  const size_t requested = static_cast<size_t>(num_samples);
  const size_t stored = elem_to_item_.size();
  const size_t limit = requested < stored ? requested : stored;
  if (limit == 0) return result;  // Nothing is tracked.
  result.reserve(limit);
  for (const Bucket* bucket = buckets_.front(); bucket != nullptr;
       bucket = bucket->next) {
    const int64_t count = bucket->count;
    DCHECK(!bucket->items.empty());
    for (const Item* item = bucket->items.back(); item != nullptr;
         item = item->prev) {
      if (result.size() == limit) return result;
      result.emplace_back(item->value, count);
    }
  }
  return result;
}
// Removes and returns the element GetMostFrequent(1) would report.
// CHECK-fails if the structure is empty.
template <typename T, typename Hash, typename Eq>
T SpaceSavingMostFrequent<T, Hash, Eq>::PopMostFrequent() {
  CHECK(!buckets_.empty());
  // The winner is the newest item of the highest-count bucket. Copy it out
  // before removing it, since FullyRemove() gives its node back to the pool.
  const Bucket* const top_bucket = buckets_.front();
  const Item* const newest = top_bucket->items.back();
  const T popped = newest->value;
  FullyRemove(popped);
  return popped;
}
// Returns the count of the front (highest-count) bucket, or zero when no
// element is tracked.
template <typename T, typename Hash, typename Eq>
int64_t SpaceSavingMostFrequent<T, Hash, Eq>::CountOfMostFrequent() const {
  if (buckets_.empty()) {
    return 0;
  }
  return buckets_.front()->count;
}
namespace ssmf_internal {
// Move-only handle to an object that is currently outside of any list or
// allocator freelist. It is semantically close to a `std::unique_ptr` except
// that it does not own the object (it never deletes it) and that only
// `BoundedAllocator` and `DoubleLinkedList` can create one or take the raw
// pointer out of it.
// Other clients can access the object with the guarantee that it is non null.
// In builds where DCHECK is active, destroying a `Ptr` that still holds a
// pointer fails: the object must first be handed to a list or an allocator.
template <typename T>
class Ptr {
 public:
  Ptr(const Ptr&) = delete;
  Ptr& operator=(const Ptr&) = delete;
  // Takes over the pointer held by `other`, which must not be empty.
  Ptr(Ptr&& other) noexcept : ptr_(other.release()) {}
  ~Ptr() { DCHECK(ptr_ == nullptr); }

  T* absl_nonnull operator->() const { return get_nonnull(); }
  T& operator*() const { return *get_nonnull(); }

 private:
  friend class BoundedAllocator<T>;
  friend class DoubleLinkedList<T>;

  // Wraps `ptr`, which must be non null (DCHECK-ed).
  explicit Ptr(T* absl_nullable ptr) : ptr_(ptr) { get_nonnull(); }

  T* absl_nonnull get_nonnull() const {
    DCHECK(ptr_ != nullptr);
    return ptr_;
  }

  // Returns the stored pointer and leaves this handle empty.
  T* absl_nonnull release() {
    T* absl_nonnull ptr = get_nonnull();
    ptr_ = nullptr;
    return ptr;
  }

  T* absl_nullable ptr_ = nullptr;
};
// Allocator that allows creating up to `max_size` objects.
// Storage is allocated at once contiguously which helps with cache locality.
// Objects that are returned to the allocator are not destroyed right away:
// they are reset to a value-initialized `T` and stored in a freelist for later
// use, so `New()` always hands out a clean object.
// The allocator makes sure that all created objects are returned to the pool
// upon destruction, this allows to catch logic errors. It is possible to bypass
// this behavior when it is safe to destroy all objects at once by calling the
// `DisposeAll` method. Once this method is called the allocator cannot be used
// anymore.
// The allocator is not copyable: the freelist stores addresses inside this
// instance's own storage, so a copy would hand out objects of the original.
template <typename T>
class BoundedAllocator {
 public:
  explicit BoundedAllocator(size_t max_size) : data_(max_size) {
    freelist_.reserve(max_size);
    for (T& slot : data_) {
      freelist_.push_back(&slot);
    }
  }
  BoundedAllocator(const BoundedAllocator&) = delete;
  BoundedAllocator& operator=(const BoundedAllocator&) = delete;
  ~BoundedAllocator() {
    CHECK(empty()) << "some elements are not returned and won't be destroyed.";
  }

  // True when no more object can be handed out.
  bool full() const { return freelist_.empty(); }
  // True when every object is in the freelist, i.e. none is handed out.
  bool empty() const { return data_.size() == freelist_.size(); }

  // Hands out an object from the freelist. CHECK-fails if the allocator is
  // full.
  Ptr<T> New() {
    CHECK(!freelist_.empty());
    T* absl_nonnull t = freelist_.back();
    freelist_.pop_back();
    return Ptr<T>(t);
  }

  // Takes back an object previously obtained from `New()` and resets it.
  void Return(Ptr<T> ptr) {
    T* absl_nonnull t = ptr.release();
    DCHECK(t != nullptr);
    // The object must live inside this allocator's storage.
    DCHECK_GE(t, &data_.front());
    DCHECK_LE(t, &data_.back());
    *t = T();
    freelist_.push_back(t);
  }

  // Destroys all allocated objects.
  // Once called, it is no more possible to allocate new objects.
  void DisposeAll() {
    freelist_.clear();
    data_.clear();
  }

 private:
  std::vector<T> data_;
  std::vector<T* absl_nonnull> freelist_;
};
// A simple intrusive doubly linked list with ownership transfer.
// `T` must expose nullable `next` and `prev` pointer members; the list only
// rewires those pointers and never allocates or destroys nodes.
// All elements added or extracted from the list are done through the `Ptr`
// abstraction. This guarantees that there is always only one owner.
// The list must be emptied before it is destroyed (DCHECK-ed).
// Copy assignment is deliberately left implicit: `BoundedAllocator::Return`
// resets pooled objects, which may embed a list, by assigning a fresh `T()`.
template <typename T>
class DoubleLinkedList {
 public:
  DoubleLinkedList() = default;
  DoubleLinkedList(const DoubleLinkedList&) = delete;
  ~DoubleLinkedList() {
    // Both ends must be cleared, not just the front.
    DCHECK_EQ(front_, nullptr);
    DCHECK_EQ(back_, nullptr);
  }

  // True when the list holds no node.
  bool empty() const {
    DCHECK_EQ(front_ == nullptr, back_ == nullptr);
    return front_ == nullptr;
  }

  // True when the list holds exactly one node.
  bool single() const {
    DCHECK_EQ(front_ == nullptr, back_ == nullptr);
    return front_ != nullptr && front_ == back_;
  }

  // First node; the list must not be empty.
  T* absl_nonnull front() const {
    DCHECK_NE(front_, nullptr);
    return front_;
  }

  // Last node; the list must not be empty.
  T* absl_nonnull back() const {
    DCHECK_NE(back_, nullptr);
    return back_;
  }

  // Links `new_node` right after `node` (which must belong to this list) and
  // returns the raw pointer to the inserted node.
  T* absl_nonnull insert_after(T* absl_nonnull node, Ptr<T> new_node) {
    T* absl_nonnull new_node_ptr = new_node.release();
    new_node_ptr->prev = node;
    if (node->next == nullptr) {
      // `node` was the last one: the new node becomes the back.
      DCHECK_EQ(new_node_ptr->next, nullptr);
      back_ = new_node_ptr;
    } else {
      new_node_ptr->next = node->next;
      node->next->prev = new_node_ptr;
    }
    node->next = new_node_ptr;
    return new_node_ptr;
  }

  // Links `new_node` right before `node` (which must belong to this list) and
  // returns the raw pointer to the inserted node.
  T* absl_nonnull insert_before(T* absl_nonnull node, Ptr<T> new_node) {
    T* absl_nonnull new_node_ptr = new_node.release();
    new_node_ptr->next = node;
    if (node->prev == nullptr) {
      // `node` was the first one: the new node becomes the front.
      DCHECK_EQ(new_node_ptr->prev, nullptr);
      front_ = new_node_ptr;
    } else {
      new_node_ptr->prev = node->prev;
      node->prev->next = new_node_ptr;
    }
    node->prev = new_node_ptr;
    return new_node_ptr;
  }

  // Makes `new_node` the first node of the list.
  T* absl_nonnull insert_front(Ptr<T> new_node) {
    if (front_ != nullptr) {
      return insert_before(front_, std::move(new_node));
    }
    // Empty list: the node is both the front and the back.
    T* absl_nonnull new_node_ptr = new_node.release();
    front_ = new_node_ptr;
    back_ = new_node_ptr;
    new_node_ptr->prev = nullptr;
    new_node_ptr->next = nullptr;
    return new_node_ptr;
  }

  // Makes `new_node` the last node of the list.
  T* absl_nonnull insert_back(Ptr<T> new_node) {
    if (back_ != nullptr) {
      return insert_after(back_, std::move(new_node));
    } else {
      return insert_front(std::move(new_node));
    }
  }

  // Unlinks `node` (which must belong to this list), clears its links and
  // transfers it to the caller.
  ABSL_MUST_USE_RESULT Ptr<T> erase(T* absl_nonnull node) {
    if (node->prev) {
      node->prev->next = node->next;
    } else {
      front_ = node->next;
    }
    if (node->next) {
      node->next->prev = node->prev;
    } else {
      back_ = node->prev;
    }
    node->next = nullptr;
    node->prev = nullptr;
    return Ptr<T>(node);
  }

  // Unlink and return the first / last node; the list must not be empty.
  ABSL_MUST_USE_RESULT Ptr<T> pop_front() { return erase(front()); }
  ABSL_MUST_USE_RESULT Ptr<T> pop_back() { return erase(back()); }

 private:
  T* absl_nullable front_ = nullptr;
  T* absl_nullable back_ = nullptr;
};
} // namespace ssmf_internal
} // namespace operations_research
#endif // OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_

View File

@@ -0,0 +1,475 @@
// Copyright 2010-2025 Google LLC
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "ortools/algorithms/space_saving_most_frequent.h"
#include <cstdint>
#include <functional>
#include <optional>
#include <random>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "absl/algorithm/container.h"
#include "absl/base/nullability.h"
#include "absl/log/check.h"
#include "absl/random/distributions.h"
#include "absl/random/random.h"
#include "benchmark/benchmark.h"
#include "gtest/gtest.h"
#include "ortools/base/gmock.h"
namespace operations_research {
namespace {
using ssmf_internal::BoundedAllocator;
using ssmf_internal::DoubleLinkedList;
using ssmf_internal::Ptr;
using ::testing::ElementsAre;
using ::testing::Pair;
// A pool of capacity one goes from empty to full when its only object is
// handed out, and back to empty once the object is returned.
TEST(BoundedAllocator, Alloc) {
  BoundedAllocator<int> pool(1);
  EXPECT_TRUE(pool.empty());
  EXPECT_FALSE(pool.full());

  auto slot = pool.New();
  EXPECT_FALSE(pool.empty());
  EXPECT_TRUE(pool.full());

  *slot = 42;
  pool.Return(std::move(slot));
  EXPECT_TRUE(pool.empty());
  EXPECT_FALSE(pool.full());
}
// An object taken again from the freelist does not keep the value written to
// it before it was returned.
TEST(BoundedAllocator, FromFreeList) {
  BoundedAllocator<int> pool(1);

  auto first_use = pool.New();
  *first_use = 42;
  pool.Return(std::move(first_use));

  auto second_use = pool.New();
  EXPECT_EQ(*second_use, 0);
  pool.Return(std::move(second_use));
}
// Dropping an allocated object without returning it to the pool is a fatal
// error.
TEST(BoundedAllocator, UnReturnedItems) {
  ASSERT_DEATH(
      {
        BoundedAllocator<int> pool(1);
        pool.New();  // Allocated item not Return-ed
      },
      "");
}
// After DisposeAll() the pool reports itself as both empty (nothing handed
// out) and full (nothing left to hand out).
TEST(BoundedAllocator, Disposed) {
  BoundedAllocator<int> pool(1);
  EXPECT_FALSE(pool.full());
  EXPECT_TRUE(pool.empty());

  pool.DisposeAll();

  // Allocator becomes unusable.
  EXPECT_TRUE(pool.full());
  EXPECT_TRUE(pool.empty());
}
// Minimal list node used to exercise DoubleLinkedList in the tests below: an
// integer payload plus the `next`/`prev` links the list manipulates.
struct Node {
// Payload checked by the tests.
int value = 0;
// Intrusive links, null while the node is not linked to a neighbor.
Node* absl_nullable next = nullptr;
Node* absl_nullable prev = nullptr;
};
class DoublyLinkedListTest : public ::testing::Test {
public:
DoublyLinkedListTest() : allocator_(10) {}
void TearDown() override {
while (!list_.empty()) {
allocator_.Return(list_.pop_front());
}
}
static std::vector<int> AsVector(const DoubleLinkedList<Node>& list) {
std::vector<int> values;
if (!list.empty()) {
for (Node* node = list.front(); node != nullptr; node = node->next) {
values.push_back(node->value);
}
}
return values;
}
protected:
BoundedAllocator<Node> allocator_;
DoubleLinkedList<Node> list_;
};
// A freshly constructed list is empty, which also means it is not "single".
TEST_F(DoublyLinkedListTest, EmptyList) {
  EXPECT_FALSE(list_.single());
  EXPECT_TRUE(list_.empty());
}
// insert_front() puts each new node ahead of the existing ones.
TEST_F(DoublyLinkedListTest, InsertFront) {
  Ptr<Node> first = allocator_.New();
  first->value = 1;
  list_.insert_front(std::move(first));
  EXPECT_FALSE(list_.empty());
  EXPECT_TRUE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(1));

  Ptr<Node> second = allocator_.New();
  second->value = 2;
  list_.insert_front(std::move(second));
  EXPECT_FALSE(list_.empty());
  EXPECT_FALSE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(2, 1));
}
// insert_back() appends each new node after the existing ones.
TEST_F(DoublyLinkedListTest, InsertBack) {
  Node* const first = list_.insert_back(allocator_.New());
  first->value = 1;
  EXPECT_FALSE(list_.empty());
  EXPECT_TRUE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(1));

  Node* const second = list_.insert_back(allocator_.New());
  second->value = 2;
  EXPECT_FALSE(list_.empty());
  EXPECT_FALSE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 2));
}
// insert_after() works both in the middle of the list and after its last
// node.
TEST_F(DoublyLinkedListTest, InsertAfter) {
  Node* const head = list_.insert_back(allocator_.New());
  head->value = 1;
  Node* const tail = list_.insert_back(allocator_.New());
  tail->value = 2;

  // Between the two existing nodes.
  Node* const middle = list_.insert_after(head, allocator_.New());
  middle->value = 3;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2));

  // After the current last node.
  Node* const new_tail = list_.insert_after(tail, allocator_.New());
  new_tail->value = 4;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2, 4));
}
// insert_before() works both in the middle of the list and before its first
// node.
TEST_F(DoublyLinkedListTest, InsertBefore) {
  Node* const head = list_.insert_back(allocator_.New());
  head->value = 1;
  Node* const tail = list_.insert_back(allocator_.New());
  tail->value = 2;

  // Between the two existing nodes.
  Node* const middle = list_.insert_before(tail, allocator_.New());
  middle->value = 3;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2));

  // Before the current first node.
  Node* const new_head = list_.insert_before(head, allocator_.New());
  new_head->value = 4;
  EXPECT_THAT(AsVector(list_), ElementsAre(4, 1, 3, 2));
}
// erase() handles a middle node, the first node and the only remaining node.
TEST_F(DoublyLinkedListTest, Erase) {
  Node* const first = list_.insert_back(allocator_.New());
  first->value = 1;
  Node* const second = list_.insert_back(allocator_.New());
  second->value = 2;
  Node* const third = list_.insert_back(allocator_.New());
  third->value = 3;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 2, 3));

  // Middle node.
  allocator_.Return(list_.erase(second));
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3));

  // First node.
  allocator_.Return(list_.erase(first));
  EXPECT_THAT(AsVector(list_), ElementsAre(3));
  EXPECT_TRUE(list_.single());

  // Only remaining node.
  allocator_.Return(list_.erase(third));
  EXPECT_THAT(AsVector(list_), ElementsAre());
  EXPECT_TRUE(list_.empty());
}
// Very inefficient but very simple implementation of Space-Saving. Should
// return the same results as SpaceSavingMostFrequent.
//
// It keeps `storage_size` slots, each holding a count, the timestamp of the
// last update and an optional value. Unused slots have a count of -1 and no
// value, which makes them the first candidates for replacement.
template <typename T>
class SpaceSavingMostFrequentNaive {
 public:
  explicit SpaceSavingMostFrequentNaive(int storage_size)
      : contents_(storage_size) {
    CHECK_GT(storage_size, 0);
  }

  void Add(T value) {
    ++current_timestamp_;
    // If the value is already in the list, update its count and timestamp.
    for (auto& item : contents_) {
      if (item.value == value) {
        item.IncrementAndUpdate(current_timestamp_);
        return;
      }
    }
    // Otherwise, replace the least frequent item with the new one. Slots are
    // ordered by (count, timestamp, value), so the minimum is an unused slot
    // if there is one, else the lowest-count slot with the oldest update.
    auto it2 = absl::c_min_element(contents_);
    *it2 = ItemCountAndTimestamp{
        .count = 1, .timestamp = current_timestamp_, .value = value};
  }

  void FullyRemove(T value) {
    for (auto& item : contents_) {
      if (item.value == value) {
        item.Clear();
        return;
      }
    }
  }

  // Returns up to `num_samples` (value, count) pairs, best first. Sorts the
  // slots in place as a side effect. A non-positive `num_samples` yields an
  // empty vector.
  std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) {
    std::vector<std::pair<T, int64_t>> result;
    absl::c_sort(contents_, std::greater<ItemCountAndTimestamp>());
    for (const ItemCountAndTimestamp& entry : contents_) {
      // Unused slots sort last; nothing useful follows the first one.
      if (entry.count < 0) break;
      if (static_cast<int64_t>(result.size()) >= num_samples) break;
      result.push_back({*entry.value, entry.count});
    }
    return result;
  }

 private:
  int64_t current_timestamp_ = 0;

  struct ItemCountAndTimestamp {
    int64_t count = -1;
    int64_t timestamp = -1;
    std::optional<T> value;

    void IncrementAndUpdate(int64_t new_timestamp) {
      ++count;
      timestamp = new_timestamp;
    }

    // Marks the slot as unused.
    void Clear() {
      count = -1;
      timestamp = -1;
      value = std::nullopt;
    }

    auto AsTuple() const { return std::tie(count, timestamp, value); }

    friend bool operator<(const ItemCountAndTimestamp& a,
                          const ItemCountAndTimestamp& b) {
      return a.AsTuple() < b.AsTuple();
    }
    friend bool operator>(const ItemCountAndTimestamp& a,
                          const ItemCountAndTimestamp& b) {
      return a.AsTuple() > b.AsTuple();
    }
  };

  std::vector<ItemCountAndTimestamp> contents_;
};
// Drives the real implementation and the naive reference side by side so
// that tests can check they always agree.
template <typename T>
struct Implementations {
  explicit Implementations(int storage_size)
      : impl(storage_size), naive(storage_size) {}

  // Forwards the addition to both implementations.
  void Add(T value) {
    impl.Add(value);
    naive.Add(value);
  }

  // Forwards the removal to both implementations.
  void FullyRemove(T value) {
    impl.FullyRemove(value);
    naive.FullyRemove(value);
  }

  // Queries both implementations, expects identical answers and returns the
  // answer of the real one.
  std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) {
    const std::vector<std::pair<T, int64_t>> from_impl =
        impl.GetMostFrequent(num_samples);
    const std::vector<std::pair<T, int64_t>> from_naive =
        naive.GetMostFrequent(num_samples);
    EXPECT_THAT(from_impl, from_naive);
    return from_impl;
  }

  // Same comparison as GetMostFrequent() but fatal on mismatch.
  void CheckIdenticalResults(int num_samples) {
    CHECK_EQ(impl.GetMostFrequent(num_samples),
             naive.GetMostFrequent(num_samples));
  }

  SpaceSavingMostFrequent<T> impl;
  SpaceSavingMostFrequentNaive<T> naive;
};
// Hand-traced scenario with a storage of five elements. The comment after
// each call shows the expected buckets as `count : elements`, highest count
// first, with the elements of a bucket listed from oldest to newest.
TEST(SpaceSavingMostFrequent, SimpleExamples) {
Implementations<std::string> most_frequent(5);
most_frequent.Add("a"); // 1 : a
most_frequent.Add("b"); // 1 : a, b
most_frequent.Add("c"); // 1 : a, b, c
most_frequent.Add("d"); // 1 : a, b, c, d
most_frequent.Add("e"); // 1 : a, b, c, d, e
most_frequent.Add("a"); // 2 : a | 1 : b, c, d, e
most_frequent.Add("a"); // 3 : a | 1 : b, c, d, e
most_frequent.Add("a"); // 4 : a | 1 : b, c, d, e
most_frequent.Add("b"); // 4 : a | 2 : b | 1 : c, d, e
most_frequent.Add("c"); // 4 : a | 2 : b, c | 1 : d, e
most_frequent.Add("d"); // 4 : a | 2 : b, c, d | 1 : e
most_frequent.Add("e"); // 4 : a | 2 : b, c, d, e
// Eviction starts.
most_frequent.Add("f"); // 4 : a | 2 : c, d, e | 1 : f (b was evicted)
most_frequent.Add("g"); // 4 : a | 2 : c, d, e | 1 : g (f was evicted)
most_frequent.Add("h"); // 4 : a | 2 : c, d, e | 1 : h (g was evicted)
most_frequent.Add("i"); // 4 : a | 2 : c, d, e | 1 : i (h was evicted)
most_frequent.Add("j"); // 4 : a | 2 : c, d, e | 1 : j (i was evicted)
most_frequent.Add("k"); // 4 : a | 2 : c, d, e | 1 : k (j was evicted)
most_frequent.Add("l"); // 4 : a | 2 : c, d, e | 1 : l (k was evicted)
most_frequent.Add("m"); // 4 : a | 2 : c, d, e | 1 : m (l was evicted)
most_frequent.Add("n"); // 4 : a | 2 : c, d, e | 1 : n (m was evicted)
most_frequent.Add("o"); // 4 : a | 2 : c, d, e | 1 : o (n was evicted)
most_frequent.Add("p"); // 4 : a | 2 : c, d, e | 1 : p (o was evicted)
most_frequent.Add("p"); // 4 : a | 2 : c, d, e, p
most_frequent.Add("p"); // 4 : a | 3 : p | 2 : c, d, e
// Within a bucket the newest element is reported first.
EXPECT_THAT(most_frequent.GetMostFrequent(10),
ElementsAre(Pair("a", 4), Pair("p", 3), Pair("e", 2),
Pair("d", 2), Pair("c", 2)));
most_frequent.FullyRemove("c"); // 4 : a | 3 : p | 2 : d, e
most_frequent.Add("f"); // 4 : a | 3 : p | 2 : d, e | 1 : f
EXPECT_THAT(most_frequent.GetMostFrequent(10),
ElementsAre(Pair("a", 4), Pair("p", 3), Pair("e", 2),
Pair("d", 2), Pair("f", 1)));
}
// Hand-traced scenario with a storage of five elements where every eviction
// happens among elements tied on a count of one. The comment after each call
// shows the expected buckets as `count : elements`, highest count first, with
// the elements of a bucket listed from oldest to newest.
TEST(SpaceSavingMostFrequent, CornerCase) {
Implementations<std::string> most_frequent(5);
most_frequent.Add("a"); // 1 : a
most_frequent.Add("b"); // 1 : a, b
most_frequent.Add("c"); // 1 : a, b, c
most_frequent.Add("d"); // 1 : a, b, c, d
most_frequent.Add("e"); // 1 : a, b, c, d, e
// Eviction starts: the storage is full, each new element evicts the oldest.
most_frequent.Add("f"); // 1 : b, c, d, e, f (a was evicted)
most_frequent.Add("g"); // 1 : c, d, e, f, g (b was evicted)
most_frequent.Add("x"); // 1 : d, e, f, g, x (c was evicted)
most_frequent.Add("y"); // 1 : e, f, g, x, y (d was evicted)
// Here's an example of why we should remove the oldest item in case of a
// tie on the frequency count: we don't want "y" to remove the "x".
most_frequent.Add("x"); // 2 : x | 1 : e, f, g, y
most_frequent.Add("y"); // 2 : x, y | 1 : e, f, g
most_frequent.Add("x"); // 3 : x | 2 : y | 1 : e, f, g
most_frequent.Add("y"); // 3 : x, y | 1 : e, f, g
EXPECT_THAT(most_frequent.GetMostFrequent(10),
ElementsAre(Pair("y", 3), Pair("x", 3), Pair("g", 1),
Pair("f", 1), Pair("e", 1)));
}
// Randomized differential test: feeds the same random stream of additions and
// occasional removals to the real and the naive implementations and checks
// that they report the same most frequent elements, both during the run
// (inside Implementations::GetMostFrequent) and at the end of each instance.
TEST(SpaceSavingMostFrequent, RandomInstances) {
  absl::BitGen gen;
  static constexpr int kNumTests = 379;
  for (int test = 0; test < kNumTests; ++test) {
    const int num_items = absl::Uniform(gen, 0, 1000);
    const int num_samples = absl::Uniform(gen, 0, 100);
    const int storage_size = absl::Uniform(gen, 1, 100);
    Implementations<int> most_frequent(storage_size);
    for (int i = 0; i < num_items; ++i) {
      const int value = absl::Uniform(gen, 0, 1000);
      most_frequent.Add(value);
      // Roughly one addition out of ten is followed by the removal of one of
      // the currently reported elements, picked uniformly.
      if (absl::Bernoulli(gen, 0.1)) {
        auto vec = most_frequent.GetMostFrequent(num_samples);
        if (!vec.empty()) {
          const int to_remove = absl::Uniform(gen, 0u, vec.size());
          most_frequent.FullyRemove(vec[to_remove].first);
        }
      }
    }
    most_frequent.CheckIdenticalResults(num_samples);
  }
}
// Benchmark payload: an integer key padded with `kElementSize` extra ints so
// that the cost of copying larger elements can be measured. Hashing and
// equality only look at `value`.
template <int kElementSize>
struct Element {
  Element() = default;
  explicit Element(int value) : value(value) {}

  // Initialized so that a default-constructed element never exposes an
  // indeterminate key to the hash or equality functions.
  int value = 0;
  int zeros[kElementSize] = {};

  template <typename H>
  friend H AbslHashValue(H h, const Element& e) {
    return H::combine(std::move(h), e.value);
  }
  friend bool operator==(const Element& a, const Element& b) {
    return a.value == b.value;
  }
};
// Benchmarks Add() on geometrically distributed keys.
//   kSize:        number of elements added to each structure.
//   kCapacity:    storage size of the structure; also sets the success
//                 probability of the distribution to 1 / kCapacity.
//   kElementSize: number of padding ints carried by each element.
// The inputs are generated once up front; each iteration replays all of them
// into freshly built structures.
template <int kSize, int kCapacity, int kElementSize>
void BM_Add_GeometricDistributed(benchmark::State& state) {
  using Payload = Element<kElementSize>;
  static constexpr int kNumInputs = 100;
  absl::BitGen random;
  std::geometric_distribution<int> distribution(1.0 / kCapacity);

  std::vector<std::vector<Payload>> inputs(kNumInputs);
  for (std::vector<Payload>& stream : inputs) {
    stream.reserve(kSize);
    for (int j = 0; j < kSize; ++j) {
      stream.emplace_back(distribution(random));
    }
  }

  // Start the benchmark.
  for (auto _ : state) {
    for (const std::vector<Payload>& stream : inputs) {
      SpaceSavingMostFrequent<Payload> most_frequent(kCapacity);
      for (const Payload& value : stream) {
        most_frequent.Add(value);
      }
    }
  }
  state.SetItemsProcessed(state.iterations() * kNumInputs * kSize);
}
// Template arguments are <kSize, kCapacity, kElementSize>. The same five
// size/capacity pairs are registered twice: first with elements carrying no
// padding, then with elements carrying four padding ints.
BENCHMARK(BM_Add_GeometricDistributed<30, 10, 0>);
BENCHMARK(BM_Add_GeometricDistributed<100, 30, 0>);
BENCHMARK(BM_Add_GeometricDistributed<1000, 100, 0>);
BENCHMARK(BM_Add_GeometricDistributed<10000, 1000, 0>);
BENCHMARK(BM_Add_GeometricDistributed<100000, 10000, 0>);
BENCHMARK(BM_Add_GeometricDistributed<30, 10, 4>);
BENCHMARK(BM_Add_GeometricDistributed<100, 30, 4>);
BENCHMARK(BM_Add_GeometricDistributed<1000, 100, 4>);
BENCHMARK(BM_Add_GeometricDistributed<10000, 1000, 4>);
BENCHMARK(BM_Add_GeometricDistributed<100000, 10000, 4>);
} // namespace
} // namespace operations_research