diff --git a/ortools/algorithms/BUILD.bazel b/ortools/algorithms/BUILD.bazel index 89721444fa..b0c3dd4e12 100644 --- a/ortools/algorithms/BUILD.bazel +++ b/ortools/algorithms/BUILD.bazel @@ -279,6 +279,33 @@ cc_test( ], ) +cc_library( + name = "space_saving_most_frequent", + hdrs = ["space_saving_most_frequent.h"], + deps = [ + "@abseil-cpp//absl/base:core_headers", + "@abseil-cpp//absl/base:nullability", + "@abseil-cpp//absl/container:flat_hash_map", + "@abseil-cpp//absl/hash", + "@abseil-cpp//absl/log:check", + ], +) + +cc_test( + name = "space_saving_most_frequent_test", + srcs = ["space_saving_most_frequent_test.cc"], + deps = [ + ":space_saving_most_frequent", + "//ortools/base:gmock_main", + "@abseil-cpp//absl/algorithm:container", + "@abseil-cpp//absl/base:nullability", + "@abseil-cpp//absl/log:check", + "@abseil-cpp//absl/random", + "@abseil-cpp//absl/random:distributions", + "@google_benchmark//:benchmark", + ], +) + cc_library( name = "sparse_permutation", srcs = ["sparse_permutation.cc"], diff --git a/ortools/algorithms/space_saving_most_frequent.h b/ortools/algorithms/space_saving_most_frequent.h new file mode 100644 index 0000000000..8cfa595de9 --- /dev/null +++ b/ortools/algorithms/space_saving_most_frequent.h @@ -0,0 +1,480 @@ +// Copyright 2010-2025 Google LLC +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_ +#define OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/hash/hash.h" +#include "absl/log/check.h" + +namespace operations_research { + +namespace ssmf_internal { + +template +class BoundedAllocator; + +template +class DoubleLinkedList; + +} // namespace ssmf_internal + +// Space-Saving is an approximate algorithm for finding the most frequent items +// in a data stream. It is conceptually very simple: we maintain a list of at +// most `storage_size` elements and the number of times each of them has been +// seen. When a new element is added and the list is full, we remove the least +// frequent item (the one with the lowest count). If there is a tie, we remove +// the oldest one. See space_saving_most_frequent_test.cc for a trivial +// implementation that yields identical results to this class but is much slower. +// +// The implementation is based on [1], which describes a way of storing the +// items so all the operations are O(1). The elements that have the same count +// (a "bucket") are stored in a doubly-linked list, ordered by the time of +// insertion. The buckets are also stored in a doubly-linked list, ordered by +// number of counts. Thus, to increment the count of an element we need to +// remove it from its bucket and add it to the next one, which is a removal and +// an inclusion in linked lists and thus takes O(1) time. +// +// [1] Graham Cormode, Marios Hadjieleftheriou. Methods for finding frequent +// items in data streams. The VLDB Journal (2010) 19: 3. +// http://dimacs.rutgers.edu/~graham/pubs/papers/freqvldbj.pdf +// +// This class is thread-compatible. +// +// TODO(user): Support move-only types. 
+// NOTE(review): the template parameter/argument lists in this part of the
+// patch were missing from the copy under review and are reconstructed from
+// usage; confirm them against the upstream file.
+template <typename T, typename Hash = absl::Hash<T>,
+          typename Eq = std::equal_to<T>>
+class SpaceSavingMostFrequent {
+ public:
+  // Creates a data structure holding at most `storage_size` elements in memory
+  // (`storage_size` must be positive). That means that elements that are added
+  // less frequently than `1/storage_size` will be ignored.
+  explicit SpaceSavingMostFrequent(int storage_size);
+
+  ~SpaceSavingMostFrequent();
+
+  // Adds `value` to the data structure. If `value` is not tracked yet and the
+  // storage is full, the oldest element of the lowest-count bucket is evicted
+  // to make room.
+  // Complexity: O(1).
+  void Add(T value);
+
+  // Removes all occurrences of `value` from the data structure. Does nothing if
+  // the element is not in the data structure.
+  // Complexity: O(1).
+  void FullyRemove(const T& value);
+
+  // Returns at most `num_samples` (element, count) pairs ordered by decreasing
+  // count; among elements with the same count, the one that entered that count
+  // bucket most recently comes first. Returns an empty vector if
+  // `num_samples <= 0`.
+  // TODO(user): Replace this by an iterator with a begin() and end().
+  std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) const;
+
+  // Removes and returns the element that GetMostFrequent(1) would report.
+  // CHECK-fails if the data structure is empty.
+  T PopMostFrequent();
+
+  // Returns the count that GetMostFrequent(1) would report, or zero if the
+  // data structure is empty.
+  int64_t CountOfMostFrequent() const;
+
+ private:
+  struct Bucket;
+
+  // The nodes of the doubly-linked list of elements for a given bucket (i.e.,
+  // sharing the same count).
+  struct Item {
+    T value;
+    Bucket* absl_nonnull bucket;
+    Item* absl_nullable next = nullptr;
+    Item* absl_nullable prev = nullptr;
+  };
+  using ItemList = ssmf_internal::DoubleLinkedList<Item>;
+
+  // A bucket of elements with the same count. They are stored in a
+  // doubly-linked list ordered by the time of insertion.
+  struct Bucket {
+    int64_t count;                         // The count of this bucket.
+    ItemList items;                        // front (oldest), back (newest).
+    Bucket* absl_nullable next = nullptr;  // Bucket with lower count.
+    Bucket* absl_nullable prev = nullptr;  // Bucket with higher count.
+  };
+  using BucketList = ssmf_internal::DoubleLinkedList<Bucket>;
+
+  // Unlinks `bucket` from `buckets_` and hands it back to the allocator if it
+  // no longer holds any item.
+  void RemoveIfEmpty(Bucket* absl_nonnull bucket) {
+    if (bucket->items.empty()) {
+      bucket_alloc_.Return(buckets_.erase(bucket));
+    }
+  }
+
+  // Returns the bucket whose count is 1, creating it at the back of `buckets_`
+  // (the lowest-count end) if it does not exist yet.
+  Bucket* absl_nonnull GetBucketForCountOne() {
+    if (!buckets_.empty() && buckets_.back()->count == 1) {
+      return buckets_.back();
+    }
+    // We need to create a new empty bucket, which will be the last one.
+    Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
+    bucket->count = 1;
+    return bucket;
+  }
+
+  const int storage_size_;
+  ssmf_internal::BoundedAllocator<Item> item_alloc_;
+  ssmf_internal::BoundedAllocator<Bucket> bucket_alloc_;
+  BucketList buckets_;  // front with highest count.
+  // Maps every tracked element to its node in one of the bucket item lists.
+  absl::flat_hash_map<T, Item*, Hash, Eq> elem_to_item_;
+};
+
+template <typename T, typename Hash, typename Eq>
+SpaceSavingMostFrequent<T, Hash, Eq>::SpaceSavingMostFrequent(int storage_size)
+    : storage_size_(storage_size),
+      item_alloc_(storage_size),
+      bucket_alloc_(storage_size + 1) {
+  CHECK_GT(storage_size, 0);
+  elem_to_item_.reserve(storage_size + 1);
+}
+
+// Properly return all buckets and items to their allocators to ensure proper
+// destruction. When NDEBUG is defined the nodes are dropped in bulk instead of
+// being unlinked and returned one by one.
+template <typename T, typename Hash, typename Eq>
+SpaceSavingMostFrequent<T, Hash, Eq>::~SpaceSavingMostFrequent() {
+#ifdef NDEBUG
+  bucket_alloc_.DisposeAll();
+  item_alloc_.DisposeAll();
+#else
+  while (!buckets_.empty()) {
+    auto& items = buckets_.front()->items;
+    while (!items.empty()) {
+      item_alloc_.Return(items.pop_front());
+    }
+    bucket_alloc_.Return(buckets_.pop_front());
+  }
+#endif
+}
+
+template <typename T, typename Hash, typename Eq>
+void SpaceSavingMostFrequent<T, Hash, Eq>::Add(T value) {
+  if (buckets_.empty()) {
+    // We are adding an element to an empty data structure.
+    DCHECK(item_alloc_.empty());
+    DCHECK(elem_to_item_.empty());
+    Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
+    Item* absl_nonnull const item =
+        bucket->items.insert_front(item_alloc_.New());
+    item->bucket = bucket;
+    item->value = value;
+    bucket->count = 1;
+    elem_to_item_.emplace(value, item);
+    return;
+  }
+
+  DCHECK(!buckets_.empty());
+
+  auto [it, inserted] = elem_to_item_.try_emplace(value);
+  if (inserted) {
+    // We are adding a new element. First, check if we are full, and if so,
+    // remove the least frequent element.
+    if (item_alloc_.full()) {
+      // Remove an entry from the last bucket where the `count` is lowest.
+      Bucket* absl_nonnull const last_bucket = buckets_.back();
+      // We want to remove the oldest one, with the idea that it is potentially
+      // the real least frequent of the bucket since it was unseen for longer.
+      Item* absl_nonnull recycled_item = last_bucket->items.front();
+      // Reclaim its storage for the newly added element.
+      elem_to_item_.erase(recycled_item->value);
+      item_alloc_.Return(last_bucket->items.pop_front());
+      RemoveIfEmpty(last_bucket);
+    }
+    Bucket* absl_nonnull bucket = GetBucketForCountOne();
+    DCHECK_EQ(bucket->count, 1);
+    Item* absl_nonnull item = bucket->items.insert_back(item_alloc_.New());
+    item->value = value;
+    item->bucket = bucket;
+    it->second = item;  // set item pointer back in map.
+  } else {
+    Item* absl_nonnull item = it->second;
+    Bucket* absl_nonnull bucket = item->bucket;
+    ItemList& current_bucket_items = bucket->items;
+    const int64_t new_count = bucket->count + 1;
+    const bool no_bucket_for_new_count =
+        (bucket->prev == nullptr) || (bucket->prev->count > new_count);
+    if (no_bucket_for_new_count && current_bucket_items.single()) {
+      // Small optimization for very common elements: if the element is alone in
+      // a bucket and there is no bucket for count + 1, we can just increment
+      // the count of the bucket.
+      bucket->count = new_count;
+      return;
+    }
+    // Extract item from this bucket.
+    auto dangling_item = current_bucket_items.erase(item);
+    // Fetch the bucket with the correct count.
+    Bucket* new_bucket = nullptr;
+    if (bucket->prev && bucket->prev->count == new_count) {
+      new_bucket = bucket->prev;
+    } else {
+      // We create a new empty bucket, which will be before the current bucket.
+      new_bucket = buckets_.insert_before(bucket, bucket_alloc_.New());
+      new_bucket->count = new_count;
+    }
+    // Insert the item in the new bucket at the end (newest).
+    dangling_item->bucket = new_bucket;
+    new_bucket->items.insert_back(std::move(dangling_item));
+
+    // Reclaim old bucket if it is empty.
+    RemoveIfEmpty(bucket);
+  }
+}
+
+template <typename T, typename Hash, typename Eq>
+void SpaceSavingMostFrequent<T, Hash, Eq>::FullyRemove(const T& value) {
+  auto it = elem_to_item_.find(value);
+  if (it == elem_to_item_.end()) return;
+  Item* absl_nonnull item = it->second;
+  Bucket* absl_nonnull bucket = item->bucket;
+  item_alloc_.Return(bucket->items.erase(item));
+  RemoveIfEmpty(bucket);
+  elem_to_item_.erase(it);
+}
+
+template <typename T, typename Hash, typename Eq>
+std::vector<std::pair<T, int64_t>>
+SpaceSavingMostFrequent<T, Hash, Eq>::GetMostFrequent(int num_samples) const {
+  std::vector<std::pair<T, int64_t>> result;
+  // A negative request would otherwise be converted to a huge unsigned value
+  // by reserve() and by the size comparison below.
+  if (num_samples <= 0 || buckets_.empty()) return result;
+  // Never reserve (or return) more than what is actually stored.
+  const size_t requested = static_cast<size_t>(num_samples);
+  const size_t stored = elem_to_item_.size();
+  const size_t limit = requested < stored ? requested : stored;
+  result.reserve(limit);
+  // Buckets are visited from the highest count to the lowest one.
+  for (Bucket* bucket = buckets_.front(); bucket; bucket = bucket->next) {
+    const int64_t count = bucket->count;
+    DCHECK(!bucket->items.empty());
+    // Within a bucket, items are visited from the newest to the oldest.
+    for (Item* item = bucket->items.back(); item; item = item->prev) {
+      if (result.size() == limit) return result;
+      result.emplace_back(item->value, count);
+    }
+  }
+  return result;
+}
+
+template <typename T, typename Hash, typename Eq>
+T SpaceSavingMostFrequent<T, Hash, Eq>::PopMostFrequent() {
+  CHECK(!buckets_.empty());
+  // Copy the value out first: FullyRemove() hands the item back to the
+  // allocator, which resets it. The copy is not `const` so that it can be
+  // moved out on return.
+  T value = buckets_.front()->items.back()->value;
+  FullyRemove(value);
+  return value;
+}
+
+template <typename T, typename Hash, typename Eq>
+int64_t SpaceSavingMostFrequent<T, Hash, Eq>::CountOfMostFrequent() const {
+  return buckets_.empty() ? 0 : buckets_.front()->count;
+}
+
+namespace ssmf_internal {
+
+// This is semantically equivalent to a `std::unique_ptr` except that it does
+// not own the object and that only `BoundedAllocator` and `DoubleLinkedList`
+// are able to manage the stored pointer.
+// Other clients can access the object with the guarantee that it is non null.
+template <typename T>
+class Ptr {
+  explicit Ptr(T* absl_nullable ptr) : ptr_(ptr) { get_nonnull(); }
+
+  T* absl_nonnull get_nonnull() const {
+    DCHECK(ptr_ != nullptr);
+    return ptr_;
+  }
+
+  // Gives up the stored pointer; `*this` must be holding one.
+  T* absl_nonnull release() {
+    T* absl_nonnull ptr = get_nonnull();
+    ptr_ = nullptr;
+    return ptr;
+  }
+
+  T* absl_nullable ptr_ = nullptr;
+
+ public:
+  Ptr(const Ptr&) = delete;
+  Ptr(Ptr&& other) : ptr_(other.release()) {}
+  // The pointer must have been handed over to a list or an allocator.
+  ~Ptr() { DCHECK(ptr_ == nullptr); }
+  T* absl_nonnull operator->() const { return get_nonnull(); }
+  T& operator*() const { return *get_nonnull(); }
+  friend class BoundedAllocator<T>;
+  friend class DoubleLinkedList<T>;
+};
+
+// Allocator that allows creating up to `max_size` objects.
+// Storage is allocated at once contiguously which helps with cache locality.
+// Objects that are returned to the allocator are stored in a freelist for later
+// use and are not destroyed right away. Objects that are returned to the
+// freelist are reset to a default-constructed value for correctness.
+// The allocator makes sure that all created objects are returned to the pool
+// upon destruction, which helps catch logic errors. It is possible to bypass
+// this behavior when it is safe to destroy all objects at once by calling the
+// `DisposeAll` method. Once this method is called the allocator cannot be used
+// anymore.
+// NOTE(review): the template parameter/argument lists of this class were
+// missing from the copy of the patch under review and are reconstructed from
+// usage; confirm them against the upstream file.
+template <typename T>
+class BoundedAllocator {
+ public:
+  // Allocates storage for `max_size` default-constructed objects, all of which
+  // start out available.
+  explicit BoundedAllocator(size_t max_size) : data_(max_size) {
+    freelist_.reserve(max_size);
+    for (auto& data : data_) {
+      freelist_.push_back(&data);
+    }
+  }
+
+  // Not copyable: a copy's `freelist_` would still point into the storage of
+  // the allocator it was copied from.
+  BoundedAllocator(const BoundedAllocator&) = delete;
+  BoundedAllocator& operator=(const BoundedAllocator&) = delete;
+
+  ~BoundedAllocator() {
+    CHECK(empty()) << "some elements are not returned and won't be destroyed.";
+  }
+
+  // True when no more object can be handed out by `New`.
+  bool full() const { return freelist_.empty(); }
+
+  // True when every object is in the freelist, i.e. none is handed out.
+  bool empty() const { return data_.size() == freelist_.size(); }
+
+  // Hands out one object from the freelist. CHECK-fails if the allocator is
+  // full.
+  Ptr<T> New() {
+    CHECK(!freelist_.empty());
+    T* absl_nonnull t = freelist_.back();
+    freelist_.pop_back();
+    return Ptr<T>(t);
+  }
+
+  // Takes back an object previously obtained from `New`, resets it to a
+  // default-constructed value and makes it available again.
+  void Return(Ptr<T> ptr) {
+    T* absl_nonnull t = ptr.release();
+    DCHECK(t != nullptr);
+    // The object must live inside this allocator's storage.
+    DCHECK_GE(t, &data_.front());
+    DCHECK_LE(t, &data_.back());
+    *t = T();
+    freelist_.push_back(t);
+  }
+
+  // Destroys all allocated objects.
+  // Once called, it is no longer possible to allocate new objects.
+  void DisposeAll() {
+    freelist_.clear();
+    data_.clear();
+  }
+
+ private:
+  std::vector<T> data_;       // Contiguous storage for all the objects.
+  std::vector<T*> freelist_;  // Objects of `data_` that are available.
+};
+
+// A simple doubly linked list with ownership transfer.
+// All elements are added to or extracted from the list through the `Ptr`
+// abstraction. This guarantees that there is always only one owner.
+// NOTE(review): the template parameter/argument lists of this class were
+// missing from the copy of the patch under review and are reconstructed from
+// usage; confirm them against the upstream file.
+//
+// `T` must expose nullable `T* next` and `T* prev` members, which the list
+// uses as its links.
+template <typename T>
+class DoubleLinkedList {
+ public:
+  DoubleLinkedList() = default;
+  // Copy construction is disabled. Copy assignment is left implicitly
+  // available because `BoundedAllocator::Return` resets recycled objects with
+  // `*t = T()`, and buckets embed a list.
+  DoubleLinkedList(const DoubleLinkedList&) = delete;
+  // The list must have been emptied before it is destroyed.
+  ~DoubleLinkedList() {
+    DCHECK_EQ(front_, nullptr);
+    DCHECK_EQ(back_, nullptr);
+  }
+
+  bool empty() const {
+    DCHECK_EQ(front_ == nullptr, back_ == nullptr);
+    return front_ == nullptr;
+  }
+
+  // True when the list holds exactly one node.
+  bool single() const {
+    DCHECK_EQ(front_ == nullptr, back_ == nullptr);
+    return front_ != nullptr && front_ == back_;
+  }
+
+  // First node; the list must not be empty.
+  T* absl_nonnull front() const {
+    DCHECK_NE(front_, nullptr);
+    return front_;
+  }
+
+  // Last node; the list must not be empty.
+  T* absl_nonnull back() const {
+    DCHECK_NE(back_, nullptr);
+    return back_;
+  }
+
+  // Links `new_node` right after `node`, which must belong to this list, and
+  // returns the inserted node.
+  T* absl_nonnull insert_after(T* absl_nonnull node, Ptr<T> new_node) {
+    T* absl_nonnull new_node_ptr = new_node.release();
+    new_node_ptr->prev = node;
+    if (node->next == nullptr) {
+      // `node` was the last node: the new node becomes the back.
+      DCHECK_EQ(new_node_ptr->next, nullptr);
+      back_ = new_node_ptr;
+    } else {
+      new_node_ptr->next = node->next;
+      node->next->prev = new_node_ptr;
+    }
+    node->next = new_node_ptr;
+    return new_node_ptr;
+  }
+
+  // Links `new_node` right before `node`, which must belong to this list, and
+  // returns the inserted node.
+  T* absl_nonnull insert_before(T* absl_nonnull node, Ptr<T> new_node) {
+    T* absl_nonnull new_node_ptr = new_node.release();
+    new_node_ptr->next = node;
+    if (node->prev == nullptr) {
+      // `node` was the first node: the new node becomes the front.
+      DCHECK_EQ(new_node_ptr->prev, nullptr);
+      front_ = new_node_ptr;
+    } else {
+      new_node_ptr->prev = node->prev;
+      node->prev->next = new_node_ptr;
+    }
+    node->prev = new_node_ptr;
+    return new_node_ptr;
+  }
+
+  T* absl_nonnull insert_front(Ptr<T> new_node) {
+    if (front_ != nullptr) {
+      return insert_before(front_, std::move(new_node));
+    }
+    // The list is empty: the new node is both the front and the back.
+    T* absl_nonnull new_node_ptr = new_node.release();
+    front_ = new_node_ptr;
+    back_ = new_node_ptr;
+    new_node_ptr->prev = nullptr;
+    new_node_ptr->next = nullptr;
+    return new_node_ptr;
+  }
+
+  T* absl_nonnull insert_back(Ptr<T> new_node) {
+    if (back_ != nullptr) {
+      return insert_after(back_, std::move(new_node));
+    } else {
+      return insert_front(std::move(new_node));
+    }
+  }
+
+  // Unlinks `node`, which must belong to this list, and transfers it to the
+  // caller with its links cleared.
+  ABSL_MUST_USE_RESULT Ptr<T> erase(T* absl_nonnull node) {
+    if (node->prev) {
+      node->prev->next = node->next;
+    } else {
+      front_ = node->next;
+    }
+    if (node->next) {
+      node->next->prev = node->prev;
+    } else {
+      back_ = node->prev;
+    }
+    node->next = nullptr;
+    node->prev = nullptr;
+    return Ptr<T>(node);
+  }
+
+  ABSL_MUST_USE_RESULT Ptr<T> pop_front() { return erase(front()); }
+
+  ABSL_MUST_USE_RESULT Ptr<T> pop_back() { return erase(back()); }
+
+ private:
+  T* absl_nullable front_ = nullptr;
+  T* absl_nullable back_ = nullptr;
+};
+}  // namespace ssmf_internal
+
+}  // namespace operations_research
+
+#endif  // OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
diff --git a/ortools/algorithms/space_saving_most_frequent_test.cc b/ortools/algorithms/space_saving_most_frequent_test.cc
new file mode 100644
index 0000000000..157da6b540
--- /dev/null
+++ b/ortools/algorithms/space_saving_most_frequent_test.cc
@@ -0,0 +1,475 @@
+// Copyright 2010-2025 Google LLC
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ortools/algorithms/space_saving_most_frequent.h"
+
+// NOTE(review): the standard header names below and the template
+// parameter/argument lists in this file were missing from the copy of the
+// patch under review; they are inferred from the names used in the file and
+// must be confirmed against upstream.
+#include <cstdint>
+#include <functional>
+#include <optional>
+#include <random>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/base/nullability.h"
+#include "absl/log/check.h"
+#include "absl/random/distributions.h"
+#include "absl/random/random.h"
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+#include "ortools/base/gmock.h"
+
+namespace operations_research {
+namespace {
+
+using ssmf_internal::BoundedAllocator;
+using ssmf_internal::DoubleLinkedList;
+using ssmf_internal::Ptr;
+using ::testing::ElementsAre;
+using ::testing::Pair;
+
+TEST(BoundedAllocator, Alloc) {
+  BoundedAllocator<int> allocator(1);
+  EXPECT_TRUE(allocator.empty());
+  EXPECT_FALSE(allocator.full());
+  auto p = allocator.New();
+  EXPECT_FALSE(allocator.empty());
+  EXPECT_TRUE(allocator.full());
+  *p = 42;
+  allocator.Return(std::move(p));
+  EXPECT_TRUE(allocator.empty());
+  EXPECT_FALSE(allocator.full());
+}
+
+// A recycled object comes back reset to its default value.
+TEST(BoundedAllocator, FromFreeList) {
+  BoundedAllocator<int> allocator(1);
+  auto p = allocator.New();
+  *p = 42;
+  allocator.Return(std::move(p));
+  auto q = allocator.New();
+  EXPECT_EQ(*q, 0);
+  allocator.Return(std::move(q));
+}
+
+TEST(BoundedAllocator, UnReturnedItems) {
+  ASSERT_DEATH(
+      {
+        BoundedAllocator<int> allocator(1);
+        allocator.New();  // Allocated item not Return-ed
+      },
+      "");
+}
+
+TEST(BoundedAllocator, Disposed) {
+  BoundedAllocator<int> allocator(1);
+  EXPECT_TRUE(allocator.empty());
+  EXPECT_FALSE(allocator.full());
+  allocator.DisposeAll();
+  // Allocator becomes unusable.
+  EXPECT_TRUE(allocator.empty());
+  EXPECT_TRUE(allocator.full());
+}
+
+struct Node {
+  int value = 0;
+  Node* absl_nullable next = nullptr;
+  Node* absl_nullable prev = nullptr;
+};
+
+class DoublyLinkedListTest : public ::testing::Test {
+ public:
+  DoublyLinkedListTest() : allocator_(10) {}
+
+  // Both the list and the allocator require every node to be handed back
+  // before they are destroyed.
+  void TearDown() override {
+    while (!list_.empty()) {
+      allocator_.Return(list_.pop_front());
+    }
+  }
+
+  // Returns the node values from front to back.
+  static std::vector<int> AsVector(const DoubleLinkedList<Node>& list) {
+    std::vector<int> values;
+    if (!list.empty()) {
+      for (Node* node = list.front(); node != nullptr; node = node->next) {
+        values.push_back(node->value);
+      }
+    }
+    return values;
+  }
+
+ protected:
+  BoundedAllocator<Node> allocator_;
+  DoubleLinkedList<Node> list_;
+};
+
+TEST_F(DoublyLinkedListTest, EmptyList) {
+  EXPECT_TRUE(list_.empty());
+  EXPECT_FALSE(list_.single());
+}
+
+TEST_F(DoublyLinkedListTest, InsertFront) {
+  Ptr<Node> node1 = allocator_.New();
+  node1->value = 1;
+  list_.insert_front(std::move(node1));
+  EXPECT_FALSE(list_.empty());
+  EXPECT_TRUE(list_.single());
+  EXPECT_THAT(AsVector(list_), ElementsAre(1));
+
+  Ptr<Node> node2 = allocator_.New();
+  node2->value = 2;
+  list_.insert_front(std::move(node2));
+  EXPECT_FALSE(list_.empty());
+  EXPECT_FALSE(list_.single());
+  EXPECT_THAT(AsVector(list_), ElementsAre(2, 1));
+}
+
+TEST_F(DoublyLinkedListTest, InsertBack) {
+  auto* node1 = list_.insert_back(allocator_.New());
+  node1->value = 1;
+  EXPECT_FALSE(list_.empty());
+  EXPECT_TRUE(list_.single());
+  EXPECT_THAT(AsVector(list_), ElementsAre(1));
+
+  auto* node2 = list_.insert_back(allocator_.New());
+  node2->value = 2;
+  EXPECT_FALSE(list_.empty());
+  EXPECT_FALSE(list_.single());
+  EXPECT_THAT(AsVector(list_), ElementsAre(1, 2));
+}
+
+TEST_F(DoublyLinkedListTest, InsertAfter) {
+  Node* node1 = list_.insert_back(allocator_.New());
+  node1->value = 1;
+
+  Node* node2 = list_.insert_back(allocator_.New());
+  node2->value = 2;
+
+  Node* node3 = list_.insert_after(node1, allocator_.New());
+  node3->value = 3;
+  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2));
+
+  Node* node4 = list_.insert_after(node2, allocator_.New());
+  node4->value = 4;
+  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2, 4));
+}
+
+TEST_F(DoublyLinkedListTest, InsertBefore) {
+  auto* node1 = list_.insert_back(allocator_.New());
+  node1->value = 1;
+
+  auto* node2 = list_.insert_back(allocator_.New());
+  node2->value = 2;
+
+  auto* node3 = list_.insert_before(node2, allocator_.New());
+  node3->value = 3;
+  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2));
+
+  auto* node4 = list_.insert_before(node1, allocator_.New());
+  node4->value = 4;
+  EXPECT_THAT(AsVector(list_), ElementsAre(4, 1, 3, 2));
+}
+
+TEST_F(DoublyLinkedListTest, Erase) {
+  auto* node1 = list_.insert_back(allocator_.New());
+  node1->value = 1;
+
+  auto* node2 = list_.insert_back(allocator_.New());
+  node2->value = 2;
+
+  auto* node3 = list_.insert_back(allocator_.New());
+  node3->value = 3;
+  EXPECT_THAT(AsVector(list_), ElementsAre(1, 2, 3));
+
+  allocator_.Return(list_.erase(node2));
+  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3));
+
+  allocator_.Return(list_.erase(node1));
+  EXPECT_THAT(AsVector(list_), ElementsAre(3));
+  EXPECT_TRUE(list_.single());
+
+  allocator_.Return(list_.erase(node3));
+  EXPECT_THAT(AsVector(list_), ElementsAre());
+  EXPECT_TRUE(list_.empty());
+}
+
+// Very inefficient but very simple implementation of Space-Saving. Should
+// return the same results as SpaceSavingMostFrequent.
+template <typename T>
+class SpaceSavingMostFrequentNaive {
+ public:
+  explicit SpaceSavingMostFrequentNaive(int storage_size)
+      : storage_size_(storage_size), contents_(storage_size) {
+    CHECK_GT(storage_size, 0);
+  }
+
+  void Add(T value) {
+    ++current_timestamp_;
+
+    // If the value is already in the list, update its count and timestamp.
+    for (auto& item : contents_) {
+      if (item.value == value) {
+        item.IncrementAndUpdate(current_timestamp_);
+        return;
+      }
+    }
+
+    // Otherwise, replace the least frequent item with the new one. Unused
+    // slots have a count of -1 and are therefore picked first.
+    auto it2 = absl::c_min_element(contents_);
+    *it2 = ItemCountAndTimestamp{
+        .count = 1, .timestamp = current_timestamp_, .value = value};
+  }
+
+  void FullyRemove(T value) {
+    for (auto& item : contents_) {
+      if (item.value == value) {
+        item.Clear();
+        return;
+      }
+    }
+  }
+
+  std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) {
+    std::vector<std::pair<T, int64_t>> result;
+
+    // Highest count first, then most recent timestamp first.
+    absl::c_sort(contents_, std::greater());
+    for (const auto& [count, timestamp, value] : contents_) {
+      if (count < 0) break;  // Only unused slots remain.
+      // The explicit conversion spells out what the previous implicit
+      // signed/unsigned comparison did.
+      if (result.size() == static_cast<size_t>(num_samples)) break;
+      result.push_back({*value, count});
+    }
+    return result;
+  }
+
+ private:
+  int storage_size_;
+  int64_t current_timestamp_ = 0;
+
+  // One storage slot; a slot with `count == -1` is unused.
+  struct ItemCountAndTimestamp {
+    int64_t count = -1;
+    int64_t timestamp = -1;
+    std::optional<T> value;
+
+    void IncrementAndUpdate(int64_t new_timestamp) {
+      ++count;
+      timestamp = new_timestamp;
+    }
+
+    void Clear() {
+      count = -1;
+      timestamp = -1;
+      value = std::nullopt;
+    }
+
+    auto AsTuple() const { return std::tie(count, timestamp, value); }
+
+    friend bool operator<(const ItemCountAndTimestamp& a,
+                          const ItemCountAndTimestamp& b) {
+      return a.AsTuple() < b.AsTuple();
+    }
+
+    friend bool operator>(const ItemCountAndTimestamp& a,
+                          const ItemCountAndTimestamp& b) {
+      return a.AsTuple() > b.AsTuple();
+    }
+  };
+
+  std::vector<ItemCountAndTimestamp> contents_;
+};
+
+// Drives the real and the naive implementations in lockstep so that their
+// results can be compared.
+template <typename T>
+struct Implementations {
+  explicit Implementations(int storage_size)
+      : impl(storage_size), naive(storage_size) {}
+
+  void Add(T value) {
+    impl.Add(value);
+    naive.Add(value);
+  }
+  void FullyRemove(T value) {
+    impl.FullyRemove(value);
+    naive.FullyRemove(value);
+  }
+
+  std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) {
+    const std::vector<std::pair<T, int64_t>> impl_result =
+        impl.GetMostFrequent(num_samples);
+    const std::vector<std::pair<T, int64_t>> naive_result =
+        naive.GetMostFrequent(num_samples);
+    EXPECT_THAT(impl_result, naive_result);
+    return impl_result;
+  }
+
+  void CheckIdenticalResults(int num_samples) {
+    CHECK_EQ(impl.GetMostFrequent(num_samples),
+             naive.GetMostFrequent(num_samples));
+  }
+
+  SpaceSavingMostFrequent<T> impl;
+  SpaceSavingMostFrequentNaive<T> naive;
+};
+
+TEST(SpaceSavingMostFrequent, SimpleExamples) {
+  Implementations<std::string> most_frequent(5);
+
+  most_frequent.Add("a");  // 1 : a
+  most_frequent.Add("b");  // 1 : a, b
+  most_frequent.Add("c");  // 1 : a, b, c
+  most_frequent.Add("d");  // 1 : a, b, c, d
+  most_frequent.Add("e");  // 1 : a, b, c, d, e
+  most_frequent.Add("a");  // 2 : a | 1 : b, c, d, e
+  most_frequent.Add("a");  // 3 : a | 1 : b, c, d, e
+  most_frequent.Add("a");  // 4 : a | 1 : b, c, d, e
+  most_frequent.Add("b");  // 4 : a | 2 : b | 1 : c, d, e
+  most_frequent.Add("c");  // 4 : a | 2 : b, c | 1 : d, e
+  most_frequent.Add("d");  // 4 : a | 2 : b, c, d | 1 : e
+  most_frequent.Add("e");  // 4 : a | 2 : b, c, d, e
+
+  // Eviction starts.
+  most_frequent.Add("f");  // 4 : a | 2 : c, d, e | 1 : f (b was evicted)
+  most_frequent.Add("g");  // 4 : a | 2 : c, d, e | 1 : g (f was evicted)
+  most_frequent.Add("h");  // 4 : a | 2 : c, d, e | 1 : h (g was evicted)
+  most_frequent.Add("i");  // 4 : a | 2 : c, d, e | 1 : i (h was evicted)
+  most_frequent.Add("j");  // 4 : a | 2 : c, d, e | 1 : j (i was evicted)
+  most_frequent.Add("k");  // 4 : a | 2 : c, d, e | 1 : k (j was evicted)
+  most_frequent.Add("l");  // 4 : a | 2 : c, d, e | 1 : l (k was evicted)
+  most_frequent.Add("m");  // 4 : a | 2 : c, d, e | 1 : m (l was evicted)
+  most_frequent.Add("n");  // 4 : a | 2 : c, d, e | 1 : n (m was evicted)
+  most_frequent.Add("o");  // 4 : a | 2 : c, d, e | 1 : o (n was evicted)
+  most_frequent.Add("p");  // 4 : a | 2 : c, d, e | 1 : p (o was evicted)
+  most_frequent.Add("p");  // 4 : a | 2 : c, d, e, p
+  most_frequent.Add("p");  // 4 : a | 3 : p | 2 : c, d, e
+
+  EXPECT_THAT(most_frequent.GetMostFrequent(10),
+              ElementsAre(Pair("a", 4), Pair("p", 3), Pair("e", 2),
+                          Pair("d", 2), Pair("c", 2)));
+
+  most_frequent.FullyRemove("c");  // 4 : a | 3 : p | 2 : d, e
+  most_frequent.Add("f");          // 4 : a | 3 : p | 2 : d, e | 1 : f
+
+  EXPECT_THAT(most_frequent.GetMostFrequent(10),
+              ElementsAre(Pair("a", 4), Pair("p", 3), Pair("e", 2),
+                          Pair("d", 2), Pair("f", 1)));
+}
+
+TEST(SpaceSavingMostFrequent, CornerCase) {
+  Implementations<std::string> most_frequent(5);
+
+  most_frequent.Add("a");  // 1 : a
+  most_frequent.Add("b");  // 1 : a, b
+  most_frequent.Add("c");  // 1 : a, b, c
+  most_frequent.Add("d");  // 1 : a, b, c, d
+  most_frequent.Add("e");  // 1 : a, b, c, d, e
+
+  // Eviction starts: the storage is full from here on.
+  most_frequent.Add("f");  // 1 : b, c, d, e, f (a was evicted)
+  most_frequent.Add("g");  // 1 : c, d, e, f, g (b was evicted)
+  most_frequent.Add("x");  // 1 : d, e, f, g, x (c was evicted)
+  most_frequent.Add("y");  // 1 : e, f, g, x, y (d was evicted)
+
+  // Here's an example of why we should remove the oldest item in case of a
+  // tie on the frequency count: we don't want "y" to remove the "x".
+  most_frequent.Add("x");  // 2 : x | 1 : e, f, g, y
+  most_frequent.Add("y");  // 2 : x, y | 1 : e, f, g
+  most_frequent.Add("x");  // 3 : x | 2 : y | 1 : e, f, g
+  most_frequent.Add("y");  // 3 : x, y | 1 : e, f, g
+
+  EXPECT_THAT(most_frequent.GetMostFrequent(10),
+              ElementsAre(Pair("y", 3), Pair("x", 3), Pair("g", 1),
+                          Pair("f", 1), Pair("e", 1)));
+}
+
+TEST(SpaceSavingMostFrequent, RandomInstances) {
+  absl::BitGen gen;
+  static constexpr int kNumTests = 379;
+  for (int test = 0; test < kNumTests; ++test) {
+    const int num_items = absl::Uniform(gen, 0, 1000);
+    const int num_samples = absl::Uniform(gen, 0, 100);
+    const int storage_size = absl::Uniform(gen, 1, 100);
+
+    Implementations<int> most_frequent(storage_size);
+    std::vector<int> values;
+    values.reserve(num_items);
+    for (int i = 0; i < num_items; ++i) {
+      const int value = absl::Uniform(gen, 0, 1000);
+      most_frequent.Add(value);
+      // Every now and then, remove one of the currently reported elements.
+      if (absl::Bernoulli(gen, 0.1)) {
+        auto vec = most_frequent.GetMostFrequent(num_samples);
+        if (!vec.empty()) {
+          const int to_remove = absl::Uniform(gen, 0u, vec.size());
+          most_frequent.FullyRemove(vec[to_remove].first);
+        }
+      }
+      values.push_back(value);
+    }
+    most_frequent.CheckIdenticalResults(num_samples);
+  }
+}
+
+// Benchmark payload: hashing and equality only look at `value`; `zeros` pads
+// the element with `kElementSize` extra ints.
+template <int kElementSize>
+struct Element {
+  Element() = default;
+  explicit Element(int value) : value(value) {}
+  int value;
+  int zeros[kElementSize] = {};
+  template <typename H>
+  friend H AbslHashValue(H h, const Element& e) {
+    return H::combine(std::move(h), e.value);
+  }
+  friend bool operator==(const Element& a, const Element& b) {
+    return a.value == b.value;
+  }
+};
+
+// Adds `kNumInputs` pre-generated streams of `kSize` geometrically
+// distributed elements to structures with a storage size of `kCapacity`.
+template <int kSize, int kCapacity, int kElementSize>
+void BM_Add_GeometricDistributed(benchmark::State& state) {
+  using Element = Element<kElementSize>;
+  static constexpr int kNumInputs = 100;
+  absl::BitGen random;
+  std::vector<std::vector<Element>> inputs;
+  inputs.reserve(kNumInputs);
+  std::geometric_distribution<int> distribution(1.0 / kCapacity);
+  for (int i = 0; i < kNumInputs; ++i) {
+    std::vector<Element>& input = inputs.emplace_back();
+    input.reserve(kSize);
+    for (int j = 0; j < kSize; ++j) {
+      input.push_back(Element(distribution(random)));
+    }
+  }
+
+  // Start the benchmark.
+  for (auto _ : state) {
+    for (const std::vector<Element>& input : inputs) {
+      SpaceSavingMostFrequent<Element> most_frequent(kCapacity);
+      for (const Element& value : input) {
+        most_frequent.Add(value);
+      }
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * kNumInputs * kSize);
+}
+
+BENCHMARK(BM_Add_GeometricDistributed<30, 10, 0>);
+BENCHMARK(BM_Add_GeometricDistributed<100, 30, 0>);
+BENCHMARK(BM_Add_GeometricDistributed<1000, 100, 0>);
+BENCHMARK(BM_Add_GeometricDistributed<10000, 1000, 0>);
+BENCHMARK(BM_Add_GeometricDistributed<100000, 10000, 0>);
+
+BENCHMARK(BM_Add_GeometricDistributed<30, 10, 4>);
+BENCHMARK(BM_Add_GeometricDistributed<100, 30, 4>);
+BENCHMARK(BM_Add_GeometricDistributed<1000, 100, 4>);
+BENCHMARK(BM_Add_GeometricDistributed<10000, 1000, 4>);
+BENCHMARK(BM_Add_GeometricDistributed<100000, 10000, 4>);
+
+}  // namespace
+}  // namespace operations_research