Files
ortools-clone/ortools/algorithms/space_saving_most_frequent.h
2025-09-29 17:21:58 +02:00

509 lines
17 KiB
C++

// Copyright 2010-2025 Google LLC
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
#define OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>
#include "absl/base/attributes.h"
#include "absl/base/nullability.h"
#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"
#include "absl/log/check.h"
namespace operations_research {
namespace ssmf_internal {
template <typename T>
class BoundedAllocator;
template <typename T>
class DoubleLinkedList;
} // namespace ssmf_internal
// Space-Saving is an approximate algorithm for finding the most frequent items
// in a data stream. It is conceptually very simple: we maintain a list of at
// most `storage_size` elements and the number of times each of them has been
// seen. When a new element is added and the list is full, we remove the least
// frequent item (the one with the lowest count). If there is a tie, we remove
// the oldest one. See space_saving_most_frequent_test.cc for a trivial
// implementation that yield identical results to this class but is much slower.
//
// The implementation is based on [1], which describes a way of storing the
// items so all the operations are O(1). The elements that have the same count
// (a "bucket") are stored in a doubly-linked list, ordered by the time of
// insertion. The buckets are also stored in a doubly-linked list, ordered by
// number of counts. Thus, to increment the count of an element we need to
// remove it from its bucket and add it to the next one, which is a removal and
// an inclusion in linked lists and thus takes O(1) time.
//
// [1] Graham Cormode, Marios Hadjieleftheriou. Methods for finding frequent
// items in data streams. The VLDB Journal (2010) 19: 3.
// http://dimacs.rutgers.edu/~graham/pubs/papers/freqvldbj.pdf
//
// This class is thread-compatible.
template <typename T, typename Hash = absl::Hash<T>,
typename Eq = std::equal_to<T>>
class SpaceSavingMostFrequent {
public:
// Create a data structure holding at most `storage_size` elements in memory.
// That means that frequent elements that are added less frequently than
// `1/storage_size` will be ignored.
explicit SpaceSavingMostFrequent(int storage_size);
~SpaceSavingMostFrequent();
// Adds `value` to the data structure.
// Complexity: O(1).
void Add(T value);
// Removes all occurrences of `value` from the data structure. Does nothing if
// the element is not in the data structure.
// Complexity: O(1).
void FullyRemove(const T& value);
// Returns the `num_samples` most frequent elements in the data structure
// sorted by decreasing count. Note: this does not work with non-copyable
// types.
// TODO(user): Replace this by an iterator with a begin() and end().
std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) const;
// Equivalent to calling GetMostFrequent(1) and popping the first element.
T PopMostFrequent();
// Equivalent of GetMostFrequent(1).second. Returns zero if the data structure
// is empty.
int64_t CountOfMostFrequent() const;
private:
struct Bucket;
// The nodes of the doubly-linked list of elements for a given bucket (ie.,
// sharing the same count).
struct Item {
T value;
Bucket* absl_nonnull bucket;
Item* absl_nullable next = nullptr;
Item* absl_nullable prev = nullptr;
};
using ItemList = ssmf_internal::DoubleLinkedList<Item>;
// A bucket of elements with the same count. They are stored in a
// doubly-linked list ordered by the time of insertion.
struct Bucket {
int64_t count; // The count of this bucket.
ItemList items; // front (oldest), back (newest).
Bucket* absl_nullable next = nullptr; // Bucket with lower count.
Bucket* absl_nullable prev = nullptr; // Bucket with higher count.
};
using BucketList = ssmf_internal::DoubleLinkedList<Bucket>;
void RemoveIfEmpty(Bucket* absl_nonnull bucket) {
if (bucket->items.empty()) {
bucket_alloc_.Return(buckets_.erase(bucket));
}
}
void RemoveFromLinkedList(Item* absl_nonnull item) {
Bucket* absl_nonnull bucket = item->bucket;
item_alloc_.Return(bucket->items.erase(item));
RemoveIfEmpty(bucket);
}
Bucket* absl_nonnull GetBucketForCountOne() {
if (!buckets_.empty() && buckets_.back()->count == 1) {
return buckets_.back();
}
// We need to create a new empty bucket, which will be the last one.
Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
bucket->count = 1;
return bucket;
}
const int storage_size_;
ssmf_internal::BoundedAllocator<Item> item_alloc_;
ssmf_internal::BoundedAllocator<Bucket> bucket_alloc_;
BucketList buckets_; // front with highest count.
struct HashItemPtr {
using is_transparent = void;
size_t operator()(const Item* absl_nonnull value) const {
return Hash()(value->value);
}
size_t operator()(const T& value) const { return Hash()(value); }
};
struct EqItemPtr {
using is_transparent = void;
bool operator()(const Item* absl_nonnull a,
const Item* absl_nonnull b) const {
return Eq()(a->value, b->value);
}
bool operator()(const Item* absl_nonnull a, const T& b) const {
return Eq()(a->value, b);
}
bool operator()(const T& a, const Item* absl_nonnull b) const {
return Eq()(a, b->value);
}
};
absl::flat_hash_set<Item* absl_nonnull, HashItemPtr, EqItemPtr> item_ptr_set_;
};
template <typename T, typename Hash, typename Eq>
SpaceSavingMostFrequent<T, Hash, Eq>::SpaceSavingMostFrequent(int storage_size)
: storage_size_(storage_size),
item_alloc_(storage_size),
bucket_alloc_(storage_size + 1) {
CHECK_GT(storage_size, 0);
item_ptr_set_.reserve(2 * storage_size);
}
// Properly return all buckets and items to their allocators to ensure proper
// destruction.
template <typename T, typename Hash, typename Eq>
SpaceSavingMostFrequent<T, Hash, Eq>::~SpaceSavingMostFrequent() {
#ifdef NDEBUG
bucket_alloc_.DisposeAll();
item_alloc_.DisposeAll();
#else
while (!buckets_.empty()) {
auto& items = buckets_.front()->items;
while (!items.empty()) {
item_alloc_.Return(items.pop_front());
}
bucket_alloc_.Return(buckets_.pop_front());
}
#endif
}
template <typename T, typename Hash, typename Eq>
void SpaceSavingMostFrequent<T, Hash, Eq>::Add(T value) {
if (buckets_.empty()) {
// We are adding an element to an empty data structure.
DCHECK(item_alloc_.empty());
DCHECK(item_ptr_set_.empty());
Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
Item* absl_nonnull const item =
bucket->items.insert_front(item_alloc_.New());
item->bucket = bucket;
item->value = std::move(value);
bucket->count = 1;
item_ptr_set_.emplace(item);
return;
}
DCHECK(!buckets_.empty());
auto it = item_ptr_set_.find(value);
if (it == item_ptr_set_.end()) {
// We are adding a new element. First, check if we are full, and if so,
// remove the least frequent element.
if (item_alloc_.full()) {
// Remove an entry from the last bucket where the `count` is lowest.
Bucket* absl_nonnull const last_bucket = buckets_.back();
// We want to remove the oldest one, with the idea that it is potentially
// the real least frequent of the bucket since it was unseen for longer.
Item* absl_nonnull recycled_item = last_bucket->items.front();
// Reclaim its storage for the newly added element.
item_ptr_set_.erase(recycled_item);
item_alloc_.Return(last_bucket->items.pop_front());
RemoveIfEmpty(last_bucket);
}
Bucket* absl_nonnull bucket = GetBucketForCountOne();
DCHECK_EQ(bucket->count, 1);
Item* absl_nonnull item = bucket->items.insert_back(item_alloc_.New());
item->value = std::move(value);
item->bucket = bucket;
item_ptr_set_.emplace_hint(it, item);
} else {
Item* absl_nonnull item = *it;
Bucket* absl_nonnull bucket = item->bucket;
ItemList& current_bucket_items = bucket->items;
const int64_t new_count = bucket->count + 1;
const bool no_bucket_for_new_count =
(bucket->prev == nullptr) || (bucket->prev->count > new_count);
if (no_bucket_for_new_count && current_bucket_items.single()) {
// Small optimization for very common elements: if the element is alone in
// a bucket and there is no bucket for count + 1, we can just increment
// the count of the bucket.
bucket->count = new_count;
return;
}
// Extract item from this bucket.
auto dangling_item = current_bucket_items.erase(item);
// Fetch the bucket with the correct count.
Bucket* new_bucket = nullptr;
if (bucket->prev && bucket->prev->count == new_count) {
new_bucket = bucket->prev;
} else {
// We create a new empty bucket, which will be before the current bucket.
new_bucket = buckets_.insert_before(bucket, bucket_alloc_.New());
new_bucket->count = new_count;
}
// Insert the item in the new bucket at the end (newest).
dangling_item->bucket = new_bucket;
new_bucket->items.insert_back(std::move(dangling_item));
// Reclaim old bucket if it is empty.
RemoveIfEmpty(bucket);
}
}
template <typename T, typename Hash, typename Eq>
void SpaceSavingMostFrequent<T, Hash, Eq>::FullyRemove(const T& value) {
auto node = item_ptr_set_.extract(value);
if (node.empty()) return;
RemoveFromLinkedList(node.value());
}
template <typename T, typename Hash, typename Eq>
std::vector<std::pair<T, int64_t>>
SpaceSavingMostFrequent<T, Hash, Eq>::GetMostFrequent(int num_samples) const {
std::vector<std::pair<T, int64_t>> result;
result.reserve(num_samples);
if (!buckets_.empty()) {
for (Bucket* bucket = buckets_.front(); bucket; bucket = bucket->next) {
const int64_t count = bucket->count;
DCHECK(!bucket->items.empty());
for (Item* item = bucket->items.back(); item; item = item->prev) {
if (result.size() == num_samples) return result;
result.emplace_back(item->value, count);
}
}
}
return result;
}
template <typename T, typename Hash, typename Eq>
T SpaceSavingMostFrequent<T, Hash, Eq>::PopMostFrequent() {
CHECK(!buckets_.empty());
Item* absl_nonnull item = buckets_.front()->items.back();
DCHECK(item_ptr_set_.contains(item));
item_ptr_set_.erase(item);
T value = std::move(item->value);
RemoveFromLinkedList(item);
return value;
}
template <typename T, typename Hash, typename Eq>
int64_t SpaceSavingMostFrequent<T, Hash, Eq>::CountOfMostFrequent() const {
return buckets_.empty() ? 0 : buckets_.front()->count;
}
namespace ssmf_internal {
// This is semantically equivalent to a `std::unique_ptr` except that it does
// not own the object and that only `BoundedAllocator` and `DoubleLinkedList`
// are able to manage the stored pointer.
// Other clients can access the object with the guarantee that it is non null.
template <typename T>
class Ptr {
explicit Ptr(T* absl_nullable ptr) : ptr_(ptr) { get_nonnull(); }
T* absl_nonnull get_nonnull() const {
DCHECK(ptr_ != nullptr);
return ptr_;
};
T* absl_nonnull release() {
T* absl_nonnull ptr = get_nonnull();
ptr_ = nullptr;
return ptr;
}
T* absl_nullable ptr_ = nullptr;
public:
Ptr(const Ptr&) = delete;
Ptr(Ptr&& other) : ptr_(other.release()) {}
~Ptr() { DCHECK(ptr_ == nullptr); }
T* absl_nonnull operator->() const { return get_nonnull(); };
T& operator*() const { return *get_nonnull(); };
friend class BoundedAllocator<T>;
friend class DoubleLinkedList<T>;
};
// Allocator that allows creating up to `max_size` objects.
// Storage is allocated at once contiguously which helps with cache locality.
// Objects that are returned to the allocator are stored in a freelist for later
// use and are not destroyed right away. Objects that are extracted from the
// freelist are default initialized for correctness.
// The allocator makes sure that all created objects are returned to the pool
// upon destruction, this allows to catch logic errors. It is possible to bypass
// this behavior when it is safe to destroy all object at once by calling the
// `DisposeAll` method. Once this method is called the allocator cannot be used
// anymore.
template <typename T>
class BoundedAllocator {
public:
explicit BoundedAllocator(size_t max_size) : data_(max_size) {
freelist_.reserve(max_size);
for (auto& data : data_) {
freelist_.push_back(&data);
}
}
~BoundedAllocator() {
CHECK(empty()) << "some elements are not returned and won't be destroyed.";
}
bool full() const { return freelist_.empty(); }
bool empty() const { return data_.size() == freelist_.size(); }
Ptr<T> New() {
CHECK(!freelist_.empty());
T* absl_nonnull t = freelist_.back();
freelist_.pop_back();
return Ptr<T>(t);
}
void Return(Ptr<T> ptr) {
T* absl_nonnull t = ptr.release();
DCHECK(t != nullptr);
DCHECK_GE(t, &data_.front());
DCHECK_LE(t, &data_.back());
*t = T();
freelist_.push_back(t);
}
// Destroys all allocated objects.
// Once called, it is no more possible to allocate new objects.
void DisposeAll() {
freelist_.clear();
data_.clear();
}
private:
std::vector<T> data_;
std::vector<T* absl_nonnull> freelist_;
};
// A simple doubly linked list with ownership transfer.
// All elements added or extracted from the list are done though the `Ptr`
// abstraction. This guarantees that there is always only one owner.
template <typename T>
class DoubleLinkedList {
public:
DoubleLinkedList() = default;
DoubleLinkedList(const DoubleLinkedList&) = delete;
~DoubleLinkedList() {
DCHECK_EQ(front_, nullptr);
DCHECK_EQ(front_, nullptr);
}
bool empty() const {
DCHECK_EQ(front_ == nullptr, back_ == nullptr);
return front_ == nullptr;
}
bool single() const {
DCHECK_EQ(front_ == nullptr, back_ == nullptr);
return front_ != nullptr && front_ == back_;
}
T* absl_nonnull front() const {
DCHECK_NE(front_, nullptr);
return front_;
}
T* absl_nonnull back() const {
DCHECK_NE(back_, nullptr);
return back_;
}
T* absl_nonnull insert_after(T* absl_nonnull node, Ptr<T> new_node) {
T* absl_nonnull new_node_ptr = new_node.release();
new_node_ptr->prev = node;
if (node->next == nullptr) {
DCHECK_EQ(new_node_ptr->next, nullptr);
back_ = new_node_ptr;
} else {
new_node_ptr->next = node->next;
node->next->prev = new_node_ptr;
}
node->next = new_node_ptr;
return new_node_ptr;
}
T* absl_nonnull insert_before(T* absl_nonnull node, Ptr<T> new_node) {
T* absl_nonnull new_node_ptr = new_node.release();
new_node_ptr->next = node;
if (node->prev == nullptr) {
DCHECK_EQ(new_node_ptr->prev, nullptr);
front_ = new_node_ptr;
} else {
new_node_ptr->prev = node->prev;
node->prev->next = new_node_ptr;
}
node->prev = new_node_ptr;
return new_node_ptr;
}
T* absl_nonnull insert_front(Ptr<T> new_node) {
if (front_ != nullptr) {
return insert_before(front_, std::move(new_node));
}
T* absl_nonnull new_node_ptr = new_node.release();
front_ = new_node_ptr;
back_ = new_node_ptr;
new_node_ptr->prev = nullptr;
new_node_ptr->next = nullptr;
return new_node_ptr;
}
T* absl_nonnull insert_back(Ptr<T> new_node) {
if (back_ != nullptr) {
return insert_after(back_, std::move(new_node));
} else {
return insert_front(std::move(new_node));
}
}
ABSL_MUST_USE_RESULT Ptr<T> erase(T* absl_nonnull node) {
if (node->prev) {
node->prev->next = node->next;
} else {
front_ = node->next;
}
if (node->next) {
node->next->prev = node->prev;
} else {
back_ = node->prev;
}
node->next = nullptr;
node->prev = nullptr;
return Ptr<T>(node);
}
ABSL_MUST_USE_RESULT Ptr<T> pop_front() { return erase(front()); }
ABSL_MUST_USE_RESULT Ptr<T> pop_back() { return erase(back()); }
private:
T* absl_nullable front_ = nullptr;
T* absl_nullable back_ = nullptr;
};
} // namespace ssmf_internal
} // namespace operations_research
#endif // OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_