Add space_saving_most_frequent util
This commit is contained in:
committed by
Corentin Le Molgat
parent
076468db33
commit
81cf14db67
@@ -279,6 +279,33 @@ cc_test(
|
||||
],
|
||||
)
|
||||
|
||||
# Header-only library (no srcs): approximate "most frequent items" tracker.
# The deps mirror the Abseil headers included by space_saving_most_frequent.h.
cc_library(
|
||||
name = "space_saving_most_frequent",
|
||||
hdrs = ["space_saving_most_frequent.h"],
|
||||
deps = [
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/base:nullability",
|
||||
"@abseil-cpp//absl/container:flat_hash_map",
|
||||
"@abseil-cpp//absl/hash",
|
||||
"@abseil-cpp//absl/log:check",
|
||||
],
|
||||
)
|
||||
|
||||
# Unit tests for :space_saving_most_frequent. The benchmark dependency is
# needed because the test source includes benchmark/benchmark.h.
cc_test(
|
||||
name = "space_saving_most_frequent_test",
|
||||
srcs = ["space_saving_most_frequent_test.cc"],
|
||||
deps = [
|
||||
":space_saving_most_frequent",
|
||||
"//ortools/base:gmock_main",
|
||||
"@abseil-cpp//absl/algorithm:container",
|
||||
"@abseil-cpp//absl/base:nullability",
|
||||
"@abseil-cpp//absl/log:check",
|
||||
"@abseil-cpp//absl/random",
|
||||
"@abseil-cpp//absl/random:distributions",
|
||||
"@google_benchmark//:benchmark",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "sparse_permutation",
|
||||
srcs = ["sparse_permutation.cc"],
|
||||
|
||||
480
ortools/algorithms/space_saving_most_frequent.h
Normal file
480
ortools/algorithms/space_saving_most_frequent.h
Normal file
@@ -0,0 +1,480 @@
|
||||
// Copyright 2010-2025 Google LLC
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
|
||||
#define OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/nullability.h"
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "absl/hash/hash.h"
|
||||
#include "absl/log/check.h"
|
||||
|
||||
namespace operations_research {
|
||||
|
||||
// Forward declarations of the internal helper templates that
// SpaceSavingMostFrequent uses for its members. Both are defined further down
// in this file.
namespace ssmf_internal {
|
||||
|
||||
template <typename T>
|
||||
class BoundedAllocator;
|
||||
|
||||
template <typename T>
|
||||
class DoubleLinkedList;
|
||||
|
||||
} // namespace ssmf_internal
|
||||
|
||||
// Space-Saving is an approximate algorithm for finding the most frequent items
|
||||
// in a data stream. It is conceptually very simple: we maintain a list of at
|
||||
// most `storage_size` elements and the number of times each of them has been
|
||||
// seen. When a new element is added and the list is full, we remove the least
|
||||
// frequent item (the one with the lowest count). If there is a tie, we remove
|
||||
// the oldest one. See space_saving_most_frequent_test.cc for a trivial
|
||||
// implementation that yields identical results to this class but is much slower.
|
||||
//
|
||||
// The implementation is based on [1], which describes a way of storing the
|
||||
// items so all the operations are O(1). The elements that have the same count
|
||||
// (a "bucket") are stored in a doubly-linked list, ordered by the time of
|
||||
// insertion. The buckets are also stored in a doubly-linked list, ordered by
|
||||
// number of counts. Thus, to increment the count of an element we need to
|
||||
// remove it from its bucket and add it to the next one, which is a removal and
|
||||
// an inclusion in linked lists and thus takes O(1) time.
|
||||
//
|
||||
// [1] Graham Cormode, Marios Hadjieleftheriou. Methods for finding frequent
|
||||
// items in data streams. The VLDB Journal (2010) 19: 3.
|
||||
// http://dimacs.rutgers.edu/~graham/pubs/papers/freqvldbj.pdf
|
||||
//
|
||||
// This class is thread-compatible.
|
||||
//
|
||||
// TODO(user): Support move-only types.
|
||||
template <typename T, typename Hash = absl::Hash<T>,
|
||||
typename Eq = std::equal_to<T>>
|
||||
class SpaceSavingMostFrequent {
|
||||
public:
|
||||
// Create a data structure holding at most `storage_size` elements in memory.
|
||||
// That means that frequent elements that are added less frequently than
|
||||
// `1/storage_size` will be ignored.
|
||||
explicit SpaceSavingMostFrequent(int storage_size);
|
||||
|
||||
~SpaceSavingMostFrequent();
|
||||
|
||||
// Adds `value` to the data structure.
|
||||
// Complexity: O(1).
|
||||
void Add(T value);
|
||||
|
||||
// Removes all occurrences of `value` from the data structure. Does nothing if
|
||||
// the element is not in the data structure.
|
||||
// Complexity: O(1).
|
||||
void FullyRemove(const T& value);
|
||||
|
||||
// TODO(user): Replace this by an iterator with a begin() and end().
|
||||
std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) const;
|
||||
|
||||
// Equivalent to calling GetMostFrequent(1) and popping the first element.
|
||||
T PopMostFrequent();
|
||||
|
||||
// Equivalent of GetMostFrequent(1).second. Returns zero if the data structure
|
||||
// is empty.
|
||||
int64_t CountOfMostFrequent() const;
|
||||
|
||||
private:
|
||||
struct Bucket;
|
||||
|
||||
// The nodes of the doubly-linked list of elements for a given bucket (ie.,
|
||||
// sharing the same count).
|
||||
struct Item {
|
||||
T value;
|
||||
Bucket* absl_nonnull bucket;
|
||||
Item* absl_nullable next = nullptr;
|
||||
Item* absl_nullable prev = nullptr;
|
||||
};
|
||||
using ItemList = ssmf_internal::DoubleLinkedList<Item>;
|
||||
|
||||
// A bucket of elements with the same count. They are stored in a
|
||||
// doubly-linked list ordered by the time of insertion.
|
||||
struct Bucket {
|
||||
int64_t count; // The count of this bucket.
|
||||
ItemList items; // front (oldest), back (newest).
|
||||
Bucket* absl_nullable next = nullptr; // Bucket with lower count.
|
||||
Bucket* absl_nullable prev = nullptr; // Bucket with higher count.
|
||||
};
|
||||
using BucketList = ssmf_internal::DoubleLinkedList<Bucket>;
|
||||
|
||||
void RemoveIfEmpty(Bucket* absl_nonnull bucket) {
|
||||
if (bucket->items.empty()) {
|
||||
bucket_alloc_.Return(buckets_.erase(bucket));
|
||||
}
|
||||
}
|
||||
|
||||
Bucket* absl_nonnull GetBucketForCountOne() {
|
||||
if (!buckets_.empty() && buckets_.back()->count == 1) {
|
||||
return buckets_.back();
|
||||
}
|
||||
// We need to create a new empty bucket, which will be the last one.
|
||||
Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
|
||||
bucket->count = 1;
|
||||
return bucket;
|
||||
}
|
||||
|
||||
const int storage_size_;
|
||||
ssmf_internal::BoundedAllocator<Item> item_alloc_;
|
||||
ssmf_internal::BoundedAllocator<Bucket> bucket_alloc_;
|
||||
BucketList buckets_; // front with highest count.
|
||||
absl::flat_hash_map<T, Item* absl_nonnull, Hash, Eq> elem_to_item_;
|
||||
};
|
||||
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
SpaceSavingMostFrequent<T, Hash, Eq>::SpaceSavingMostFrequent(int storage_size)
|
||||
: storage_size_(storage_size),
|
||||
item_alloc_(storage_size),
|
||||
bucket_alloc_(storage_size + 1) {
|
||||
CHECK_GT(storage_size, 0);
|
||||
elem_to_item_.reserve(storage_size + 1);
|
||||
}
|
||||
|
||||
// Properly return all buckets and items to their allocators to ensure proper
|
||||
// destruction.
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
SpaceSavingMostFrequent<T, Hash, Eq>::~SpaceSavingMostFrequent() {
|
||||
#ifdef NDEBUG
|
||||
bucket_alloc_.DisposeAll();
|
||||
item_alloc_.DisposeAll();
|
||||
#else
|
||||
while (!buckets_.empty()) {
|
||||
auto& items = buckets_.front()->items;
|
||||
while (!items.empty()) {
|
||||
item_alloc_.Return(items.pop_front());
|
||||
}
|
||||
bucket_alloc_.Return(buckets_.pop_front());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
void SpaceSavingMostFrequent<T, Hash, Eq>::Add(T value) {
|
||||
if (buckets_.empty()) {
|
||||
// We are adding an element to an empty data structure.
|
||||
DCHECK(item_alloc_.empty());
|
||||
DCHECK(elem_to_item_.empty());
|
||||
Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
|
||||
Item* absl_nonnull const item =
|
||||
bucket->items.insert_front(item_alloc_.New());
|
||||
item->bucket = bucket;
|
||||
item->value = value;
|
||||
bucket->count = 1;
|
||||
elem_to_item_.emplace(value, item);
|
||||
return;
|
||||
}
|
||||
|
||||
DCHECK(!buckets_.empty());
|
||||
|
||||
auto [it, inserted] = elem_to_item_.try_emplace(value);
|
||||
if (inserted) {
|
||||
// We are adding a new element. First, check if we are full, and if so,
|
||||
// remove the least frequent element.
|
||||
if (item_alloc_.full()) {
|
||||
// Remove an entry from the last bucket where the `count` is lowest.
|
||||
Bucket* absl_nonnull const last_bucket = buckets_.back();
|
||||
// We want to remove the oldest one, with the idea that it is potentially
|
||||
// the real least frequent of the bucket since it was unseen for longer.
|
||||
Item* absl_nonnull recycled_item = last_bucket->items.front();
|
||||
// Reclaim its storage for the newly added element.
|
||||
elem_to_item_.erase(recycled_item->value);
|
||||
item_alloc_.Return(last_bucket->items.pop_front());
|
||||
RemoveIfEmpty(last_bucket);
|
||||
}
|
||||
Bucket* absl_nonnull bucket = GetBucketForCountOne();
|
||||
DCHECK_EQ(bucket->count, 1);
|
||||
Item* absl_nonnull item = bucket->items.insert_back(item_alloc_.New());
|
||||
item->value = value;
|
||||
item->bucket = bucket;
|
||||
it->second = item; // set item pointer back in map.
|
||||
} else {
|
||||
Item* absl_nonnull item = it->second;
|
||||
Bucket* absl_nonnull bucket = item->bucket;
|
||||
ItemList& current_bucket_items = bucket->items;
|
||||
const int64_t new_count = bucket->count + 1;
|
||||
const bool no_bucket_for_new_count =
|
||||
(bucket->prev == nullptr) || (bucket->prev->count > new_count);
|
||||
if (no_bucket_for_new_count && current_bucket_items.single()) {
|
||||
// Small optimization for very common elements: if the element is alone in
|
||||
// a bucket and there is no bucket for count + 1, we can just increment
|
||||
// the count of the bucket.
|
||||
bucket->count = new_count;
|
||||
return;
|
||||
}
|
||||
// Extract item from this bucket.
|
||||
auto dangling_item = current_bucket_items.erase(item);
|
||||
// Fetch the bucket with the correct count.
|
||||
Bucket* new_bucket = nullptr;
|
||||
if (bucket->prev && bucket->prev->count == new_count) {
|
||||
new_bucket = bucket->prev;
|
||||
} else {
|
||||
// We create a new empty bucket, which will be before the current bucket.
|
||||
new_bucket = buckets_.insert_before(bucket, bucket_alloc_.New());
|
||||
new_bucket->count = new_count;
|
||||
}
|
||||
// Insert the item in the new bucket at the end (newest).
|
||||
dangling_item->bucket = new_bucket;
|
||||
new_bucket->items.insert_back(std::move(dangling_item));
|
||||
|
||||
// Reclaim old bucket if it is empty.
|
||||
RemoveIfEmpty(bucket);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
void SpaceSavingMostFrequent<T, Hash, Eq>::FullyRemove(const T& value) {
|
||||
auto it = elem_to_item_.find(value);
|
||||
if (it == elem_to_item_.end()) return;
|
||||
Item* absl_nonnull item = it->second;
|
||||
Bucket* absl_nonnull bucket = item->bucket;
|
||||
item_alloc_.Return(bucket->items.erase(item));
|
||||
RemoveIfEmpty(bucket);
|
||||
elem_to_item_.erase(it);
|
||||
}
|
||||
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
std::vector<std::pair<T, int64_t>>
|
||||
SpaceSavingMostFrequent<T, Hash, Eq>::GetMostFrequent(int num_samples) const {
|
||||
std::vector<std::pair<T, int64_t>> result;
|
||||
result.reserve(num_samples);
|
||||
if (!buckets_.empty()) {
|
||||
for (Bucket* bucket = buckets_.front(); bucket; bucket = bucket->next) {
|
||||
const int64_t count = bucket->count;
|
||||
DCHECK(!bucket->items.empty());
|
||||
for (Item* item = bucket->items.back(); item; item = item->prev) {
|
||||
if (result.size() == num_samples) return result;
|
||||
result.emplace_back(item->value, count);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
T SpaceSavingMostFrequent<T, Hash, Eq>::PopMostFrequent() {
|
||||
CHECK(!buckets_.empty());
|
||||
const T value = buckets_.front()->items.back()->value;
|
||||
FullyRemove(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
template <typename T, typename Hash, typename Eq>
|
||||
int64_t SpaceSavingMostFrequent<T, Hash, Eq>::CountOfMostFrequent() const {
|
||||
return buckets_.empty() ? 0 : buckets_.front()->count;
|
||||
}
|
||||
|
||||
namespace ssmf_internal {
|
||||
|
||||
// This is semantically equivalent to a `std::unique_ptr` except that it does
|
||||
// not own the object and that only `BoundedAllocator` and `DoubleLinkedList`
|
||||
// are able to manage the stored pointer.
|
||||
// Other clients can access the object with the guarantee that it is non null.
|
||||
template <typename T>
|
||||
class Ptr {
|
||||
explicit Ptr(T* absl_nullable ptr) : ptr_(ptr) { get_nonnull(); }
|
||||
|
||||
T* absl_nonnull get_nonnull() const {
|
||||
DCHECK(ptr_ != nullptr);
|
||||
return ptr_;
|
||||
};
|
||||
|
||||
T* absl_nonnull release() {
|
||||
T* absl_nonnull ptr = get_nonnull();
|
||||
ptr_ = nullptr;
|
||||
return ptr;
|
||||
}
|
||||
|
||||
T* absl_nullable ptr_ = nullptr;
|
||||
|
||||
public:
|
||||
Ptr(const Ptr&) = delete;
|
||||
Ptr(Ptr&& other) : ptr_(other.release()) {}
|
||||
~Ptr() { DCHECK(ptr_ == nullptr); }
|
||||
T* absl_nonnull operator->() const { return get_nonnull(); };
|
||||
T& operator*() const { return *get_nonnull(); };
|
||||
friend class BoundedAllocator<T>;
|
||||
friend class DoubleLinkedList<T>;
|
||||
};
|
||||
|
||||
// Allocator that allows creating up to `max_size` objects.
|
||||
// Storage is allocated at once contiguously which helps with cache locality.
|
||||
// Objects that are returned to the allocator are stored in a freelist for later
|
||||
// use and are not destroyed right away. Objects that are extracted from the
|
||||
// freelist are default initialized for correctness.
|
||||
// The allocator makes sure that all created objects are returned to the pool
|
||||
// upon destruction, this allows to catch logic errors. It is possible to bypass
|
||||
// this behavior when it is safe to destroy all object at once by calling the
|
||||
// `DisposeAll` method. Once this method is called the allocator cannot be used
|
||||
// anymore.
|
||||
template <typename T>
|
||||
class BoundedAllocator {
|
||||
public:
|
||||
explicit BoundedAllocator(size_t max_size) : data_(max_size) {
|
||||
freelist_.reserve(max_size);
|
||||
for (auto& data : data_) {
|
||||
freelist_.push_back(&data);
|
||||
}
|
||||
}
|
||||
|
||||
~BoundedAllocator() {
|
||||
CHECK(empty()) << "some elements are not returned and won't be destroyed.";
|
||||
}
|
||||
|
||||
bool full() const { return freelist_.empty(); }
|
||||
|
||||
bool empty() const { return data_.size() == freelist_.size(); }
|
||||
|
||||
Ptr<T> New() {
|
||||
CHECK(!freelist_.empty());
|
||||
T* absl_nonnull t = freelist_.back();
|
||||
freelist_.pop_back();
|
||||
return Ptr<T>(t);
|
||||
}
|
||||
|
||||
void Return(Ptr<T> ptr) {
|
||||
T* absl_nonnull t = ptr.release();
|
||||
DCHECK(t != nullptr);
|
||||
DCHECK_GE(t, &data_.front());
|
||||
DCHECK_LE(t, &data_.back());
|
||||
*t = T();
|
||||
freelist_.push_back(t);
|
||||
}
|
||||
|
||||
// Destroys all allocated objects.
|
||||
// Once called, it is no more possible to allocate new objects.
|
||||
void DisposeAll() {
|
||||
freelist_.clear();
|
||||
data_.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<T> data_;
|
||||
std::vector<T* absl_nonnull> freelist_;
|
||||
};
|
||||
|
||||
// A simple doubly linked list with ownership transfer.
|
||||
// All elements added or extracted from the list are done though the `Ptr`
|
||||
// abstraction. This guarantees that there is always only one owner.
|
||||
template <typename T>
|
||||
class DoubleLinkedList {
|
||||
public:
|
||||
DoubleLinkedList() = default;
|
||||
DoubleLinkedList(const DoubleLinkedList&) = delete;
|
||||
~DoubleLinkedList() {
|
||||
DCHECK_EQ(front_, nullptr);
|
||||
DCHECK_EQ(front_, nullptr);
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
DCHECK_EQ(front_ == nullptr, back_ == nullptr);
|
||||
return front_ == nullptr;
|
||||
}
|
||||
|
||||
bool single() const {
|
||||
DCHECK_EQ(front_ == nullptr, back_ == nullptr);
|
||||
return front_ != nullptr && front_ == back_;
|
||||
}
|
||||
|
||||
T* absl_nonnull front() const {
|
||||
DCHECK_NE(front_, nullptr);
|
||||
return front_;
|
||||
}
|
||||
|
||||
T* absl_nonnull back() const {
|
||||
DCHECK_NE(back_, nullptr);
|
||||
return back_;
|
||||
}
|
||||
|
||||
T* absl_nonnull insert_after(T* absl_nonnull node, Ptr<T> new_node) {
|
||||
T* absl_nonnull new_node_ptr = new_node.release();
|
||||
new_node_ptr->prev = node;
|
||||
if (node->next == nullptr) {
|
||||
DCHECK_EQ(new_node_ptr->next, nullptr);
|
||||
back_ = new_node_ptr;
|
||||
} else {
|
||||
new_node_ptr->next = node->next;
|
||||
node->next->prev = new_node_ptr;
|
||||
}
|
||||
node->next = new_node_ptr;
|
||||
return new_node_ptr;
|
||||
}
|
||||
|
||||
T* absl_nonnull insert_before(T* absl_nonnull node, Ptr<T> new_node) {
|
||||
T* absl_nonnull new_node_ptr = new_node.release();
|
||||
new_node_ptr->next = node;
|
||||
if (node->prev == nullptr) {
|
||||
DCHECK_EQ(new_node_ptr->prev, nullptr);
|
||||
front_ = new_node_ptr;
|
||||
} else {
|
||||
new_node_ptr->prev = node->prev;
|
||||
node->prev->next = new_node_ptr;
|
||||
}
|
||||
node->prev = new_node_ptr;
|
||||
return new_node_ptr;
|
||||
}
|
||||
|
||||
T* absl_nonnull insert_front(Ptr<T> new_node) {
|
||||
if (front_ != nullptr) {
|
||||
return insert_before(front_, std::move(new_node));
|
||||
}
|
||||
T* absl_nonnull new_node_ptr = new_node.release();
|
||||
front_ = new_node_ptr;
|
||||
back_ = new_node_ptr;
|
||||
new_node_ptr->prev = nullptr;
|
||||
new_node_ptr->next = nullptr;
|
||||
return new_node_ptr;
|
||||
}
|
||||
|
||||
T* absl_nonnull insert_back(Ptr<T> new_node) {
|
||||
if (back_ != nullptr) {
|
||||
return insert_after(back_, std::move(new_node));
|
||||
} else {
|
||||
return insert_front(std::move(new_node));
|
||||
}
|
||||
}
|
||||
|
||||
ABSL_MUST_USE_RESULT Ptr<T> erase(T* absl_nonnull node) {
|
||||
if (node->prev) {
|
||||
node->prev->next = node->next;
|
||||
} else {
|
||||
front_ = node->next;
|
||||
}
|
||||
if (node->next) {
|
||||
node->next->prev = node->prev;
|
||||
} else {
|
||||
back_ = node->prev;
|
||||
}
|
||||
node->next = nullptr;
|
||||
node->prev = nullptr;
|
||||
return Ptr<T>(node);
|
||||
}
|
||||
|
||||
ABSL_MUST_USE_RESULT Ptr<T> pop_front() { return erase(front()); }
|
||||
|
||||
ABSL_MUST_USE_RESULT Ptr<T> pop_back() { return erase(back()); }
|
||||
|
||||
private:
|
||||
T* absl_nullable front_ = nullptr;
|
||||
T* absl_nullable back_ = nullptr;
|
||||
};
|
||||
} // namespace ssmf_internal
|
||||
|
||||
} // namespace operations_research
|
||||
|
||||
#endif // OR_TOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
|
||||
475
ortools/algorithms/space_saving_most_frequent_test.cc
Normal file
475
ortools/algorithms/space_saving_most_frequent_test.cc
Normal file
@@ -0,0 +1,475 @@
|
||||
// Copyright 2010-2025 Google LLC
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "ortools/algorithms/space_saving_most_frequent.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/algorithm/container.h"
|
||||
#include "absl/base/nullability.h"
|
||||
#include "absl/log/check.h"
|
||||
#include "absl/random/distributions.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "ortools/base/gmock.h"
|
||||
|
||||
namespace operations_research {
|
||||
namespace {
|
||||
|
||||
using ssmf_internal::BoundedAllocator;
|
||||
using ssmf_internal::DoubleLinkedList;
|
||||
using ssmf_internal::Ptr;
|
||||
using ::testing::ElementsAre;
|
||||
using ::testing::Pair;
|
||||
|
||||
// Checks the empty()/full() state of a one-slot allocator across a
// New()/Return() round trip.
TEST(BoundedAllocator, Alloc) {
  BoundedAllocator<int> allocator(1);
  // Nothing handed out yet.
  EXPECT_TRUE(allocator.empty());
  EXPECT_FALSE(allocator.full());

  // Taking the only slot exhausts the allocator.
  Ptr<int> slot = allocator.New();
  EXPECT_FALSE(allocator.empty());
  EXPECT_TRUE(allocator.full());
  *slot = 42;

  // Giving it back restores the initial state.
  allocator.Return(std::move(slot));
  EXPECT_TRUE(allocator.empty());
  EXPECT_FALSE(allocator.full());
}
|
||||
|
||||
// An object obtained again after being returned must have been reset.
TEST(BoundedAllocator, FromFreeList) {
  BoundedAllocator<int> allocator(1);

  Ptr<int> first_use = allocator.New();
  *first_use = 42;
  allocator.Return(std::move(first_use));

  // Same slot (the allocator has only one), but the 42 must be gone.
  Ptr<int> second_use = allocator.New();
  EXPECT_EQ(*second_use, 0);
  allocator.Return(std::move(second_use));
}
|
||||
|
||||
// Leaving an allocated object unreturned must terminate the process.
TEST(BoundedAllocator, UnReturnedItems) {
  const auto leak_one_object = [] {
    BoundedAllocator<int> allocator(1);
    allocator.New();  // Allocated item not Return-ed
  };
  ASSERT_DEATH(leak_one_object(), "");
}
|
||||
|
||||
// After DisposeAll() the allocator reports itself as both empty and full.
TEST(BoundedAllocator, Disposed) {
  BoundedAllocator<int> allocator(1);
  EXPECT_TRUE(allocator.empty());
  EXPECT_FALSE(allocator.full());

  allocator.DisposeAll();

  // Allocator becomes unusable: nothing is handed out and nothing can be.
  EXPECT_TRUE(allocator.empty());
  EXPECT_TRUE(allocator.full());
}
|
||||
|
||||
struct Node {
|
||||
int value = 0;
|
||||
Node* absl_nullable next = nullptr;
|
||||
Node* absl_nullable prev = nullptr;
|
||||
};
|
||||
|
||||
class DoublyLinkedListTest : public ::testing::Test {
|
||||
public:
|
||||
DoublyLinkedListTest() : allocator_(10) {}
|
||||
|
||||
void TearDown() override {
|
||||
while (!list_.empty()) {
|
||||
allocator_.Return(list_.pop_front());
|
||||
}
|
||||
}
|
||||
|
||||
static std::vector<int> AsVector(const DoubleLinkedList<Node>& list) {
|
||||
std::vector<int> values;
|
||||
if (!list.empty()) {
|
||||
for (Node* node = list.front(); node != nullptr; node = node->next) {
|
||||
values.push_back(node->value);
|
||||
}
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
protected:
|
||||
BoundedAllocator<Node> allocator_;
|
||||
DoubleLinkedList<Node> list_;
|
||||
};
|
||||
|
||||
// A freshly constructed list is empty, hence not a singleton either.
TEST_F(DoublyLinkedListTest, EmptyList) {
  EXPECT_TRUE(list_.empty());
  EXPECT_FALSE(list_.single());
}
|
||||
|
||||
// insert_front() prepends: the second node ends up before the first one.
TEST_F(DoublyLinkedListTest, InsertFront) {
  Ptr<Node> first = allocator_.New();
  first->value = 1;
  list_.insert_front(std::move(first));
  EXPECT_FALSE(list_.empty());
  EXPECT_TRUE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(1));

  Ptr<Node> second = allocator_.New();
  second->value = 2;
  list_.insert_front(std::move(second));
  EXPECT_FALSE(list_.empty());
  EXPECT_FALSE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(2, 1));
}
|
||||
|
||||
// insert_back() appends: nodes keep their insertion order.
TEST_F(DoublyLinkedListTest, InsertBack) {
  Node* first = list_.insert_back(allocator_.New());
  first->value = 1;
  EXPECT_FALSE(list_.empty());
  EXPECT_TRUE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(1));

  Node* second = list_.insert_back(allocator_.New());
  second->value = 2;
  EXPECT_FALSE(list_.empty());
  EXPECT_FALSE(list_.single());
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 2));
}
|
||||
|
||||
// insert_after() in the middle of the list, then after the last node.
TEST_F(DoublyLinkedListTest, InsertAfter) {
  Node* head = list_.insert_back(allocator_.New());
  head->value = 1;
  Node* tail = list_.insert_back(allocator_.New());
  tail->value = 2;

  // Between the two existing nodes.
  Node* middle = list_.insert_after(head, allocator_.New());
  middle->value = 3;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2));

  // After the current last node.
  Node* new_tail = list_.insert_after(tail, allocator_.New());
  new_tail->value = 4;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2, 4));
}
|
||||
|
||||
// insert_before() in the middle of the list, then before the first node.
TEST_F(DoublyLinkedListTest, InsertBefore) {
  Node* head = list_.insert_back(allocator_.New());
  head->value = 1;
  Node* tail = list_.insert_back(allocator_.New());
  tail->value = 2;

  // Between the two existing nodes.
  Node* middle = list_.insert_before(tail, allocator_.New());
  middle->value = 3;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3, 2));

  // Before the current first node.
  Node* new_head = list_.insert_before(head, allocator_.New());
  new_head->value = 4;
  EXPECT_THAT(AsVector(list_), ElementsAre(4, 1, 3, 2));
}
|
||||
|
||||
// erase() of a middle node, then the front node, then the only node left.
TEST_F(DoublyLinkedListTest, Erase) {
  Node* one = list_.insert_back(allocator_.New());
  one->value = 1;
  Node* two = list_.insert_back(allocator_.New());
  two->value = 2;
  Node* three = list_.insert_back(allocator_.New());
  three->value = 3;
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 2, 3));

  // Middle node.
  allocator_.Return(list_.erase(two));
  EXPECT_THAT(AsVector(list_), ElementsAre(1, 3));

  // Front node; a single node remains.
  allocator_.Return(list_.erase(one));
  EXPECT_THAT(AsVector(list_), ElementsAre(3));
  EXPECT_TRUE(list_.single());

  // Last remaining node.
  allocator_.Return(list_.erase(three));
  EXPECT_THAT(AsVector(list_), ElementsAre());
  EXPECT_TRUE(list_.empty());
}
|
||||
|
||||
// Very inefficient but very simple implementation of Space-Saving. Should
|
||||
// return the same results as SpaceSavingMostFrequent.
|
||||
template <typename T>
|
||||
class SpaceSavingMostFrequentNaive {
|
||||
public:
|
||||
explicit SpaceSavingMostFrequentNaive(int storage_size)
|
||||
: storage_size_(storage_size), contents_(storage_size) {
|
||||
CHECK_GT(storage_size, 0);
|
||||
}
|
||||
|
||||
void Add(T value) {
|
||||
++current_timestamp_;
|
||||
|
||||
// If the value is already in the list, update its count and timestamp.
|
||||
for (auto& item : contents_) {
|
||||
if (item.value == value) {
|
||||
item.IncrementAndUpdate(current_timestamp_);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, replace the least frequent item with the new one.
|
||||
auto it2 = absl::c_min_element(contents_);
|
||||
*it2 = ItemCountAndTimestamp{
|
||||
.count = 1, .timestamp = current_timestamp_, .value = value};
|
||||
}
|
||||
|
||||
void FullyRemove(T value) {
|
||||
for (auto& item : contents_) {
|
||||
if (item.value == value) {
|
||||
item.Clear();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) {
|
||||
std::vector<std::pair<T, int64_t>> result;
|
||||
|
||||
absl::c_sort(contents_, std::greater<ItemCountAndTimestamp>());
|
||||
for (const auto& [count, timestamp, value] : contents_) {
|
||||
if (count < 0) break;
|
||||
if (result.size() == num_samples) break;
|
||||
result.push_back({*value, count});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
int storage_size_;
|
||||
int64_t current_timestamp_ = 0;
|
||||
|
||||
struct ItemCountAndTimestamp {
|
||||
int64_t count = -1;
|
||||
int64_t timestamp = -1;
|
||||
std::optional<T> value;
|
||||
|
||||
void IncrementAndUpdate(int64_t new_timestamp) {
|
||||
++count;
|
||||
timestamp = new_timestamp;
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
count = -1;
|
||||
timestamp = -1;
|
||||
value = std::nullopt;
|
||||
}
|
||||
|
||||
auto AsTuple() const { return std::tie(count, timestamp, value); }
|
||||
|
||||
friend bool operator<(const ItemCountAndTimestamp& a,
|
||||
const ItemCountAndTimestamp& b) {
|
||||
return a.AsTuple() < b.AsTuple();
|
||||
}
|
||||
|
||||
friend bool operator>(const ItemCountAndTimestamp& a,
|
||||
const ItemCountAndTimestamp& b) {
|
||||
return a.AsTuple() > b.AsTuple();
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<ItemCountAndTimestamp> contents_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct Implementations {
|
||||
explicit Implementations(int storage_size)
|
||||
: impl(storage_size), naive(storage_size) {}
|
||||
|
||||
void Add(T value) {
|
||||
impl.Add(value);
|
||||
naive.Add(value);
|
||||
}
|
||||
void FullyRemove(T value) {
|
||||
impl.FullyRemove(value);
|
||||
naive.FullyRemove(value);
|
||||
}
|
||||
|
||||
std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) {
|
||||
const std::vector<std::pair<T, int64_t>> impl_result =
|
||||
impl.GetMostFrequent(num_samples);
|
||||
const std::vector<std::pair<T, int64_t>> naive_result =
|
||||
naive.GetMostFrequent(num_samples);
|
||||
EXPECT_THAT(impl_result, naive_result);
|
||||
return impl_result;
|
||||
}
|
||||
|
||||
void CheckIdenticalResults(int num_samples) {
|
||||
CHECK_EQ(impl.GetMostFrequent(num_samples),
|
||||
naive.GetMostFrequent(num_samples));
|
||||
}
|
||||
|
||||
SpaceSavingMostFrequent<T> impl;
|
||||
SpaceSavingMostFrequentNaive<T> naive;
|
||||
};
|
||||
|
||||
// Walks through a small scenario with five slots. The comments show the
// expected buckets as "count : items (oldest first)", highest count first.
TEST(SpaceSavingMostFrequent, SimpleExamples) {
  Implementations<std::string> most_frequent(5);
  const auto add_all = [&most_frequent](
                           std::initializer_list<const char*> items) {
    for (const char* item : items) most_frequent.Add(item);
  };

  // Fill the five slots.
  add_all({"a", "b", "c", "d", "e"});  // 1 : a, b, c, d, e
  // "a" becomes the clear leader.
  add_all({"a", "a", "a"});  // 4 : a | 1 : b, c, d, e
  // Everybody else is seen a second time, in order.
  add_all({"b", "c", "d", "e"});  // 4 : a | 2 : b, c, d, e

  // Eviction starts. "f" evicts "b", the oldest item of the lowest-count
  // bucket; then every newcomer evicts the previous one, which sits alone in
  // the count-one bucket.
  add_all({"f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p"});
  // 4 : a | 2 : c, d, e | 1 : p
  add_all({"p"});  // 4 : a | 2 : c, d, e, p
  add_all({"p"});  // 4 : a | 3 : p | 2 : c, d, e

  EXPECT_THAT(most_frequent.GetMostFrequent(10),
              ElementsAre(Pair("a", 4), Pair("p", 3), Pair("e", 2),
                          Pair("d", 2), Pair("c", 2)));

  most_frequent.FullyRemove("c");  // 4 : a | 3 : p | 2 : d, e
  add_all({"f"});                  // 4 : a | 3 : p | 2 : d, e | 1 : f

  EXPECT_THAT(most_frequent.GetMostFrequent(10),
              ElementsAre(Pair("a", 4), Pair("p", 3), Pair("e", 2),
                          Pair("d", 2), Pair("f", 1)));
}
|
||||
|
||||
// Shows why ties on the count are broken by evicting the oldest item. The
// comments show the expected buckets as "count : items (oldest first)".
TEST(SpaceSavingMostFrequent, CornerCase) {
  Implementations<std::string> most_frequent(5);
  const auto add_all = [&most_frequent](
                           std::initializer_list<const char*> items) {
    for (const char* item : items) most_frequent.Add(item);
  };

  // Fill the five slots.
  add_all({"a", "b", "c", "d", "e"});  // 1 : a, b, c, d, e

  // Eviction starts: each newcomer pushes out the oldest item.
  add_all({"f"});  // 1 : b, c, d, e, f (a was evicted)
  add_all({"g"});  // 1 : c, d, e, f, g (b was evicted)
  add_all({"x"});  // 1 : d, e, f, g, x (c was evicted)
  add_all({"y"});  // 1 : e, f, g, x, y (d was evicted)

  // Here's an example of why we should remove the oldest item in case of a
  // tie on the frequency count: we don't want "y" to remove the "x".
  add_all({"x"});  // 2 : x | 1 : e, f, g, y
  add_all({"y"});  // 2 : x, y | 1 : e, f, g
  add_all({"x"});  // 3 : x | 2 : y | 1 : e, f, g
  add_all({"y"});  // 3 : x, y | 1 : e, f, g

  EXPECT_THAT(most_frequent.GetMostFrequent(10),
              ElementsAre(Pair("y", 3), Pair("x", 3), Pair("g", 1),
                          Pair("f", 1), Pair("e", 1)));
}
|
||||
|
||||
// Randomized test: feeds random integers to `Implementations<int>`, now and
// then removing one of the items it currently reports, and finishes each
// instance with CheckIdenticalResults().
// NOTE(review): `Implementations` is defined outside this chunk; it presumably
// runs several implementations side by side so that CheckIdenticalResults()
// can compare them -- confirm against its definition.
TEST(SpaceSavingMostFrequent, RandomInstances) {
  absl::BitGen gen;
  static constexpr int kNumTests = 379;
  for (int test = 0; test < kNumTests; ++test) {
    const int num_items = absl::Uniform(gen, 0, 1000);
    const int num_samples = absl::Uniform(gen, 0, 100);
    const int storage_size = absl::Uniform(gen, 1, 100);

    Implementations<int> most_frequent(storage_size);
    for (int i = 0; i < num_items; ++i) {
      most_frequent.Add(absl::Uniform(gen, 0, 1000));

      // With probability 0.1, fully remove one of the reported items.
      if (!absl::Bernoulli(gen, 0.1)) continue;
      const auto candidates = most_frequent.GetMostFrequent(num_samples);
      // Empty when num_samples == 0; there is nothing to remove then.
      if (candidates.empty()) continue;
      // Keep the index in the type returned by Uniform() rather than
      // narrowing it to int.
      const auto to_remove = absl::Uniform(gen, 0u, candidates.size());
      most_frequent.FullyRemove(candidates[to_remove].first);
    }
    most_frequent.CheckIdenticalResults(num_samples);
  }
}
|
||||
|
||||
// Benchmark payload whose size can be tuned independently of its identity.
//
// Only `value` takes part in hashing and equality; `zeros` is padding made of
// `kElementSize` ints so that benchmarks can vary the cost of copying/storing
// an element.
// NOTE(review): with kElementSize == 0 the `zeros` member is a zero-length
// array, which relies on a compiler extension.
template <int kElementSize>
struct Element {
  Element() = default;
  explicit Element(int value) : value(value) {}

  // Identity of the element. Zero-initialized so that a default-constructed
  // Element can safely be compared and hashed.
  int value = 0;
  // Padding; never read.
  int zeros[kElementSize] = {};

  // Abseil hashing hook: hashes `value` only, consistently with operator==.
  template <typename H>
  friend H AbslHashValue(H h, const Element& e) {
    return H::combine(std::move(h), e.value);
  }

  // Two elements are equal iff their `value`s are equal.
  friend bool operator==(const Element& a, const Element& b) {
    return a.value == b.value;
  }
};
|
||||
|
||||
template <int kSize, int kCapacity, int kElementSize>
|
||||
void BM_Add_GeometricDistributed(benchmark::State& state) {
|
||||
using Element = Element<kElementSize>;
|
||||
static constexpr int kNumInputs = 100;
|
||||
absl::BitGen random;
|
||||
std::vector<std::vector<Element>> inputs;
|
||||
inputs.reserve(kNumInputs);
|
||||
std::geometric_distribution<int> distribution(1.0 / kCapacity);
|
||||
for (int i = 0; i < kNumInputs; ++i) {
|
||||
std::vector<Element>& input = inputs.emplace_back();
|
||||
input.reserve(kSize);
|
||||
for (int j = 0; j < kSize; ++j) {
|
||||
input.push_back(Element(distribution(random)));
|
||||
}
|
||||
}
|
||||
|
||||
// Start the benchmark.
|
||||
for (auto _ : state) {
|
||||
for (const std::vector<Element>& input : inputs) {
|
||||
SpaceSavingMostFrequent<Element> most_frequent(kCapacity);
|
||||
for (const Element& value : input) {
|
||||
most_frequent.Add(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
state.SetItemsProcessed(state.iterations() * kNumInputs * kSize);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_Add_GeometricDistributed<30, 10, 0>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<100, 30, 0>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<1000, 100, 0>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<10000, 1000, 0>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<100000, 10000, 0>);
|
||||
|
||||
BENCHMARK(BM_Add_GeometricDistributed<30, 10, 4>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<100, 30, 4>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<1000, 100, 4>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<10000, 1000, 4>);
|
||||
BENCHMARK(BM_Add_GeometricDistributed<100000, 10000, 4>);
|
||||
|
||||
} // namespace
|
||||
} // namespace operations_research
|
||||
Reference in New Issue
Block a user