ortools-clone/ortools/sat/inclusion.h

// Copyright 2010-2025 Google LLC
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ORTOOLS_SAT_INCLUSION_H_
#define ORTOOLS_SAT_INCLUSION_H_

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <cstdint>
#include <functional>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "absl/types/span.h"
#include "ortools/base/logging.h"
#include "ortools/sat/util.h"
#include "ortools/util/bitset.h"
#include "ortools/util/time_limit.h"

namespace operations_research {
namespace sat {

// A helper class to process many sets of integers in [0, n] and detect all the
// sets included in each other. This is a common operation in presolve, and
// while it can be slow, the algorithm used here is pretty efficient in
// practice.
//
// The algorithm is based on the SAT preprocessing algorithm to detect clauses
// that subsume others. It uses a one-watcher scheme where each subset
// candidate has only one element watched. To identify all potential subsets of
// a superset, one needs to inspect the watch lists of all the elements of the
// superset candidate.
//
// The number n will be detected automatically but we allocate various vectors
// of size n, so avoid having large integer values in your sets.
//
// All set contents will be accessed via storage_[index].
// This can be used with a vector<vector<>> or our CompactVectorVector that we
// use in a few places. But it can also be anything that supports:
// - storage_.size()
// - range iteration over storage_[index]
// - storage_[index].size()
template <class Storage>
class InclusionDetector {
 public:
  // Binds the detector to `storage` (kept by reference, not copied) and to
  // `time_limit` (kept as a raw pointer and dereferenced by DetectInclusions()
  // when polling for a timeout).
  // NOTE(review): both presumably need to outlive this object, and
  // `time_limit` is used without a null check -- confirm with callers.
  InclusionDetector(const Storage& storage, TimeLimit* time_limit)
      : storage_(storage), time_limit_(time_limit) {}

  // Resets the class to an empty state: forgets every candidate and zeroes the
  // subset/superset counters. The work limit, work_done() and Stopped() are
  // not modified here.
  void Reset() {
    num_potential_subsets_ = 0;
    num_potential_supersets_ = 0;
    candidates_.clear();
  }

  // Adds a candidate set to consider for the next DetectInclusions() call.
  // The argument is an index that will only be used via storage_[index] to get
  // the content of the candidate set.
  //
  // - AddPotentialSubset(): the candidate can only be reported as a subset.
  // - AddPotentialSuperset(): the candidate can only be reported as a superset.
  // - AddPotentialSet(): the candidate can be reported as either.
  //
  // Note that sets with no elements are just ignored and will never be
  // returned as part of an inclusion.
  void AddPotentialSubset(int index);
  void AddPotentialSuperset(int index);
  void AddPotentialSet(int index);

  // By default we will detect all inclusions. It is possible to make sure we
  // don't do more than O(work_limit) operations and eventually abort early by
  // setting this. Note that we don't reset it on Reset().
  //
  // This is needed, because for m candidates of size n, we can have O(m ^ 2)
  // inclusions, each requiring O(n) work to check.
  void SetWorkLimit(uint64_t work_limit) { work_limit_ = work_limit; }

  // Finds all subsets included in a superset and calls "process" on each of
  // the detected inclusions. The std::function arguments correspond to the
  // indices passed to the Add*() calls.
  //
  // The order of detection will be by increasing superset size. For superset
  // with the same size, the order will be deterministic but not specified. And
  // similarly, for a given superset, the order of the included subsets is
  // deterministic but not specified.
  //
  // Note that only the candidate marked as such can be a subset/superset.
  // For the candidates that can be both and are duplicates (i.e. same set),
  // only one pair will be returned. We will also never return identity
  // inclusion and we always have subset != superset.
  void DetectInclusions(
      const std::function<void(int subset, int superset)>& process);

  // Function that should only be used from within "process()".
  // Stop will abort the current search. The other two will cause the
  // corresponding candidate set to never appear in any future inclusion.
  void StopProcessingCurrentSubset() { stop_with_current_subset_ = true; }
  void StopProcessingCurrentSuperset() { stop_with_current_superset_ = true; }
  // Besides raising the stop flag, this empties the temporary data built by
  // DetectInclusions() (signatures, watch lists and the superset bitset).
  void Stop() {
    stop_ = true;
    signatures_.clear();
    one_watcher_.clear();
    is_in_superset_.resize(0);
  }

  // The algorithm here can detect many small sets included in a big set while
  // only scanning the superset once. So if we do scan the superset in the
  // process function, we can do a lot more work. This is here to reuse the
  // deterministic limit mechanism.
  void IncreaseWorkDone(uint64_t increase) { work_done_ += increase; }

  // Stats.
  int num_potential_subsets() const { return num_potential_subsets_; }
  int num_potential_supersets() const { return num_potential_supersets_; }
  uint64_t work_done() const { return work_done_; }
  bool Stopped() const { return stop_; }

 private:
  // Allows to access the elements of each candidates via storage_[index];
  const Storage& storage_;

  // Polled by DetectInclusions() roughly every 1000 units of work.
  TimeLimit* time_limit_;

  // List of candidates, this will be sorted.
  struct Candidate {
    int index;  // Storage index.
    int size;   // Number of elements of storage_[index] when it was added.

    // For identical sizes, we need this order for correctness
    // 0: subset only, 1: both, 2: superset only.
    int order = 1;

    bool CanBeSubset() const { return order <= 1; }
    bool CanBeSuperset() const { return order >= 1; }

    // We use this with stable_sort, so no need to add the index.
    bool operator<(const Candidate& other) const {
      return std::tie(size, order) < std::tie(other.size, other.order);
    }
  };
  std::vector<Candidate> candidates_;

  int num_potential_subsets_ = 0;
  int num_potential_supersets_ = 0;
  // Deterministic work counter, compared against work_limit_.
  uint64_t work_done_ = 0;
  uint64_t work_limit_ = std::numeric_limits<uint64_t>::max();

  // Flags raised by Stop() / StopProcessingCurrent*() and read back by
  // DetectInclusions() after each call to process().
  bool stop_ = false;
  bool stop_with_current_subset_ = false;
  bool stop_with_current_superset_ = false;

  // Temporary data of DetectInclusions(), emptied by Stop().
  // signatures_[i] is the 64-bit signature of candidates_[i].
  std::vector<uint64_t> signatures_;
  // one_watcher_[e] lists the subset candidates currently watching element e.
  std::vector<std::vector<int>> one_watcher_;  // Index in candidates_.
  // Private copy of the elements of the superset being processed.
  std::vector<int> superset_elements_;
  // Membership bitset of the superset being processed.
  Bitset64<int> is_in_superset_;
};

// Similar API and purpose to InclusionDetector. But this one is a bit simpler
// and faster if it fits your usage. This assumes an initial given set of
// potential subsets, that will be queried against supersets one by one.
template <class Storage>
class SubsetsDetector {
 public:
  // Binds the detector to `storage` (kept by reference, not copied) and to
  // `time_limit` (kept as a raw pointer and dereferenced by FindSubsets() when
  // polling for a timeout).
  // NOTE(review): both presumably need to outlive this object, and
  // `time_limit` is used without a null check -- confirm with callers.
  SubsetsDetector(const Storage& storage, TimeLimit* time_limit)
      : storage_(storage), time_limit_(time_limit) {}

  // Caps the deterministic work counter; exceeding it makes
  // IndexAllStorageAsSubsets() / FindSubsets() abort through Stop().
  void SetWorkLimit(uint64_t work_limit) { work_limit_ = work_limit; }
  // These two only raise a flag; FindSubsets() reads the flags back right
  // after process() returns.
  void StopProcessingCurrentSubset() { stop_with_current_subset_ = true; }
  void StopProcessingCurrentSuperset() { stop_with_current_superset_ = true; }
  // Raises the stop flag and empties the watch lists and the superset bitset.
  void Stop() {
    stop_ = true;
    one_watcher_.clear();
    is_in_superset_.resize(0);
  }

  uint64_t work_done() const { return work_done_; }
  bool Stopped() const { return stop_; }

  // Different API than InclusionDetector.
  // 1/ Add all potential subsets to the storage_.
  // 2/ Call IndexAllStorageAsSubsets()
  // 3/ Call one or more times FindSubsets().
  //    - process() can call StopProcessingCurrentSuperset() to abort early
  //    - process() can call StopProcessingCurrentSubset() to never consider
  //      that subset again.
  // 4/ Call Stop() to reclaim some memory.
  //
  // Optimization: next_index_to_try is an index in superset that can be used
  // to skip some position for which we already called FindSubsets().
  void IndexAllStorageAsSubsets();
  void FindSubsets(absl::Span<const int> superset, int* next_index_to_try,
                   const std::function<void(int subset)>& process);

 private:
  // Allows to access the elements of each subsets via storage_[index];
  const Storage& storage_;

  TimeLimit* time_limit_;
  // Deterministic work counter. Reset by IndexAllStorageAsSubsets() and
  // accumulated across FindSubsets() calls.
  uint64_t work_done_ = 0;
  uint64_t work_limit_ = std::numeric_limits<uint64_t>::max();

  // Per-subset payload stored in the watch list of its watched element.
  struct OneWatcherData {
    int index;           // Index of the subset in storage_.
    int other_element;   // A second element of the subset, for a quick check.
    uint64_t signature;  // Signature of the subset, for a quick check.
  };

  bool stop_ = false;
  bool stop_with_current_subset_ = false;
  bool stop_with_current_superset_ = false;
  // one_watcher_[e] lists the subsets currently watching element e.
  CompactVectorVector<int, OneWatcherData> one_watcher_;
  // Membership bitset of the superset passed to the current FindSubsets().
  Bitset64<int> is_in_superset_;
};

// Deduction guides: let `Storage` be deduced from the `storage` argument.
//
// NOTE(review): these guides take only `storage`, whereas the constructors
// declared in this file take (storage, time_limit). Deduction from the
// two-argument constructors still goes through the implicitly generated
// guides; confirm whether these one-argument guides should be updated to
// mirror the constructor signature.
template <typename Storage>
InclusionDetector(const Storage& storage) -> InclusionDetector<Storage>;

template <typename Storage>
SubsetsDetector(const Storage& storage) -> SubsetsDetector<Storage>;

// Registers storage_[index] as a candidate that may be reported both as a
// subset and as a superset. Empty sets are silently skipped.
template <typename Storage>
inline void InclusionDetector<Storage>::AddPotentialSet(int index) {
  DCHECK_GE(index, 0);
  DCHECK_LT(index, storage_.size());

  const int set_size = storage_[index].size();
  if (set_size != 0) {
    // Order 1 means "can be both a subset and a superset".
    candidates_.push_back({index, set_size, /*order=*/1});
    ++num_potential_supersets_;
    ++num_potential_subsets_;
  }
}

// Registers storage_[index] as a candidate that may only be reported as a
// subset. Empty sets are silently skipped.
template <typename Storage>
inline void InclusionDetector<Storage>::AddPotentialSubset(int index) {
  DCHECK_GE(index, 0);
  DCHECK_LT(index, storage_.size());

  const int set_size = storage_[index].size();
  if (set_size != 0) {
    // Order 0 means "subset only".
    candidates_.push_back({index, set_size, /*order=*/0});
    ++num_potential_subsets_;
  }
}

// Registers storage_[index] as a candidate that may only be reported as a
// superset. Empty sets are silently skipped.
template <typename Storage>
inline void InclusionDetector<Storage>::AddPotentialSuperset(int index) {
  DCHECK_GE(index, 0);
  DCHECK_LT(index, storage_.size());
  const int num_elements = storage_[index].size();
  if (num_elements == 0) return;

  // Order 2 means "superset only".
  ++num_potential_supersets_;
  candidates_.push_back({index, num_elements, /*order=*/2});
}

// Computes a 64-bit signature and the maximum element of `elements`.
//
// The signature has bit (e & 63) set for every element e. It is therefore
// order invariant and compatible with inclusion: if A is included in B, then
// (signature(A) & ~signature(B)) == 0. The converse does not hold, so the
// signature can only be used to quickly rule out an inclusion.
//
// Elements are expected to be non-negative (DCHECKed). For an empty span the
// result is {0, 0}.
inline std::pair<uint64_t, int> ComputeSignatureAndMaxElement(
    absl::Span<const int> elements) {
  uint64_t signature = 0;
  int max_element = 0;
  for (const int e : elements) {
    DCHECK_GE(e, 0);
    max_element = std::max(max_element, e);
    // Shift an unsigned one so that setting bit 63 never goes through a
    // signed value.
    signature |= uint64_t{1} << (e & 63);
  }
  return {signature, max_element};
}

// Detects every (subset, superset) inclusion among the registered candidates
// and calls process(subset_index, superset_index) for each of them, where both
// arguments are the storage indices given to the Add*() functions.
//
// Outline:
// - Candidates are stable-sorted by (size, order) and visited once in that
//   order.
// - Each candidate that can be a superset has its elements marked in
//   is_in_superset_; the watch list of each of its elements is then scanned.
//   A watched candidate is first filtered with its 64-bit signature, then
//   checked element by element against the bitset.
// - Each candidate that can be a subset is afterwards appended to the
//   currently shortest watch list among its elements.
//
// work_done_ is reset to zero once the trivial early-outs are passed. The
// search is aborted through Stop() as soon as work_done_ exceeds work_limit_
// or time_limit_ reports that the limit is reached; process() may also abort
// it by calling Stop().
template <typename Storage>
inline void InclusionDetector<Storage>::DetectInclusions(
    const std::function<void(int subset, int superset)>& process) {
  // No need to do any work in these cases.
  if (candidates_.size() <= 1) return;
  if (num_potential_subsets_ == 0) return;
  if (num_potential_supersets_ == 0) return;

  // Temp data must be ready to use.
  stop_ = false;
  DCHECK(signatures_.empty());
  DCHECK(one_watcher_.empty());

  // We check each time our work_done_ has increased by more than this.
  constexpr int64_t kCheckTimeLimitInterval = 1000;
  int64_t next_time_limit_check = kCheckTimeLimitInterval;

  // Main algo.
  work_done_ = 0;
  std::stable_sort(candidates_.begin(), candidates_.end());
  for (const Candidate& candidate : candidates_) {
    const auto& candidate_elements = storage_[candidate.index];
    // signatures_ gets exactly one entry per visited candidate, so its size is
    // the position of `candidate` in the sorted candidates_.
    const int candidate_index = signatures_.size();

    const auto [signature, max_element] =
        ComputeSignatureAndMaxElement(candidate_elements);
    signatures_.push_back(signature);
    // Grow the per-element structures so that every element of this candidate
    // is a valid index.
    DCHECK_EQ(is_in_superset_.size(), one_watcher_.size());
    if (max_element >= is_in_superset_.size()) {
      is_in_superset_.resize(max_element + 1);
      one_watcher_.resize(max_element + 1);
    }

    stop_with_current_superset_ = false;
    if (candidate.CanBeSuperset()) {
      const Candidate& superset = candidate;

      // Bitset should be cleared.
      DCHECK(std::all_of(is_in_superset_.begin(), is_in_superset_.end(),
                         [](bool b) { return !b; }));

      // Find any subset included in current superset.
      work_done_ += 2 * superset.size;
      if (work_done_ > work_limit_) return Stop();
      if (work_done_ > next_time_limit_check) {
        if (time_limit_->LimitReached()) return Stop();
        next_time_limit_check = work_done_ + kCheckTimeLimitInterval;
      }

      // We make a copy because process() might alter the content of the
      // storage when it returns "stop_with_current_superset_" and we need
      // to clean is_in_superset_ properly.
      //
      // TODO(user): Alternatively, we could clean is_in_superset_ in the
      // call to StopProcessingCurrentSuperset() and force client to call it
      // before altering the superset content.
      superset_elements_.assign(candidate_elements.begin(),
                                candidate_elements.end());
      for (const int e : superset_elements_) {
        is_in_superset_.Set(e);
      }

      const uint64_t superset_signature = signatures_.back();
      const auto is_in_superset_view = is_in_superset_.const_view();
      for (const int superset_e : superset_elements_) {
        work_done_ += one_watcher_[superset_e].size();
        // Index-based loop: the list can shrink while we iterate (see the
        // removal by swap below).
        for (int i = 0; i < one_watcher_[superset_e].size(); ++i) {
          const int c_index = one_watcher_[superset_e][i];
          const Candidate& subset = candidates_[c_index];
          DCHECK_LE(subset.size, superset.size);

          // Quick check with signature.
          if ((signatures_[c_index] & ~superset_signature) != 0) continue;

          // Long check with bitset.
          bool is_included = true;
          work_done_ += subset.size;
          if (work_done_ > work_limit_) return Stop();
          if (work_done_ > next_time_limit_check) {
            if (time_limit_->LimitReached()) return Stop();
            next_time_limit_check = work_done_ + kCheckTimeLimitInterval;
          }
          for (const int subset_e : storage_[subset.index]) {
            if (!is_in_superset_view[subset_e]) {
              is_included = false;
              break;
            }
          }
          if (!is_included) continue;

          stop_with_current_subset_ = false;
          process(subset.index, superset.index);

          // process() called Stop(), which already emptied the temporary
          // data.
          if (stop_) return;
          // process() may have called IncreaseWorkDone(), so re-check limits.
          if (work_done_ > work_limit_) return Stop();
          if (work_done_ > next_time_limit_check) {
            if (time_limit_->LimitReached()) return Stop();
            next_time_limit_check = work_done_ + kCheckTimeLimitInterval;
          }

          if (stop_with_current_subset_) {
            // Remove from the watcher list.
            // The last entry is moved into slot i, hence the --i so that it
            // is examined on the next iteration.
            std::swap(one_watcher_[superset_e][i],
                      one_watcher_[superset_e].back());
            one_watcher_[superset_e].pop_back();
            --i;
          }
          if (stop_with_current_superset_) break;
        }
        if (stop_with_current_superset_) break;
      }

      // Cleanup.
      for (const int e : superset_elements_) {
        is_in_superset_.ClearBucket(e);
      }
    }

    // Add new subset candidate to the watchers.
    //
    // Tricky: If this was also a superset and has been removed, we don't want
    // to watch it!
    if (candidate.CanBeSubset() && !stop_with_current_superset_) {
      // Choose to watch the one with smallest list.
      int best_choice = -1;
      work_done_ += candidate.size;
      if (work_done_ > work_limit_) return Stop();
      for (const int e : candidate_elements) {
        DCHECK_GE(e, 0);
        DCHECK_LT(e, one_watcher_.size());
        if (best_choice == -1 ||
            one_watcher_[e].size() < one_watcher_[best_choice].size()) {
          best_choice = e;
        }
      }
      DCHECK_NE(best_choice, -1);
      one_watcher_[best_choice].push_back(candidate_index);
    }
  }

  // Stop also performs some cleanup.
  Stop();
}

// Indexes every set of storage_ as a potential subset.
//
// For each set, one element is chosen as the "watched" element and a second
// one as the "other element" used by FindSubsets() for a quick rejection
// test. The watched element is the one that is watched by the fewest sets so
// far, which keeps the watch lists balanced.
//
// Every set must have at least two elements (CHECKed). Elements are expected
// to be non-negative, and the DCHECKs below assume they are distinct within a
// set.
//
// Clears the stop flag and resets work_done_ to zero. If work_limit_ is
// exceeded while indexing, the function aborts through Stop() and no subset
// is indexed.
template <typename Storage>
inline void SubsetsDetector<Storage>::IndexAllStorageAsSubsets() {
  stop_ = false;

  // Flat representation of one_watcher_, we will fill it in one go from there.
  std::vector<int> tmp_keys;
  std::vector<OneWatcherData> tmp_values;
  tmp_keys.reserve(storage_.size());
  tmp_values.reserve(storage_.size());

  // element_to_num_watched[e] is the number of sets watching e so far.
  std::vector<int> element_to_num_watched;

  work_done_ = 0;
  for (int index = 0; index < storage_.size(); ++index) {
    const auto& subset = storage_[index];
    CHECK_GE(subset.size(), 2);

    const auto [signature, max_element] = ComputeSignatureAndMaxElement(subset);
    if (max_element >= is_in_superset_.size()) {
      is_in_superset_.resize(max_element + 1);
    }
    if (max_element >= element_to_num_watched.size()) {
      element_to_num_watched.resize(max_element + 1);
    }

    // Choose to watch the element with the smallest list so far, and remember
    // the element with the second smallest list as the "other element". On
    // ties, the element seen first wins.
    int best_choice = -1;
    int best_value = 0;
    int second_choice = -1;
    int second_value = 0;
    work_done_ += subset.size();
    if (work_done_ > work_limit_) return Stop();
    for (const int e : subset) {
      DCHECK_GE(e, 0);
      DCHECK_LT(e, element_to_num_watched.size());
      const int value = element_to_num_watched[e];
      if (best_choice == -1 || value < best_value) {
        // New best: the previous best becomes the runner-up.
        second_choice = best_choice;
        second_value = best_value;
        best_choice = e;
        best_value = value;
      } else if (second_choice == -1 || value < second_value) {
        second_choice = e;
        second_value = value;
      }
    }
    DCHECK_NE(best_choice, -1);
    DCHECK_NE(second_choice, -1);
    DCHECK_NE(best_choice, second_choice);

    element_to_num_watched[best_choice]++;
    tmp_keys.push_back(best_choice);
    tmp_values.push_back({index, second_choice, signature});
  }

  one_watcher_.ResetFromFlatMapping(tmp_keys, tmp_values);
}

// Calls process(subset_index) for every indexed subset that is included in
// `superset` and whose watched element sits at a position >=
// *next_index_to_try in `superset`.
//
// *next_index_to_try is advanced in place while scanning: it reaches
// superset.size() after a full scan, and stays on the position being scanned
// when the scan is interrupted (StopProcessingCurrentSuperset(), Stop(), or a
// limit being hit).
//
// work_done_ is accumulated across calls (it is not reset here). The call
// aborts through Stop() when work_done_ exceeds work_limit_ or when
// time_limit_ reports that the limit is reached.
template <typename Storage>
inline void SubsetsDetector<Storage>::FindSubsets(
    absl::Span<const int> superset, int* next_index_to_try,
    const std::function<void(int subset)>& process) {
  // We check each time our work_done_ has increased by more than this.
  // Note that this threshold restarts from the same constant on every call
  // while work_done_ keeps growing across calls.
  constexpr int64_t kCheckTimeLimitInterval = 1000;
  int64_t next_time_limit_check = kCheckTimeLimitInterval;

  // Compute the signature and also resize vector if needed. We want a
  // signature that is order invariant and is compatible with inclusion.
  const auto [superset_signature, max_element] =
      ComputeSignatureAndMaxElement(superset);
  if (max_element >= is_in_superset_.size()) {
    is_in_superset_.resize(max_element + 1);
  }

  // Find any subset included in current superset.
  work_done_ += 2 * superset.size();
  if (work_done_ > work_limit_) return Stop();
  if (work_done_ > next_time_limit_check) {
    if (time_limit_->LimitReached()) return Stop();
    next_time_limit_check = work_done_ + kCheckTimeLimitInterval;
  }

  // Bitset should be cleared.
  DCHECK(std::all_of(is_in_superset_.begin(), is_in_superset_.end(),
                     [](bool b) { return !b; }));
  for (const int e : superset) {
    is_in_superset_.Set(e);
  }

  stop_with_current_superset_ = false;
  const auto is_in_superset_view = is_in_superset_.const_view();
  for (; *next_index_to_try < superset.size(); ++*next_index_to_try) {
    const int superset_e = superset[*next_index_to_try];
    // No subset watches an element beyond the indexed range.
    if (superset_e >= one_watcher_.size()) continue;
    // Local view of the watch list; it is shrunk below in step with
    // RemoveBySwap().
    auto cached_span = one_watcher_[superset_e];
    for (int i = 0; i < cached_span.size(); ++i) {
      ++work_done_;

      // Do a bunch of quick checks. The second one is optimized for size 2
      // which happens a lot in our usage of merging clique with implications.
      const auto [subset_index, other_e, subset_signature] = cached_span[i];
      if ((subset_signature & ~superset_signature) != 0) continue;
      if (!is_in_superset_view[other_e]) continue;

      // Long check with bitset.
      const absl::Span<const int> subset = storage_[subset_index];
      if (subset.size() > superset.size()) continue;

      // TODO(user): Technically we do not need to check the watched position or
      // the "other element" position, we could do that by permuting them first
      // or last and iterating on a subspan. However, in many slow situation, we
      // have millions of size 2 sets, and the time is dominated by the first
      // check.
      bool is_included = true;
      work_done_ += subset.size();
      if (work_done_ > work_limit_) return Stop();
      if (work_done_ > next_time_limit_check) {
        if (time_limit_->LimitReached()) return Stop();
        next_time_limit_check = work_done_ + kCheckTimeLimitInterval;
      }
      for (const int subset_e : subset) {
        if (!is_in_superset_view[subset_e]) {
          is_included = false;
          break;
        }
      }
      if (!is_included) continue;

      stop_with_current_subset_ = false;
      process(subset_index);

      // TODO(user): Remove this and the more complex API need once we move
      // class.
      // process() called Stop(), which already emptied the watch lists and
      // the bitset.
      if (stop_) return;
      if (work_done_ > work_limit_) return Stop();
      if (work_done_ > next_time_limit_check) {
        if (time_limit_->LimitReached()) return Stop();
        next_time_limit_check = work_done_ + kCheckTimeLimitInterval;
      }

      if (stop_with_current_subset_) {
        // Drop this subset from the watch list and keep the local view in
        // sync; --i so that position i is examined again on the next
        // iteration.
        one_watcher_.RemoveBySwap(superset_e, i);
        cached_span.remove_suffix(1);
        --i;
      }
      if (stop_with_current_superset_) break;
    }
    if (stop_with_current_superset_) break;
  }

  // Cleanup.
  for (const int e : superset) {
    is_in_superset_.ClearBucket(e);
  }
}

}  // namespace sat
}  // namespace operations_research

#endif  // ORTOOLS_SAT_INCLUSION_H_