forked from google/or-tools
-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathduplicate_remover.h
142 lines (122 loc) · 5 KB
/
duplicate_remover.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Copyright 2010-2024 Google LLC
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef OR_TOOLS_ALGORITHMS_DUPLICATE_REMOVER_H_
#define OR_TOOLS_ALGORITHMS_DUPLICATE_REMOVER_H_
#include <cstddef>
#include <cstdint>
#include <vector>
#include "absl/log/check.h"
#include "absl/numeric/bits.h"
#include "absl/random/distributions.h"
#include "absl/random/random.h"
#include "absl/types/span.h"
#include "google/protobuf/repeated_field.h"
namespace operations_research {
// This class offers an alternative to gtl::linked_hash_set<> which is:
// - stateless: it works directly on a vector<int> or any similar container,
// without storing extra data anywhere;
// - faster when the number of unique values is 5K or above.
//
// The memory usage can be O(num_distinct_values) at any time if you use
// AppendAndLazilyRemoveDuplicates(). In fact, unit tests verify that the
// average number of elements kept is ≤ 1.5 * num_distinct_values, making
// it comparable to a flat_hash_set<int> (whose overhead factor is ~1.68).
//
// Usage pattern:
//
// // One instance of this can handle many sets on the same [0, n) domain.
// int N = 100'000;
// DenseIntDuplicateRemover deduper(N); // Uses N/8 bytes of memory.
// std::vector<int> values; // Your container. Could be RepeatedField<int>.
// for (int x : ...) {
// deduper.AppendAndLazilyRemoveDuplicates(x, &values); // O(1) amortized.
// }
// deduper.RemoveDuplicates(&values); // O(values.size())
//
class DenseIntDuplicateRemover {
public:
explicit DenseIntDuplicateRemover(int n)
: n_(n),
tmp_mask_storage_((n + 7) / 8, 0),
tmp_mask_(tmp_mask_storage_) {}
template <class IntContainer>
void RemoveDuplicates(IntContainer* container);
template <class IntContainer>
void AppendAndLazilyRemoveDuplicates(int x, IntContainer* container);
private:
template <class IntContainer>
void Append(int x, IntContainer* container);
template <class IntContainer>
void Truncate(size_t new_size, IntContainer* container);
size_t RemoveDuplicatesInternal(absl::Span<int> span);
absl::BitGen random_;
const int n_;
std::vector<uint8_t> tmp_mask_storage_;
const absl::Span<uint8_t> tmp_mask_;
};
// _____________________________________________________________________________
// Implementation of the templates.
template <class IntContainer>
void DenseIntDuplicateRemover::RemoveDuplicates(IntContainer* container) {
const size_t new_size = RemoveDuplicatesInternal(absl::MakeSpan(*container));
Truncate(new_size, container);
}
template <class IntContainer>
void DenseIntDuplicateRemover::AppendAndLazilyRemoveDuplicates(
int x, IntContainer* container) {
DCHECK_GE(x, 0);
DCHECK_LT(x, n_);
Append(x, container);
// ALGORITHM:
// In order to remain stateless, yet call RemoveDuplicates() often enough
// that the size of the container remains O(num_distinct_elements), but not
// too often since we must remain O(1) time amortized, we randomize:
// every time we append an element, we'll call RemoveDuplicates() with
// probability 1/k, where k is the current size of the container.
// That way, the added expected complexity is O(k)*1/k = O(1), yet we know
// that we'll eventually call it. See the unit tests that verify the claims.
// As an important optimization, since drawing the pseudo-random number is
// expensive, we only perform it every kCheckPeriod, and to compensate we
// multiply the probability by the same amount.
constexpr int kCheckPeriod = 8;
static_assert(absl::popcount(unsigned(kCheckPeriod)) == 1,
"must be power of two");
const size_t size = container->size();
if (size & (kCheckPeriod - 1)) return;
if (size >= 2 * n_ ||
absl::Uniform<size_t>(random_, 0, container->size()) < kCheckPeriod) {
RemoveDuplicates(container);
}
}
template <>
inline void DenseIntDuplicateRemover::Append(int x,
std::vector<int>* container) {
container->push_back(x);
}
template <>
inline void DenseIntDuplicateRemover::Append(
int x, google::protobuf::RepeatedField<int>* container) {
container->Add(x);
}
template <>
inline void DenseIntDuplicateRemover::Truncate(size_t new_size,
std::vector<int>* container) {
container->resize(new_size);
}
template <>
inline void DenseIntDuplicateRemover::Truncate(
size_t new_size, google::protobuf::RepeatedField<int>* container) {
container->Truncate(new_size);
}
} // namespace operations_research
#endif // OR_TOOLS_ALGORITHMS_DUPLICATE_REMOVER_H_