Skip to content

Commit

Permalink
Merge pull request #431 from apache/filter
Browse files Browse the repository at this point in the history
implemented filter
  • Loading branch information
AlexanderSaydakov authored May 10, 2024
2 parents 85254b7 + af8e2b9 commit 836b87e
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 1 deletion.
29 changes: 28 additions & 1 deletion tuple/include/tuple_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,15 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
*/
compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;

/**
* Produces a Compact Tuple sketch from this sketch
* by applying a given predicate to each entry.
* This sketch itself is not modified.
* @tparam Predicate type of a callable that accepts the Summary of an entry
* and returns a value convertible to bool
* @param predicate is called with the Summary of each retained entry (not the key);
* should return true for the entries to keep
* @return compact sketch with the entries retained according to the predicate
*/
template<typename Predicate>
compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;

virtual iterator begin();
virtual iterator end();
virtual const_iterator begin() const;
Expand Down Expand Up @@ -480,6 +489,25 @@ class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
virtual uint32_t get_num_retained() const;
virtual uint16_t get_seed_hash() const;

/**
* Produces a Compact Tuple sketch from this sketch
* by applying a given predicate to each entry.
* This sketch itself is not modified.
* @tparam Predicate type of a callable that accepts the Summary of an entry
* and returns a value convertible to bool
* @param predicate is called with the Summary of each retained entry (not the key);
* should return true for the entries to keep
* @return compact sketch with the entries retained according to the predicate
*/
template<typename Predicate>
compact_tuple_sketch filter(const Predicate& predicate) const;

/**
* Produces a Compact Tuple sketch from a given sketch (Update or Compact)
* by applying a given predicate to each entry.
* Theta, seed hash and the ordered flag of the result are taken from the input sketch.
* @tparam Sketch type of the input sketch; it must provide get_allocator(),
* get_num_retained(), begin(), end(), is_estimation_mode(), is_ordered(),
* get_seed_hash() and get_theta64()
* @tparam Predicate type of a callable that accepts the Summary of an entry
* and returns a value convertible to bool
* @param sketch input sketch
* @param predicate is called with the Summary of each retained entry (not the key);
* should return true for the entries to keep
* @return compact sketch with the entries retained according to the predicate
*/
template<typename Sketch, typename Predicate>
static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);

/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
Expand Down Expand Up @@ -579,7 +607,6 @@ class compact_tuple_sketch: public tuple_sketch<Summary, Allocator> {
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);

};

/// Tuple base builder
Expand Down
33 changes: 33 additions & 0 deletions tuple/include/tuple_sketch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
return compact_tuple_sketch<S, A>(*this, ordered);
}

// Filters the entries of this update sketch by their summaries.
// All the work is delegated to the static compact_tuple_sketch filter,
// with this sketch passed as the input.
template<typename S, typename U, typename P, typename A>
template<typename Predicate>
compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
  using compact_type = compact_tuple_sketch<S, A>;
  return compact_type::filter(*this, predicate);
}

template<typename S, typename U, typename P, typename A>
void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
Expand Down Expand Up @@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const {
return seed_hash_;
}

// Filters the entries of this compact sketch by their summaries.
// Forwards to the two-argument static overload with this sketch as the input.
template<typename S, typename A>
template<typename Predicate>
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
  return compact_tuple_sketch<S, A>::filter(*this, predicate);
}

// Builds a compact sketch out of those entries of the input sketch
// whose summary (the second member of the entry) satisfies the predicate.
// Theta, seed hash and the ordered flag are taken over from the input sketch.
template<typename S, typename A>
template<typename Sketch, typename Predicate>
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
  std::vector<Entry, AllocEntry> kept(sketch.get_allocator());
  // reserve for the case when every entry passes, trim the excess afterwards
  kept.reserve(sketch.get_num_retained());
  for (const Entry& candidate: sketch) {
    if (predicate(candidate.second)) kept.push_back(candidate);
  }
  kept.shrink_to_fit();

  // the result is flagged as empty only if the input is not in estimation mode
  // and no entry passed the predicate
  const bool result_is_empty = !sketch.is_estimation_mode() && kept.empty();
  return compact_tuple_sketch(
    result_is_empty,
    sketch.is_ordered(),
    sketch.get_seed_hash(),
    sketch.get_theta64(),
    std::move(kept)
  );
}

// implementation for fixed-size arithmetic types (integral and floating point)
template<typename S, typename A>
template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
Expand Down
61 changes: 61 additions & 0 deletions tuple/test/tuple_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk
REQUIRE(sketch.get_num_retained() == 3);
}

// Exercises filter() on update and compact sketches in three states:
// empty, exact mode and estimation mode.
TEST_CASE("filter", "[tuple_sketch]") {
  // predicates receive the summary (an int counter here), not the key
  const auto keep_all = [](int) { return true; };
  const auto above_one = [](int v) { return v > 1; };
  const auto above_two = [](int v) { return v > 2; };

  auto usk = update_tuple_sketch<int>::builder().build();

  { // empty update sketch
    auto filtered = usk.filter(keep_all);
    REQUIRE(filtered.is_empty());
    REQUIRE(filtered.is_ordered());
    REQUIRE(filtered.get_num_retained() == 0);
  }

  { // empty compact sketch
    auto compacted = usk.compact();
    auto filtered = compacted.filter(keep_all);
    REQUIRE(filtered.is_empty());
    REQUIRE(filtered.is_ordered());
    REQUIRE(filtered.get_num_retained() == 0);
  }

  // keys 1 and 2 are updated twice, key 3 only once
  const int initial_keys[] = {1, 1, 2, 2, 3};
  for (const int key: initial_keys) usk.update(key, 1);

  { // exact mode update sketch
    auto filtered = usk.filter(above_one);
    REQUIRE_FALSE(filtered.is_empty());
    REQUIRE_FALSE(filtered.is_ordered());
    REQUIRE_FALSE(filtered.is_estimation_mode());
    REQUIRE(filtered.get_num_retained() == 2);
  }

  { // exact mode compact sketch
    auto compacted = usk.compact();
    auto filtered = compacted.filter(above_one);
    REQUIRE_FALSE(filtered.is_empty());
    REQUIRE(filtered.is_ordered());
    REQUIRE_FALSE(filtered.is_estimation_mode());
    REQUIRE(filtered.get_num_retained() == 2);
  }

  // before this loop only keys 1 and 2 had the value 2; the loop raises them to 3
  // estimation mode discards some entries, but these two happen to survive
  // the process is deterministic, so the outcome is the same on every run
  for (int key = 0; key < 10000; ++key) usk.update(key, 1);

  { // estimation mode update sketch
    auto filtered = usk.filter(above_two);
    REQUIRE_FALSE(filtered.is_empty());
    REQUIRE_FALSE(filtered.is_ordered());
    REQUIRE(filtered.is_estimation_mode());
    REQUIRE(filtered.get_num_retained() == 2);
  }

  { // estimation mode compact sketch
    auto compacted = usk.compact();
    auto filtered = compacted.filter(above_two);
    REQUIRE_FALSE(filtered.is_empty());
    REQUIRE(filtered.is_ordered());
    REQUIRE(filtered.is_estimation_mode());
    REQUIRE(filtered.get_num_retained() == 2);
  }
}

} /* namespace datasketches */

0 comments on commit 836b87e

Please sign in to comment.