Skip to content

Commit

Permalink
Merge pull request #396 from jmalkin/ebpps
Browse files Browse the repository at this point in the history
Sampling: Exact and Bounded, Probability Proportional to Size
  • Loading branch information
jmalkin authored Oct 19, 2023
2 parents 253eacf + 0328686 commit 90176b6
Show file tree
Hide file tree
Showing 12 changed files with 2,107 additions and 9 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
# Enable testing
option(BUILD_TESTS "Build unit tests" ON)
if (BUILD_TESTS)
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure")
enable_testing()
endif()

Expand Down
5 changes: 4 additions & 1 deletion common/include/common_defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@ namespace random_utils {
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
static thread_local std::mt19937_64 rand(rd());
static thread_local std::uniform_real_distribution<> next_double(0.0, 1.0);
}

inline void override_seed(uint64_t s) {
rand.seed(s);
}
}

// utility function to hide unused compiler warning
// usually has no additional cost
Expand Down
4 changes: 4 additions & 0 deletions sampling/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,8 @@ install(FILES
include/var_opt_sketch_impl.hpp
include/var_opt_union.hpp
include/var_opt_union_impl.hpp
include/ebpps_sample.hpp
include/ebpps_sample_impl.hpp
include/ebpps_sketch.hpp
include/ebpps_sketch_impl.hpp
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
210 changes: 210 additions & 0 deletions sampling/include/ebpps_sample.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef _EBPPS_SAMPLE_HPP_
#define _EBPPS_SAMPLE_HPP_

#include "common_defs.hpp"
#include "optional.hpp"
#include "serde.hpp"

#include <memory>
#include <vector>

namespace datasketches {

template<
typename T,
typename A = std::allocator<T>
>
class ebpps_sample {
public:
explicit ebpps_sample(uint32_t k, const A& allocator = A());

// constructor used to create a sample to merge one itme
template<typename TT>
ebpps_sample(TT&& item, double theta, const A& allocator = A());

// for deserialization
class items_deleter;
ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A());

void reset();
void downsample(double theta);

template<typename FwdSample>
void merge(FwdSample&& other);

// standard way to query the sample
using result_type = std::vector<T, A>;
result_type get_sample() const;

double get_c() const;

// intended for internal use
// returns only full items
result_type get_full_items() const;

// intended for internal use
// handles only the partial item
bool has_partial_item() const;
T get_partial_item() const;

string<A> to_string() const;

/**
* @brief Returns the number of items contained in the sample
* Computes the number of items, full or partial, currently in the sample.
* The result should match ceiling(c);
* @return the number of items contained in the sample
*/
inline uint32_t get_num_retained_items() const;

/**
* Computes size needed to serialize the current state of the sample.
* This version is for fixed-size arithmetic types (integral and floating point).
* @param sd instance of a SerDe
* @return size in bytes needed to serialize the items in this sample
*/
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;

/**
* Computes size needed to serialize the items in the sample.
* This version is for all other types and can be expensive since every item needs to be looked at.
* @param sd instance of a SerDe
* @return size in bytes needed to serialize the items in this sample
*/
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;

// This is a convenience alias for users
// The type returned by the following serialize method
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;

/**
* This method serializes the sample as a vector of bytes.
* @param ptr pointer to start of pre-allocated memory to use
* @param end_ptr pointer to end of pre-allocated memory to use
* @param sd instance of a SerDe
* @return number of bytes written
*/
template<typename SerDe = serde<T>>
size_t serialize(uint8_t* ptr, const uint8_t* end_ptr, const SerDe& sd = SerDe()) const;

/**
* This method serializes the sample into a given stream in a binary form
* @param os output stream
* @param sd instance of a SerDe
*/
template<typename SerDe = serde<T>>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;

/**
* This method deserializes a sample from a given array of bytes.
* @param ptr pointer to the array of bytes
* @param size the size of the array
* @param sd instance of a SerDe
* @param allocator instance of an allocator
* @return a std::pair with a sample and the number of bytes read
*/
template<typename SerDe = serde<T>>
static std::pair<ebpps_sample, size_t> deserialize(const uint8_t* ptr, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());

/**
* This method deserializes a sample from a given stream.
* @param is input stream
* @param sd instance of a SerDe
* @param allocator instance of an allocator
* @return an instance of a sample
*/
template<typename SerDe = serde<T>>
static ebpps_sample deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());

class const_iterator;

/**
* Iterator pointing to the first item in the sample.
* If the sample is empty, the returned iterator must not be dereferenced or incremented
* @param force_partial forces the inclusion of the partial item, if one exists
* @return iterator pointing to the first item in the sample
*/
const_iterator begin() const;

/**
* Iterator pointing to the past-the-end item in the sample.
* The past-the-end item is the hypothetical item that would follow the last item.
* It does not point to any item, and must not be dereferenced or incremented.
* @return iterator pointing to the past-the-end item in the sample
*/
const_iterator end() const;

private:
A allocator_;
double c_; // Current sample size, including fractional part
optional<T> partial_item_; // a sample item corresponding to a partial weight
std::vector<T, A> data_; // stored sampled items

template<typename FwdItem>
inline void set_partial(FwdItem&& item);
void swap_with_partial();
void move_one_to_partial();
void subsample(uint32_t num_samples);

static inline uint32_t random_idx(uint32_t max);
static inline double next_double();

friend class const_iterator;
};

template<typename T, typename A>
class ebpps_sample<T, A>::const_iterator {
public:
using iterator_category = std::input_iterator_tag;
using value_type = const T&;
using difference_type = void;
using pointer = const return_value_holder<value_type>;
using reference = value_type;

const_iterator(const const_iterator& other);
const_iterator& operator++();
const_iterator& operator++(int);
bool operator==(const const_iterator& other) const;
bool operator!=(const const_iterator& other) const;
reference operator*() const;
pointer operator->() const;

private:
static const size_t PARTIAL_IDX = static_cast<size_t>(-1);

// default iterator over sample
const_iterator(const ebpps_sample<T, A>* sample);

const ebpps_sample<T, A>* sample_;
size_t idx_;
bool use_partial_;

friend class ebpps_sample;
};

} // namespace datasketches

#include "ebpps_sample_impl.hpp"

#endif // _EBPPS_SAMPLE_HPP_
Loading

0 comments on commit 90176b6

Please sign in to comment.