-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #396 from jmalkin/ebpps
Sampling: Exact and Bounded, Probability Proportional to Size
- Loading branch information
Showing
12 changed files
with
2,107 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
#ifndef _EBPPS_SAMPLE_HPP_ | ||
#define _EBPPS_SAMPLE_HPP_ | ||
|
||
#include "common_defs.hpp" | ||
#include "optional.hpp" | ||
#include "serde.hpp" | ||
|
||
#include <memory> | ||
#include <vector> | ||
|
||
namespace datasketches { | ||
|
||
template< | ||
typename T, | ||
typename A = std::allocator<T> | ||
> | ||
class ebpps_sample { | ||
public: | ||
explicit ebpps_sample(uint32_t k, const A& allocator = A()); | ||
|
||
// constructor used to create a sample to merge one itme | ||
template<typename TT> | ||
ebpps_sample(TT&& item, double theta, const A& allocator = A()); | ||
|
||
// for deserialization | ||
class items_deleter; | ||
ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A()); | ||
|
||
void reset(); | ||
void downsample(double theta); | ||
|
||
template<typename FwdSample> | ||
void merge(FwdSample&& other); | ||
|
||
// standard way to query the sample | ||
using result_type = std::vector<T, A>; | ||
result_type get_sample() const; | ||
|
||
double get_c() const; | ||
|
||
// intended for internal use | ||
// returns only full items | ||
result_type get_full_items() const; | ||
|
||
// intended for internal use | ||
// handles only the partial item | ||
bool has_partial_item() const; | ||
T get_partial_item() const; | ||
|
||
string<A> to_string() const; | ||
|
||
/** | ||
* @brief Returns the number of items contained in the sample | ||
* Computes the number of items, full or partial, currently in the sample. | ||
* The result should match ceiling(c); | ||
* @return the number of items contained in the sample | ||
*/ | ||
inline uint32_t get_num_retained_items() const; | ||
|
||
/** | ||
* Computes size needed to serialize the current state of the sample. | ||
* This version is for fixed-size arithmetic types (integral and floating point). | ||
* @param sd instance of a SerDe | ||
* @return size in bytes needed to serialize the items in this sample | ||
*/ | ||
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0> | ||
inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const; | ||
|
||
/** | ||
* Computes size needed to serialize the items in the sample. | ||
* This version is for all other types and can be expensive since every item needs to be looked at. | ||
* @param sd instance of a SerDe | ||
* @return size in bytes needed to serialize the items in this sample | ||
*/ | ||
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0> | ||
inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const; | ||
|
||
// This is a convenience alias for users | ||
// The type returned by the following serialize method | ||
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>; | ||
|
||
/** | ||
* This method serializes the sample as a vector of bytes. | ||
* @param ptr pointer to start of pre-allocated memory to use | ||
* @param end_ptr pointer to end of pre-allocated memory to use | ||
* @param sd instance of a SerDe | ||
* @return number of bytes written | ||
*/ | ||
template<typename SerDe = serde<T>> | ||
size_t serialize(uint8_t* ptr, const uint8_t* end_ptr, const SerDe& sd = SerDe()) const; | ||
|
||
/** | ||
* This method serializes the sample into a given stream in a binary form | ||
* @param os output stream | ||
* @param sd instance of a SerDe | ||
*/ | ||
template<typename SerDe = serde<T>> | ||
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; | ||
|
||
/** | ||
* This method deserializes a sample from a given array of bytes. | ||
* @param ptr pointer to the array of bytes | ||
* @param size the size of the array | ||
* @param sd instance of a SerDe | ||
* @param allocator instance of an allocator | ||
* @return a std::pair with a sample and the number of bytes read | ||
*/ | ||
template<typename SerDe = serde<T>> | ||
static std::pair<ebpps_sample, size_t> deserialize(const uint8_t* ptr, size_t size, const SerDe& sd = SerDe(), const A& allocator = A()); | ||
|
||
/** | ||
* This method deserializes a sample from a given stream. | ||
* @param is input stream | ||
* @param sd instance of a SerDe | ||
* @param allocator instance of an allocator | ||
* @return an instance of a sample | ||
*/ | ||
template<typename SerDe = serde<T>> | ||
static ebpps_sample deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A()); | ||
|
||
class const_iterator; | ||
|
||
/** | ||
* Iterator pointing to the first item in the sample. | ||
* If the sample is empty, the returned iterator must not be dereferenced or incremented | ||
* @param force_partial forces the inclusion of the partial item, if one exists | ||
* @return iterator pointing to the first item in the sample | ||
*/ | ||
const_iterator begin() const; | ||
|
||
/** | ||
* Iterator pointing to the past-the-end item in the sample. | ||
* The past-the-end item is the hypothetical item that would follow the last item. | ||
* It does not point to any item, and must not be dereferenced or incremented. | ||
* @return iterator pointing to the past-the-end item in the sample | ||
*/ | ||
const_iterator end() const; | ||
|
||
private: | ||
A allocator_; | ||
double c_; // Current sample size, including fractional part | ||
optional<T> partial_item_; // a sample item corresponding to a partial weight | ||
std::vector<T, A> data_; // stored sampled items | ||
|
||
template<typename FwdItem> | ||
inline void set_partial(FwdItem&& item); | ||
void swap_with_partial(); | ||
void move_one_to_partial(); | ||
void subsample(uint32_t num_samples); | ||
|
||
static inline uint32_t random_idx(uint32_t max); | ||
static inline double next_double(); | ||
|
||
friend class const_iterator; | ||
}; | ||
|
||
template<typename T, typename A> | ||
class ebpps_sample<T, A>::const_iterator { | ||
public: | ||
using iterator_category = std::input_iterator_tag; | ||
using value_type = const T&; | ||
using difference_type = void; | ||
using pointer = const return_value_holder<value_type>; | ||
using reference = value_type; | ||
|
||
const_iterator(const const_iterator& other); | ||
const_iterator& operator++(); | ||
const_iterator& operator++(int); | ||
bool operator==(const const_iterator& other) const; | ||
bool operator!=(const const_iterator& other) const; | ||
reference operator*() const; | ||
pointer operator->() const; | ||
|
||
private: | ||
static const size_t PARTIAL_IDX = static_cast<size_t>(-1); | ||
|
||
// default iterator over sample | ||
const_iterator(const ebpps_sample<T, A>* sample); | ||
|
||
const ebpps_sample<T, A>* sample_; | ||
size_t idx_; | ||
bool use_partial_; | ||
|
||
friend class ebpps_sample; | ||
}; | ||
|
||
} // namespace datasketches | ||
|
||
#include "ebpps_sample_impl.hpp" | ||
|
||
#endif // _EBPPS_SAMPLE_HPP_ |
Oops, something went wrong.