diff --git a/.github/workflows/ocs-operator-ci.yaml b/.github/workflows/ocs-operator-ci.yaml index f98e348cdd..cfc5d55afd 100644 --- a/.github/workflows/ocs-operator-ci.yaml +++ b/.github/workflows/ocs-operator-ci.yaml @@ -1,6 +1,9 @@ --- name: ocs-operator sanity checks +env: + CGO_CFLAGS: "-I/home/runner/work/ocs-operator/ocs-operator/shared/" + on: push: branches: ["*"] @@ -103,7 +106,7 @@ jobs: with: check_filenames: true check_hidden: true - skip: vendor,go.sum,api/go.sum,go.work.sum + skip: vendor,go.sum,api/go.sum,go.work.sum,shared ignore_words_list: xdescribe,contails,shouldnot commitlint: diff --git a/shared/rados/buffer.h b/shared/rados/buffer.h new file mode 100644 index 0000000000..10dceaec29 --- /dev/null +++ b/shared/rados/buffer.h @@ -0,0 +1,1294 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_BUFFER_H +#define CEPH_BUFFER_H + +#if defined(__linux__) || defined(__FreeBSD__) +#include +#endif +#include + +#ifndef _XOPEN_SOURCE +# define _XOPEN_SOURCE 600 +#endif + +#include +#include + +#if defined(__linux__) // For malloc(2). 
+#include +#endif + +#include +#include +#include + +#if !defined(__CYGWIN__) && !defined(_WIN32) +# include +#endif + +#include +#include +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif // __cplusplus >= 201703L + +#include +#include + +#include "page.h" +#include "crc32c.h" +#include "buffer_fwd.h" + + +#ifdef __CEPH__ +# include "include/ceph_assert.h" +#else +# include +#endif + +#include "inline_memory.h" + +#define CEPH_BUFFER_API + +#ifdef HAVE_SEASTAR +namespace seastar { +template class temporary_buffer; +namespace net { +class packet; +} +} +#endif // HAVE_SEASTAR +class deleter; + +template class DencDumper; + +namespace ceph { + +template +struct nop_delete { + void operator()(T*) {} +}; + +// This is not unique_ptr-like smart pointer! It just signalizes ownership +// but DOES NOT manage the resource. It WILL LEAK if not manually deleted. +// It's rather a replacement for raw pointer than any other smart one. +// +// Considered options: +// * unique_ptr with custom deleter implemented in .cc (would provide +// the non-zero-cost resource management), +// * GSL's owner (pretty neat but would impose an extra depedency), +// * unique_ptr with nop deleter, +// * raw pointer (doesn't embed ownership enforcement - std::move). +template +struct unique_leakable_ptr : public std::unique_ptr> { + using std::unique_ptr>::unique_ptr; +}; + +namespace buffer CEPH_BUFFER_API { +inline namespace v15_2_0 { + +/// Actual definitions in common/error_code.h +struct error; +struct bad_alloc; +struct end_of_buffer; +struct malformed_input; +struct error_code; + + /// count of cached crc hits (matching input) + int get_cached_crc(); + /// count of cached crc hits (mismatching input, required adjustment) + int get_cached_crc_adjusted(); + /// count of crc cache misses + int get_missed_crc(); + /// enable/disable tracking of cached crcs + void track_cached_crc(bool b); + + /* + * an abstract raw buffer. with a reference count. 
+ */ + class raw; + class raw_malloc; + class raw_static; + class raw_posix_aligned; + class raw_hack_aligned; + class raw_claimed_char; + class raw_unshareable; // diagnostic, unshareable char buffer + class raw_combined; + class raw_claim_buffer; + + + /* + * named constructors + */ + ceph::unique_leakable_ptr copy(const char *c, unsigned len); + ceph::unique_leakable_ptr create(unsigned len); + ceph::unique_leakable_ptr create(unsigned len, char c); + ceph::unique_leakable_ptr create_in_mempool(unsigned len, int mempool); + ceph::unique_leakable_ptr claim_char(unsigned len, char *buf); + ceph::unique_leakable_ptr create_malloc(unsigned len); + ceph::unique_leakable_ptr claim_malloc(unsigned len, char *buf); + ceph::unique_leakable_ptr create_static(unsigned len, char *buf); + ceph::unique_leakable_ptr create_aligned(unsigned len, unsigned align); + ceph::unique_leakable_ptr create_aligned_in_mempool(unsigned len, unsigned align, int mempool); + ceph::unique_leakable_ptr create_page_aligned(unsigned len); + ceph::unique_leakable_ptr create_small_page_aligned(unsigned len); + ceph::unique_leakable_ptr claim_buffer(unsigned len, char *buf, deleter del); + +#ifdef HAVE_SEASTAR + /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to + /// make it safe to share between cpus + ceph::unique_leakable_ptr create(seastar::temporary_buffer&& buf); + /// create a raw buffer to wrap seastar cpu-local memory, without the safety + /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is + /// destructed on this cpu + ceph::unique_leakable_ptr create_local(seastar::temporary_buffer&& buf); +#endif + + /* + * a buffer pointer. references (a subsequence of) a raw buffer. 
+ */ + class CEPH_BUFFER_API ptr { + friend class list; + protected: + raw *_raw; + unsigned _off, _len; + private: + + void release(); + + template + class iterator_impl { + const ptr *bp; ///< parent ptr + const char *start; ///< starting pointer into bp->c_str() + const char *pos; ///< pointer into bp->c_str() + const char *end_ptr; ///< pointer to bp->end_c_str() + const bool deep; ///< if true, do not allow shallow ptr copies + + iterator_impl(typename std::conditional::type p, + size_t offset, bool d) + : bp(p), + start(p->c_str() + offset), + pos(start), + end_ptr(p->end_c_str()), + deep(d) + {} + + friend class ptr; + + public: + using pointer = typename std::conditional::type; + pointer get_pos_add(size_t n) { + auto r = pos; + *this += n; + return r; + } + ptr get_ptr(size_t len) { + if (deep) { + return buffer::copy(get_pos_add(len), len); + } else { + size_t off = pos - bp->c_str(); + *this += len; + return ptr(*bp, off, len); + } + } + + iterator_impl& operator+=(size_t len); + + const char *get_pos() { + return pos; + } + const char *get_end() { + return end_ptr; + } + + size_t get_offset() { + return pos - start; + } + + bool end() const { + return pos == end_ptr; + } + }; + + public: + using const_iterator = iterator_impl; + using iterator = iterator_impl; + + ptr() : _raw(nullptr), _off(0), _len(0) {} + ptr(ceph::unique_leakable_ptr r); + // cppcheck-suppress noExplicitConstructor + ptr(unsigned l); + ptr(const char *d, unsigned l); + ptr(const ptr& p); + ptr(ptr&& p) noexcept; + ptr(const ptr& p, unsigned o, unsigned l); + ptr(const ptr& p, ceph::unique_leakable_ptr r); + ptr& operator= (const ptr& p); + ptr& operator= (ptr&& p) noexcept; + ~ptr() { + // BE CAREFUL: this destructor is called also for hypercombined ptr_node. + // After freeing underlying raw, `*this` can become inaccessible as well! + release(); + } + + bool have_raw() const { return _raw ? 
true:false; } + + void swap(ptr& other) noexcept; + + iterator begin(size_t offset=0) { + return iterator(this, offset, false); + } + const_iterator begin(size_t offset=0) const { + return const_iterator(this, offset, false); + } + const_iterator cbegin() const { + return begin(); + } + const_iterator begin_deep(size_t offset=0) const { + return const_iterator(this, offset, true); + } + + // misc + bool is_aligned(unsigned align) const { + return ((uintptr_t)c_str() & (align-1)) == 0; + } + bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); } + bool is_n_align_sized(unsigned align) const + { + return (length() % align) == 0; + } + bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); } + bool is_partial() const { + return have_raw() && (start() > 0 || end() < raw_length()); + } + + int get_mempool() const; + void reassign_to_mempool(int pool); + void try_assign_to_mempool(int pool); + + // accessors + const char *c_str() const; + char *c_str(); + const char *end_c_str() const; + char *end_c_str(); + unsigned length() const { return _len; } + unsigned offset() const { return _off; } + unsigned start() const { return _off; } + unsigned end() const { return _off + _len; } + unsigned unused_tail_length() const; + const char& operator[](unsigned n) const; + char& operator[](unsigned n); + + const char *raw_c_str() const; + unsigned raw_length() const; + int raw_nref() const; + + void copy_out(unsigned o, unsigned l, char *dest) const; + + unsigned wasted() const; + + int cmp(const ptr& o) const; + bool is_zero() const; + + // modifiers + void set_offset(unsigned o) { +#ifdef __CEPH__ + ceph_assert(raw_length() >= o); +#else + assert(raw_length() >= o); +#endif + _off = o; + } + void set_length(unsigned l) { +#ifdef __CEPH__ + ceph_assert(raw_length() >= l); +#else + assert(raw_length() >= l); +#endif + _len = l; + } + + unsigned append(char c); + unsigned append(const char *p, unsigned l); +#if __cplusplus >= 201703L + inline unsigned 
append(std::string_view s) { + return append(s.data(), s.length()); + } +#endif // __cplusplus >= 201703L + void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true); + void zero(bool crc_reset = true); + void zero(unsigned o, unsigned l, bool crc_reset = true); + unsigned append_zeros(unsigned l); + +#ifdef HAVE_SEASTAR + /// create a temporary_buffer, copying the ptr as its deleter + operator seastar::temporary_buffer() &; + /// convert to temporary_buffer, stealing the ptr as its deleter + operator seastar::temporary_buffer() &&; +#endif // HAVE_SEASTAR + + }; + + + struct ptr_hook { + mutable ptr_hook* next; + + ptr_hook() = default; + ptr_hook(ptr_hook* const next) + : next(next) { + } + }; + + class ptr_node : public ptr_hook, public ptr { + public: + struct cloner { + ptr_node* operator()(const ptr_node& clone_this); + }; + struct disposer { + void operator()(ptr_node* const delete_this) { + if (!__builtin_expect(dispose_if_hypercombined(delete_this), 0)) { + delete delete_this; + } + } + }; + + ~ptr_node() = default; + + static std::unique_ptr + create(ceph::unique_leakable_ptr r) { + return create_hypercombined(std::move(r)); + } + static std::unique_ptr + create(const unsigned l) { + return create_hypercombined(buffer::create(l)); + } + template + static std::unique_ptr + create(Args&&... args) { + return std::unique_ptr( + new ptr_node(std::forward(args)...)); + } + + static ptr_node* copy_hypercombined(const ptr_node& copy_this); + + private: + friend list; + + template + ptr_node(Args&&... args) : ptr(std::forward(args)...) 
{ + } + ptr_node(const ptr_node&) = default; + + ptr& operator= (const ptr& p) = delete; + ptr& operator= (ptr&& p) noexcept = delete; + ptr_node& operator= (const ptr_node& p) = delete; + ptr_node& operator= (ptr_node&& p) noexcept = delete; + void swap(ptr& other) noexcept = delete; + void swap(ptr_node& other) noexcept = delete; + + static bool dispose_if_hypercombined(ptr_node* delete_this); + static std::unique_ptr create_hypercombined( + ceph::unique_leakable_ptr r); + }; + /* + * list - the useful bit! + */ + + class CEPH_BUFFER_API list { + public: + // this the very low-level implementation of singly linked list + // ceph::buffer::list is built on. We don't use intrusive slist + // of Boost (or any other 3rd party) to save extra dependencies + // in our public headers. + class buffers_t { + // _root.next can be thought as _head + ptr_hook _root; + ptr_hook* _tail; + + public: + template + class buffers_iterator { + typename std::conditional< + std::is_const::value, const ptr_hook*, ptr_hook*>::type cur; + template friend class buffers_iterator; + public: + using value_type = T; + using reference = typename std::add_lvalue_reference::type; + using pointer = typename std::add_pointer::type; + using difference_type = std::ptrdiff_t; + using iterator_category = std::forward_iterator_tag; + + template + buffers_iterator(U* const p) + : cur(p) { + } + // copy constructor + buffers_iterator(const buffers_iterator& other) + : cur(other.cur) { + } + // converting constructor, from iterator -> const_iterator only + template ::value && !std::is_const::value, int>::type = 0> + buffers_iterator(const buffers_iterator& other) + : cur(other.cur) { + } + buffers_iterator() = default; + + T& operator*() const { + return *reinterpret_cast(cur); + } + T* operator->() const { + return reinterpret_cast(cur); + } + + buffers_iterator& operator++() { + cur = cur->next; + return *this; + } + buffers_iterator operator++(int) { + const auto temp(*this); + ++*this; + return temp; + 
} + + template + buffers_iterator& operator=(buffers_iterator& other) { + cur = other.cur; + return *this; + } + + bool operator==(const buffers_iterator& rhs) const { + return cur == rhs.cur; + } + bool operator!=(const buffers_iterator& rhs) const { + return !(*this==rhs); + } + }; + + typedef buffers_iterator const_iterator; + typedef buffers_iterator iterator; + + typedef const ptr_node& const_reference; + typedef ptr_node& reference; + + buffers_t() + : _root(&_root), + _tail(&_root) { + } + buffers_t(const buffers_t&) = delete; + buffers_t(buffers_t&& other) + : _root(other._root.next == &other._root ? &_root : other._root.next), + _tail(other._tail == &other._root ? &_root : other._tail) { + other._root.next = &other._root; + other._tail = &other._root; + + _tail->next = &_root; + } + buffers_t& operator=(buffers_t&& other) { + if (&other != this) { + clear_and_dispose(); + swap(other); + } + return *this; + } + + void push_back(reference item) { + item.next = &_root; + // this updates _root.next when called on empty + _tail->next = &item; + _tail = &item; + } + + void push_front(reference item) { + item.next = _root.next; + _root.next = &item; + _tail = _tail == &_root ? &item : _tail; + } + + // *_after + iterator erase_after(const_iterator it) { + const auto* to_erase = it->next; + + it->next = to_erase->next; + _root.next = _root.next == to_erase ? to_erase->next : _root.next; + _tail = _tail == to_erase ? (ptr_hook*)&*it : _tail; + return it->next; + } + + void insert_after(const_iterator it, reference item) { + item.next = it->next; + it->next = &item; + _root.next = it == end() ? &item : _root.next; + _tail = const_iterator(_tail) == it ? 
&item : _tail; + } + + void splice_back(buffers_t& other) { + if (other.empty()) { + return; + } + + other._tail->next = &_root; + // will update root.next if empty() == true + _tail->next = other._root.next; + _tail = other._tail; + + other._root.next = &other._root; + other._tail = &other._root; + } + + bool empty() const { return _tail == &_root; } + + const_iterator begin() const { + return _root.next; + } + const_iterator before_begin() const { + return &_root; + } + const_iterator end() const { + return &_root; + } + iterator begin() { + return _root.next; + } + iterator before_begin() { + return &_root; + } + iterator end() { + return &_root; + } + + reference front() { + return reinterpret_cast(*_root.next); + } + reference back() { + return reinterpret_cast(*_tail); + } + const_reference front() const { + return reinterpret_cast(*_root.next); + } + const_reference back() const { + return reinterpret_cast(*_tail); + } + + void clone_from(const buffers_t& other) { + clear_and_dispose(); + for (auto& node : other) { + ptr_node* clone = ptr_node::cloner()(node); + push_back(*clone); + } + } + void clear_and_dispose() { + ptr_node::disposer dispose; + for (auto it = begin(), e = end(); it != e; /* nop */) { + auto& node = *it++; + dispose(&node); + } + _tail = &_root; + _root.next = _tail; + } + iterator erase_after_and_dispose(iterator it) { + auto* to_dispose = &*std::next(it); + auto ret = erase_after(it); + ptr_node::disposer()(to_dispose); + return ret; + } + + void swap(buffers_t& other) { + const auto copy_root = _root; + _root.next = \ + other._root.next == &other._root ? &this->_root : other._root.next; + other._root.next = \ + copy_root.next == &_root ? &other._root : copy_root.next; + + const auto copy_tail = _tail; + _tail = other._tail == &other._root ? &this->_root : other._tail; + other._tail = copy_tail == &_root ? 
&other._root : copy_tail; + + _tail->next = &_root; + other._tail->next = &other._root; + } + }; + + class iterator; + + private: + // my private bits + buffers_t _buffers; + + // track bufferptr we can modify (especially ::append() to). Not all bptrs + // bufferlist holds have this trait -- if somebody ::push_back(const ptr&), + // he expects it won't change. + ptr_node* _carriage; + unsigned _len, _num; + + template + class CEPH_BUFFER_API iterator_impl { + protected: + typedef typename std::conditional::type bl_t; + typedef typename std::conditional::type list_t; + typedef typename std::conditional::type list_iter_t; + bl_t* bl; + list_t* ls; // meh.. just here to avoid an extra pointer dereference.. + list_iter_t p; + unsigned off; // in bl + unsigned p_off; // in *p + friend class iterator_impl; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = typename std::conditional::type; + using difference_type = std::ptrdiff_t; + using pointer = typename std::add_pointer::type; + using reference = typename std::add_lvalue_reference::type; + + // constructor. position. 
+ iterator_impl() + : bl(0), ls(0), off(0), p_off(0) {} + iterator_impl(bl_t *l, unsigned o=0); + iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po) + : bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {} + iterator_impl(const list::iterator& i); + + /// get current iterator offset in buffer::list + unsigned get_off() const { return off; } + + /// get number of bytes remaining from iterator position to the end of the buffer::list + unsigned get_remaining() const { return bl->length() - off; } + + /// true if iterator is at the end of the buffer::list + bool end() const { + return p == ls->end(); + //return off == bl->length(); + } + void seek(unsigned o); + char operator*() const; + iterator_impl& operator+=(unsigned o); + iterator_impl& operator++(); + ptr get_current_ptr() const; + bool is_pointing_same_raw(const ptr& other) const; + + bl_t& get_bl() const { return *bl; } + + // copy data out. + // note that these all _append_ to dest! + void copy(unsigned len, char *dest); + // deprecated, use copy_deep() + void copy(unsigned len, ptr &dest) __attribute__((deprecated)); + void copy_deep(unsigned len, ptr &dest); + void copy_shallow(unsigned len, ptr &dest); + void copy(unsigned len, list &dest); + void copy(unsigned len, std::string &dest); + void copy_all(list &dest); + + // get a pointer to the currenet iterator position, return the + // number of bytes we can read from that position (up to want), + // and advance the iterator by that amount. 
+ size_t get_ptr_and_advance(size_t want, const char **p); + + /// calculate crc from iterator position + uint32_t crc32c(size_t length, uint32_t crc); + + friend bool operator==(const iterator_impl& lhs, + const iterator_impl& rhs) { + return &lhs.get_bl() == &rhs.get_bl() && lhs.get_off() == rhs.get_off(); + } + friend bool operator!=(const iterator_impl& lhs, + const iterator_impl& rhs) { + return &lhs.get_bl() != &rhs.get_bl() || lhs.get_off() != rhs.get_off(); + } + }; + + public: + typedef iterator_impl const_iterator; + + class CEPH_BUFFER_API iterator : public iterator_impl { + public: + iterator() = default; + iterator(bl_t *l, unsigned o=0); + iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po); + // copy data in + void copy_in(unsigned len, const char *src, bool crc_reset = true); + void copy_in(unsigned len, const list& otherl); + }; + + struct reserve_t { + char* bp_data; + unsigned* bp_len; + unsigned* bl_len; + }; + + class contiguous_appender { + ceph::bufferlist& bl; + ceph::bufferlist::reserve_t space; + char* pos; + bool deep; + + /// running count of bytes appended that are not reflected by @pos + size_t out_of_band_offset = 0; + + contiguous_appender(bufferlist& bl, size_t len, bool d) + : bl(bl), + space(bl.obtain_contiguous_space(len)), + pos(space.bp_data), + deep(d) { + } + + void flush_and_continue() { + const size_t l = pos - space.bp_data; + *space.bp_len += l; + *space.bl_len += l; + space.bp_data = pos; + } + + friend class list; + template friend class ::DencDumper; + + public: + ~contiguous_appender() { + flush_and_continue(); + } + + size_t get_out_of_band_offset() const { + return out_of_band_offset; + } + void append(const char* __restrict__ p, size_t l) { + maybe_inline_memcpy(pos, p, l, 16); + pos += l; + } + char *get_pos_add(size_t len) { + char *r = pos; + pos += len; + return r; + } + char *get_pos() const { + return pos; + } + + void append(const bufferptr& p) { + const auto plen = p.length(); + if (!plen) { + 
return; + } + if (deep) { + append(p.c_str(), plen); + } else { + flush_and_continue(); + bl.append(p); + space = bl.obtain_contiguous_space(0); + out_of_band_offset += plen; + } + } + void append(const bufferlist& l) { + if (deep) { + for (const auto &p : l._buffers) { + append(p.c_str(), p.length()); + } + } else { + flush_and_continue(); + bl.append(l); + space = bl.obtain_contiguous_space(0); + out_of_band_offset += l.length(); + } + } + + size_t get_logical_offset() const { + return out_of_band_offset + (pos - space.bp_data); + } + }; + + contiguous_appender get_contiguous_appender(size_t len, bool deep=false) { + return contiguous_appender(*this, len, deep); + } + + class contiguous_filler { + friend buffer::list; + char* pos; + + contiguous_filler(char* const pos) : pos(pos) {} + + public: + void advance(const unsigned len) { + pos += len; + } + void copy_in(const unsigned len, const char* const src) { + memcpy(pos, src, len); + advance(len); + } + char* c_str() { + return pos; + } + }; + // The contiguous_filler is supposed to be not costlier than a single + // pointer. Keep it dumb, please. + static_assert(sizeof(contiguous_filler) == sizeof(char*), + "contiguous_filler should be no costlier than pointer"); + + class page_aligned_appender { + bufferlist& bl; + unsigned min_alloc; + + page_aligned_appender(list *l, unsigned min_pages) + : bl(*l), + min_alloc(min_pages * CEPH_PAGE_SIZE) { + } + + void _refill(size_t len); + + template + void _append_common(size_t len, Func&& impl_f) { + const auto free_in_last = bl.get_append_buffer_unused_tail_length(); + const auto first_round = std::min(len, free_in_last); + if (first_round) { + impl_f(first_round); + } + // no C++17 for the sake of the C++11 guarantees of librados, sorry. 
+ const auto second_round = len - first_round; + if (second_round) { + _refill(second_round); + impl_f(second_round); + } + } + + friend class list; + + public: + void append(const bufferlist& l) { + bl.append(l); + bl.obtain_contiguous_space(0); + } + + void append(const char* buf, size_t entire_len) { + _append_common(entire_len, + [buf, this] (const size_t chunk_len) mutable { + bl.append(buf, chunk_len); + buf += chunk_len; + }); + } + + void append_zero(size_t entire_len) { + _append_common(entire_len, [this] (const size_t chunk_len) { + bl.append_zero(chunk_len); + }); + } + + void substr_of(const list& bl, unsigned off, unsigned len) { + for (const auto& bptr : bl.buffers()) { + if (off >= bptr.length()) { + off -= bptr.length(); + continue; + } + const auto round_size = std::min(bptr.length() - off, len); + append(bptr.c_str() + off, round_size); + len -= round_size; + off = 0; + } + } + }; + + page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) { + return page_aligned_appender(this, min_pages); + } + + private: + // always_empty_bptr has no underlying raw but its _len is always 0. + // This is useful for e.g. get_append_buffer_unused_tail_length() as + // it allows to avoid conditionals on hot paths. + static ptr_node always_empty_bptr; + ptr_node& refill_append_space(const unsigned len); + + // for page_aligned_appender; never ever expose this publicly! + // carriage / append_buffer is just an implementation's detail. 
+ ptr& get_append_buffer() { + return *_carriage; + } + + public: + // cons/des + list() + : _carriage(&always_empty_bptr), + _len(0), + _num(0) { + } + // cppcheck-suppress noExplicitConstructor + // cppcheck-suppress noExplicitConstructor + list(unsigned prealloc) + : _carriage(&always_empty_bptr), + _len(0), + _num(0) { + reserve(prealloc); + } + + list(const list& other) + : _carriage(&always_empty_bptr), + _len(other._len), + _num(other._num) { + _buffers.clone_from(other._buffers); + } + + list(list&& other) noexcept + : _buffers(std::move(other._buffers)), + _carriage(other._carriage), + _len(other._len), + _num(other._num) { + other.clear(); + } + + ~list() { + _buffers.clear_and_dispose(); + } + + list& operator= (const list& other) { + if (this != &other) { + _carriage = &always_empty_bptr; + _buffers.clone_from(other._buffers); + _len = other._len; + _num = other._num; + } + return *this; + } + list& operator= (list&& other) noexcept { + _buffers = std::move(other._buffers); + _carriage = other._carriage; + _len = other._len; + _num = other._num; + other.clear(); + return *this; + } + + uint64_t get_wasted_space() const; + unsigned get_num_buffers() const { return _num; } + const ptr_node& front() const { return _buffers.front(); } + const ptr_node& back() const { return _buffers.back(); } + + int get_mempool() const; + void reassign_to_mempool(int pool); + void try_assign_to_mempool(int pool); + + size_t get_append_buffer_unused_tail_length() const { + return _carriage->unused_tail_length(); + } + + const buffers_t& buffers() const { return _buffers; } + buffers_t& mut_buffers() { return _buffers; } + void swap(list& other) noexcept; + unsigned length() const { +#if 0 + // DEBUG: verify _len + unsigned len = 0; + for (std::list::const_iterator it = _buffers.begin(); + it != _buffers.end(); + it++) { + len += (*it).length(); + } +#ifdef __CEPH__ + ceph_assert(len == _len); +#else + assert(len == _len); +#endif // __CEPH__ +#endif + return _len; + } + + 
bool contents_equal(const buffer::list& other) const; + bool contents_equal(const void* other, size_t length) const; + + bool is_provided_buffer(const char *dst) const; + bool is_aligned(unsigned align) const; + bool is_page_aligned() const; + bool is_n_align_sized(unsigned align) const; + bool is_n_page_sized() const; + bool is_aligned_size_and_memory(unsigned align_size, + unsigned align_memory) const; + + bool is_zero() const; + + // modifiers + void clear() noexcept { + _carriage = &always_empty_bptr; + _buffers.clear_and_dispose(); + _len = 0; + _num = 0; + } + void push_back(const ptr& bp) { + if (bp.length() == 0) + return; + _buffers.push_back(*ptr_node::create(bp).release()); + _len += bp.length(); + _num += 1; + } + void push_back(ptr&& bp) { + if (bp.length() == 0) + return; + _len += bp.length(); + _num += 1; + _buffers.push_back(*ptr_node::create(std::move(bp)).release()); + _carriage = &always_empty_bptr; + } + void push_back(const ptr_node&) = delete; + void push_back(ptr_node&) = delete; + void push_back(ptr_node&&) = delete; + void push_back(std::unique_ptr bp) { + _carriage = bp.get(); + _len += bp->length(); + _num += 1; + _buffers.push_back(*bp.release()); + } + void push_back(raw* const r) = delete; + void push_back(ceph::unique_leakable_ptr r) { + _buffers.push_back(*ptr_node::create(std::move(r)).release()); + _carriage = &_buffers.back(); + _len += _buffers.back().length(); + _num += 1; + } + + void zero(); + void zero(unsigned o, unsigned l); + + bool is_contiguous() const; + void rebuild(); + void rebuild(std::unique_ptr nb); + bool rebuild_aligned(unsigned align); + // max_buffers = 0 mean don't care _buffers.size(), other + // must make _buffers.size() <= max_buffers after rebuilding. 
+ bool rebuild_aligned_size_and_memory(unsigned align_size, + unsigned align_memory, + unsigned max_buffers = 0); + bool rebuild_page_aligned(); + + void reserve(size_t prealloc); + + [[deprecated("in favor of operator=(list&&)")]] void claim(list& bl) { + *this = std::move(bl); + } + void claim_append(list& bl); + void claim_append(list&& bl) { + claim_append(bl); + } + + // copy with explicit volatile-sharing semantics + void share(const list& bl) + { + if (this != &bl) { + clear(); + for (const auto& bp : bl._buffers) { + _buffers.push_back(*ptr_node::create(bp).release()); + } + _len = bl._len; + _num = bl._num; + } + } + +#ifdef HAVE_SEASTAR + /// convert the bufferlist into a network packet + operator seastar::net::packet() &&; +#endif + + iterator begin(size_t offset=0) { + return iterator(this, offset); + } + iterator end() { + return iterator(this, _len, _buffers.end(), 0); + } + + const_iterator begin(size_t offset=0) const { + return const_iterator(this, offset); + } + const_iterator cbegin(size_t offset=0) const { + return begin(offset); + } + const_iterator end() const { + return const_iterator(this, _len, _buffers.end(), 0); + } + + void append(char c); + void append(const char *data, unsigned len); + void append(std::string s) { + append(s.data(), s.length()); + } +#if __cplusplus >= 201703L + // To forcibly disambiguate between string and string_view in the + // case of arrays + template + void append(const char (&s)[N]) { + append(s, N); + } + void append(const char* s) { + append(s, strlen(s)); + } + void append(std::string_view s) { + append(s.data(), s.length()); + } +#endif // __cplusplus >= 201703L + void append(const ptr& bp); + void append(ptr&& bp); + void append(const ptr& bp, unsigned off, unsigned len); + void append(const list& bl); + /// append each non-empty line from the stream and add '\n', + /// so a '\n' will be added even the stream does not end with EOL. 
+ /// + /// For example, if the stream contains "ABC\n\nDEF", "ABC\nDEF\n" is + /// actually appended. + void append(std::istream& in); + contiguous_filler append_hole(unsigned len); + void append_zero(unsigned len); + void prepend_zero(unsigned len); + + reserve_t obtain_contiguous_space(const unsigned len); + + /* + * get a char + */ + const char& operator[](unsigned n) const; + char *c_str(); + std::string to_str() const; + + void substr_of(const list& other, unsigned off, unsigned len); + + // funky modifer + void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */); + void write(int off, int len, std::ostream& out) const; + + void encode_base64(list& o); + void decode_base64(list& o); + + void write_stream(std::ostream &out) const; + void hexdump(std::ostream &out, bool trailing_newline = true) const; + ssize_t pread_file(const char *fn, uint64_t off, uint64_t len, std::string *error); + int read_file(const char *fn, std::string *error); + ssize_t read_fd(int fd, size_t len); + ssize_t recv_fd(int fd, size_t len); + int write_file(const char *fn, int mode=0644); + int write_fd(int fd) const; + int write_fd(int fd, uint64_t offset) const; + int send_fd(int fd) const; + template + void prepare_iov(VectorT *piov) const { +#ifdef __CEPH__ + ceph_assert(_num <= IOV_MAX); +#else + assert(_num <= IOV_MAX); +#endif + piov->resize(_num); + unsigned n = 0; + for (auto& p : _buffers) { + (*piov)[n].iov_base = (void *)p.c_str(); + (*piov)[n].iov_len = p.length(); + ++n; + } + } + + struct iovec_t { + uint64_t offset; + uint64_t length; + std::vector iov; + }; + using iov_vec_t = std::vector; + iov_vec_t prepare_iovs() const; + + uint32_t crc32c(uint32_t crc) const; + void invalidate_crc(); + + // These functions return a bufferlist with a pointer to a single + // static buffer. They /must/ not outlive the memory they + // reference. 
+ static list static_from_mem(char* c, size_t l); + static list static_from_cstring(char* c); + static list static_from_string(std::string& s); + }; + +} // inline namespace v15_2_0 + + /* + * efficient hash of one or more bufferlists + */ + + class hash { + uint32_t crc; + + public: + hash() : crc(0) { } + // cppcheck-suppress noExplicitConstructor + hash(uint32_t init) : crc(init) { } + + void update(const buffer::list& bl) { + crc = bl.crc32c(crc); + } + + uint32_t digest() { + return crc; + } + }; + +inline bool operator==(const bufferlist &lhs, const bufferlist &rhs) { + if (lhs.length() != rhs.length()) + return false; + return std::equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +inline bool operator<(const bufferlist& lhs, const bufferlist& rhs) { + auto l = lhs.begin(), r = rhs.begin(); + for (; l != lhs.end() && r != rhs.end(); ++l, ++r) { + if (*l < *r) return true; + if (*l > *r) return false; + } + return (l == lhs.end()) && (r != rhs.end()); // lhs.length() < rhs.length() +} + +inline bool operator<=(const bufferlist& lhs, const bufferlist& rhs) { + auto l = lhs.begin(), r = rhs.begin(); + for (; l != lhs.end() && r != rhs.end(); ++l, ++r) { + if (*l < *r) return true; + if (*l > *r) return false; + } + return l == lhs.end(); // lhs.length() <= rhs.length() +} + +inline bool operator!=(const bufferlist &l, const bufferlist &r) { + return !(l == r); +} +inline bool operator>(const bufferlist& lhs, const bufferlist& rhs) { + return rhs < lhs; +} +inline bool operator>=(const bufferlist& lhs, const bufferlist& rhs) { + return rhs <= lhs; +} + +std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); + +std::ostream& operator<<(std::ostream& out, const buffer::raw &r); + +std::ostream& operator<<(std::ostream& out, const buffer::list& bl); + +inline bufferhash& operator<<(bufferhash& l, const bufferlist &r) { + l.update(r); + return l; +} + +} // namespace buffer + +} // namespace ceph + + +#endif diff --git a/shared/rados/buffer_fwd.h 
b/shared/rados/buffer_fwd.h new file mode 100644 index 0000000000..6de7b1a1ff --- /dev/null +++ b/shared/rados/buffer_fwd.h @@ -0,0 +1,19 @@ +#ifndef BUFFER_FWD_H +#define BUFFER_FWD_H + +namespace ceph { + namespace buffer { + inline namespace v15_2_0 { + class ptr; + class list; + } + class hash; + } + + using bufferptr = buffer::ptr; + using bufferlist = buffer::list; + using bufferhash = buffer::hash; +} + +#endif + diff --git a/shared/rados/crc32c.h b/shared/rados/crc32c.h new file mode 100644 index 0000000000..dd4ede666e --- /dev/null +++ b/shared/rados/crc32c.h @@ -0,0 +1,57 @@ +#ifndef CEPH_CRC32C_H +#define CEPH_CRC32C_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length); + +/* + * this is a static global with the chosen crc32c implementation for + * the given architecture. + */ +extern ceph_crc32c_func_t ceph_crc32c_func; + +extern ceph_crc32c_func_t ceph_choose_crc32(void); + +/** + * calculate crc32c for data that is entirely 0 (ZERO) + * + * Note: works the same as ceph_crc32c_func for data == nullptr, + * but faster than the optimized assembly on certain architectures. + * This is faster than intel optimized assembly, but not as fast as + * ppc64le optimized assembly. + * + * @param crc initial value + * @param length length of buffer + */ +uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length); + +/** + * calculate crc32c + * + * Note: if the data pointer is NULL, we calculate a crc value as if + * it were zero-filled. 
+ * + * @param crc initial value + * @param data pointer to data buffer + * @param length length of buffer + */ +static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length) +{ +#ifndef HAVE_POWER8 + if (!data && length > 16) + return ceph_crc32c_zeros(crc, length); +#endif /* HAVE_POWER8 */ + + return ceph_crc32c_func(crc, data, length); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/shared/rados/inline_memory.h b/shared/rados/inline_memory.h new file mode 100644 index 0000000000..48d889763f --- /dev/null +++ b/shared/rados/inline_memory.h @@ -0,0 +1,150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_INLINE_MEMORY_H +#define CEPH_INLINE_MEMORY_H + +#if defined(__GNUC__) + +// optimize for the common case, which is very small copies +static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l, + size_t inline_len) + __attribute__((always_inline)); + +void *maybe_inline_memcpy(void *dest, const void *src, size_t l, + size_t inline_len) +{ + if (l > inline_len) { + return memcpy(dest, src, l); + } + switch (l) { + case 8: + return __builtin_memcpy(dest, src, 8); + case 4: + return __builtin_memcpy(dest, src, 4); + case 3: + return __builtin_memcpy(dest, src, 3); + case 2: + return __builtin_memcpy(dest, src, 2); + case 1: + return __builtin_memcpy(dest, src, 1); + default: + int cursor = 0; + while (l >= sizeof(uint64_t)) { + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint64_t)); + cursor += sizeof(uint64_t); + l -= sizeof(uint64_t); + } + while (l >= sizeof(uint32_t)) { + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint32_t)); + cursor += sizeof(uint32_t); + l -= sizeof(uint32_t); + } + while (l > 0) { + *((char*)dest + cursor) = *((char*)src + cursor); + cursor++; + l--; + } + } + return dest; +} + +#else + +#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l) + +#endif + + +#if defined(__GNUC__) && defined(__x86_64__) + +namespace ceph { +typedef unsigned uint128_t __attribute__ ((mode (TI))); +} +using ceph::uint128_t; + +static inline bool mem_is_zero(const char *data, size_t len) + __attribute__((always_inline)); + +bool mem_is_zero(const char *data, size_t len) +{ + // we do have XMM registers in x86-64, so if we need to check at least + // 16 bytes, make use of them + if (len / sizeof(uint128_t) > 0) { + // align data pointer to 16 bytes, otherwise it'll segfault due to bug + // in (at least some) GCC versions (using MOVAPS instead of MOVUPS). + // check up to 15 first bytes while at it. 
+ while (((unsigned long long)data) & 15) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + --len; + } + + const char* data_start = data; + const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t); + + while (data < max128) { + if (*(uint128_t*)data != 0) { + return false; + } + data += sizeof(uint128_t); + } + len -= (data - data_start); + } + + const char* max = data + len; + const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t); + while (data < max32) { + if (*(uint32_t*)data != 0) { + return false; + } + data += sizeof(uint32_t); + } + while (data < max) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + } + return true; +} + +#else // gcc and x86_64 + +static inline bool mem_is_zero(const char *data, size_t len) { + const char *end = data + len; + const char* end64 = data + (len / sizeof(uint64_t))*sizeof(uint64_t); + + while (data < end64) { + if (*(uint64_t*)data != 0) { + return false; + } + data += sizeof(uint64_t); + } + + while (data < end) { + if (*data != 0) { + return false; + } + ++data; + } + return true; +} + +#endif // !x86_64 + +#endif diff --git a/shared/rados/librados.h b/shared/rados/librados.h new file mode 100644 index 0000000000..858804c3a0 --- /dev/null +++ b/shared/rados/librados.h @@ -0,0 +1,4156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2012 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_LIBRADOS_H +#define CEPH_LIBRADOS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif +#include +#include +#include "rados_types.h" + +#include + +#ifndef CEPH_OSD_TMAP_SET +/* These are also defined in rados.h and objclass.h. Keep them in sync! */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' +#define CEPH_OSD_TMAP_RM 'r' +#endif + +#define LIBRADOS_VER_MAJOR 3 +#define LIBRADOS_VER_MINOR 0 +#define LIBRADOS_VER_EXTRA 0 + +#define LIBRADOS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA) + +#define LIBRADOS_SUPPORTS_WATCH 1 +#define LIBRADOS_SUPPORTS_SERVICES 1 +#define LIBRADOS_SUPPORTS_GETADDRS 1 +#define LIBRADOS_SUPPORTS_APP_METADATA 1 + +/* RADOS lock flags + * They are also defined in cls_lock_types.h. Keep them in sync! + */ +#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0) +#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW +#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1) + +/* + * Constants for rados_write_op_create(). + */ +#define LIBRADOS_CREATE_EXCLUSIVE 1 +#define LIBRADOS_CREATE_IDEMPOTENT 0 + +/* + * Flags that can be set on a per-op basis via + * rados_read_op_set_flags() and rados_write_op_set_flags(). 
+ */ +enum { + // fail a create operation if the object already exists + LIBRADOS_OP_FLAG_EXCL = 0x1, + // allow the transaction to succeed even if the flagged op fails + LIBRADOS_OP_FLAG_FAILOK = 0x2, + // indicate read/write op random + LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4, + // indicate read/write op sequential + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, + // indicate read/write data will be accessed in the near future (by someone) + LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10, + // indicate read/write data will not accessed in the near future (by anyone) + LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20, + // indicate read/write data will not accessed again (by *this* client) + LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40, + // optionally support FUA (force unit access) on write requests + LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80, +}; + +#define CEPH_RADOS_API + +/** + * @name xattr comparison operations + * Operators for comparing xattrs on objects, and aborting the + * rados_read_op or rados_write_op transaction if the comparison + * fails. + * + * @{ + */ +enum { + LIBRADOS_CMPXATTR_OP_EQ = 1, + LIBRADOS_CMPXATTR_OP_NE = 2, + LIBRADOS_CMPXATTR_OP_GT = 3, + LIBRADOS_CMPXATTR_OP_GTE = 4, + LIBRADOS_CMPXATTR_OP_LT = 5, + LIBRADOS_CMPXATTR_OP_LTE = 6 +}; +/** @} */ + +/** + * @name Operation Flags + * Flags for rados_read_op_operate(), rados_write_op_operate(), + * rados_aio_read_op_operate(), and rados_aio_write_op_operate(). + * See librados.hpp for details. + * @{ + */ +enum { + LIBRADOS_OPERATION_NOFLAG = 0, + LIBRADOS_OPERATION_BALANCE_READS = 1, + LIBRADOS_OPERATION_LOCALIZE_READS = 2, + LIBRADOS_OPERATION_ORDER_READS_WRITES = 4, + LIBRADOS_OPERATION_IGNORE_CACHE = 8, + LIBRADOS_OPERATION_SKIPRWLOCKS = 16, + LIBRADOS_OPERATION_IGNORE_OVERLAY = 32, + /* send requests to cluster despite the cluster or pool being marked + full; ops will either succeed (e.g., delete) or return EDQUOT or + ENOSPC. 
*/ + LIBRADOS_OPERATION_FULL_TRY = 64, + /* + * Mainly for delete op + */ + LIBRADOS_OPERATION_FULL_FORCE = 128, + LIBRADOS_OPERATION_IGNORE_REDIRECT = 256, + LIBRADOS_OPERATION_ORDERSNAP = 512, + /* enable/allow >0 return values and payloads on write/update */ + LIBRADOS_OPERATION_RETURNVEC = 1024, +}; +/** @} */ + +/** + * @name Alloc hint flags + * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2() + * indicating future IO patterns. + * @{ + */ +enum { + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8, + LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32, + LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64, + LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128, + LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; +/** @} */ + +typedef enum { + LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0, + LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1, + LIBRADOS_CHECKSUM_TYPE_CRC32C = 2 +} rados_checksum_type_t; + +/* + * snap id contants + */ +#define LIBRADOS_SNAP_HEAD UINT64_C(-2) +#define LIBRADOS_SNAP_DIR UINT64_C(-1) + +/** + * @typedef rados_t + * + * A handle for interacting with a RADOS cluster. It encapsulates all + * RADOS client configuration, including username, key for + * authentication, logging, and debugging. Talking to different clusters + * -- or to the same cluster with different users -- requires + * different cluster handles. + */ +#ifndef VOIDPTR_RADOS_T +#define VOIDPTR_RADOS_T +typedef void *rados_t; +#endif //VOIDPTR_RADOS_T + +/** + * @typedef rados_config_t + * + * A handle for the ceph configuration context for the rados_t cluster + * instance. This can be used to share configuration context/state + * (e.g., logging configuration) between librados instance. + * + * @warning The config context does not have independent reference + * counting. 
As such, a rados_config_t handle retrieved from a given + * rados_t is only valid as long as that rados_t. + */ +typedef void *rados_config_t; + +/** + * @typedef rados_ioctx_t + * + * An io context encapsulates a few settings for all I/O operations + * done on it: + * - pool - set when the io context is created (see rados_ioctx_create()) + * - snapshot context for writes (see + * rados_ioctx_selfmanaged_snap_set_write_ctx()) + * - snapshot id to read from (see rados_ioctx_snap_set_read()) + * - object locator for all single-object operations (see + * rados_ioctx_locator_set_key()) + * - namespace for all single-object operations (see + * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES + * before rados_nobjects_list_open() will list all objects in all + * namespaces. + * + * @warning Changing any of these settings is not thread-safe - + * librados users must synchronize any of these changes on their own, + * or use separate io contexts for each thread + */ +typedef void *rados_ioctx_t; + +/** + * @typedef rados_list_ctx_t + * + * An iterator for listing the objects in a pool. + * Used with rados_nobjects_list_open(), + * rados_nobjects_list_next(), rados_nobjects_list_next2(), and + * rados_nobjects_list_close(). + */ +typedef void *rados_list_ctx_t; + +/** + * @typedef rados_object_list_cursor + * + * The cursor used with rados_enumerate_objects + * and accompanying methods. + */ +typedef void * rados_object_list_cursor; + +/** + * @struct rados_object_list_item + * + * The item populated by rados_object_list in + * the results array. + */ +typedef struct { + + /// oid length + size_t oid_length; + /// name of the object + char *oid; + /// namespace length + size_t nspace_length; + /// the object namespace + char *nspace; + /// locator length + size_t locator_length; + /// object locator + char *locator; +} rados_object_list_item; + +/** + * @typedef rados_snap_t + * The id of a snapshot. 
+ */ +typedef uint64_t rados_snap_t; + +/** + * @typedef rados_xattrs_iter_t + * An iterator for listing extended attrbutes on an object. + * Used with rados_getxattrs(), rados_getxattrs_next(), and + * rados_getxattrs_end(). + */ +typedef void *rados_xattrs_iter_t; + +/** + * @typedef rados_omap_iter_t + * An iterator for listing omap key/value pairs on an object. + * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(), + * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and + * rados_omap_get_end(). + */ +typedef void *rados_omap_iter_t; + +/** + * @struct rados_pool_stat_t + * Usage information for a pool. + */ +struct rados_pool_stat_t { + /// space used in bytes + uint64_t num_bytes; + /// space used in KB + uint64_t num_kb; + /// number of objects in the pool + uint64_t num_objects; + /// number of clones of objects + uint64_t num_object_clones; + /// num_objects * num_replicas + uint64_t num_object_copies; + /// number of objects missing on primary + uint64_t num_objects_missing_on_primary; + /// number of objects found on no OSDs + uint64_t num_objects_unfound; + /// number of objects replicated fewer times than they should be + /// (but found on at least one OSD) + uint64_t num_objects_degraded; + /// number of objects read + uint64_t num_rd; + /// objects read in KB + uint64_t num_rd_kb; + /// number of objects written + uint64_t num_wr; + /// objects written in KB + uint64_t num_wr_kb; + /// bytes originally provided by user + uint64_t num_user_bytes; + /// bytes passed compression + uint64_t compressed_bytes_orig; + /// bytes resulted after compression + uint64_t compressed_bytes; + /// bytes allocated at storage + uint64_t compressed_bytes_alloc; +}; + +/** + * @struct rados_cluster_stat_t + * Cluster-wide usage information + */ +struct rados_cluster_stat_t { + /// total device size + uint64_t kb; + /// total used + uint64_t kb_used; + /// total available/free + uint64_t kb_avail; + /// number of objects + uint64_t 
num_objects; +}; + +/** + * @typedef rados_write_op_t + * + * An object write operation stores a number of operations which can be + * executed atomically. For usage, see: + * - Creation and deletion: rados_create_write_op() rados_release_write_op() + * - Extended attribute manipulation: rados_write_op_cmpxattr() + * rados_write_op_cmpxattr(), rados_write_op_setxattr(), + * rados_write_op_rmxattr() + * - Object map key/value pairs: rados_write_op_omap_set(), + * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(), + * rados_write_op_omap_cmp() + * - Object properties: rados_write_op_assert_exists(), + * rados_write_op_assert_version() + * - Creating objects: rados_write_op_create() + * - IO on objects: rados_write_op_append(), rados_write_op_write(), rados_write_op_zero + * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove, + * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext() + * - Hints: rados_write_op_set_alloc_hint() + * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate() + */ +typedef void *rados_write_op_t; + +/** + * @typedef rados_read_op_t + * + * An object read operation stores a number of operations which can be + * executed atomically. 
For usage, see: + * - Creation and deletion: rados_create_read_op() rados_release_read_op() + * - Extended attribute manipulation: rados_read_op_cmpxattr(), + * rados_read_op_getxattr(), rados_read_op_getxattrs() + * - Object map key/value pairs: rados_read_op_omap_get_vals(), + * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(), + * rados_read_op_omap_cmp() + * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(), + * rados_read_op_assert_version() + * - IO on objects: rados_read_op_read(), rados_read_op_checksum(), + * rados_read_op_cmpext() + * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf() + * - Request properties: rados_read_op_set_flags() + * - Performing the operation: rados_read_op_operate(), + * rados_aio_read_op_operate() + */ +typedef void *rados_read_op_t; + +/** + * @typedef rados_completion_t + * Represents the state of an asynchronous operation - it contains the + * return value once the operation completes, and can be used to block + * until the operation is complete or safe. + */ +typedef void *rados_completion_t; + +/** + * @struct blkin_trace_info + * blkin trace information for Zipkin tracing + */ +struct blkin_trace_info; + +/** + * Get the version of librados. + * + * The version number is major.minor.extra. Note that this is + * unrelated to the Ceph version number. + * + * TODO: define version semantics, i.e.: + * - incrementing major is for backwards-incompatible changes + * - incrementing minor is for backwards-compatible changes + * - incrementing extra is for bug fixes + * + * @param major where to store the major version number + * @param minor where to store the minor version number + * @param extra where to store the extra version number + */ +CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra); + +/** + * @name Setup and Teardown + * These are the first and last functions to that should be called + * when using librados. 
+ * + * @{ + */ + +/** + * Create a handle for communicating with a RADOS cluster. + * + * Ceph environment variables are read when this is called, so if + * $CEPH_ARGS specifies everything you need to connect, no further + * configuration is necessary. + * + * @param cluster where to store the handle + * @param id the user to connect as (i.e. admin, not client.admin) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id); + +/** + * Extended version of rados_create. + * + * Like rados_create, but + * 1) don't assume 'client\.'+id; allow full specification of name + * 2) allow specification of cluster name + * 3) flags for future expansion + */ +CEPH_RADOS_API int rados_create2(rados_t *pcluster, + const char *const clustername, + const char * const name, uint64_t flags); + +/** + * Initialize a cluster handle from an existing configuration. + * + * Share configuration state with another rados_t instance. + * + * @param cluster where to store the handle + * @param cct the existing configuration to use + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create_with_context(rados_t *cluster, + rados_config_t cct); + +/** + * Ping the monitor with ID mon_id, storing the resulting reply in + * buf (if specified) with a maximum size of len. + * + * The result buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param mon_id [in] ID of the monitor to ping + * @param outstr [out] double pointer with the resulting reply + * @param outstrlen [out] pointer with the size of the reply in outstr + */ +CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id, + char **outstr, size_t *outstrlen); + +/** + * Connect to the cluster. 
+ * + * @note BUG: Before calling this, calling a function that communicates with the + * cluster will crash. + * + * @pre The cluster handle is configured with at least a monitor + * address. If cephx is enabled, a client name and secret must also be + * set. + * + * @post If this succeeds, any function in librados may be used + * + * @param cluster The cluster to connect to. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_connect(rados_t cluster); + +/** + * Disconnects from the cluster. + * + * For clean up, this is only necessary after rados_connect() has + * succeeded. + * + * @warning This does not guarantee any asynchronous writes have + * completed. To do that, you must call rados_aio_flush() on all open + * io contexts. + * + * @warning We implicitly call rados_watch_flush() on shutdown. If + * there are watches being used, this should be done explicitly before + * destroying the relevant IoCtx. We do it here as a safety measure. + * + * @post the cluster handle cannot be used again + * + * @param cluster the cluster to shutdown + */ +CEPH_RADOS_API void rados_shutdown(rados_t cluster); + +/** @} init */ + +/** + * @name Configuration + * These functions read and update Ceph configuration for a cluster + * handle. Any configuration changes must be done before connecting to + * the cluster. + * + * Options that librados users might want to set include: + * - mon_host + * - auth_supported + * - key, keyfile, or keyring when using cephx + * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog + * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms + * + * See docs.ceph.com for information about available configuration options` + * + * @{ + */ + +/** + * Configure the cluster handle using a Ceph config file + * + * If path is NULL, the default locations are searched, and the first + * found is used. 
The locations are: + * - $CEPH_CONF (environment variable) + * - /etc/ceph/ceph.conf + * - ~/.ceph/config + * - ceph.conf (in the current working directory) + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param path path to a Ceph configuration file + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path); + +/** + * Configure the cluster handle with command line arguments + * + * argv can contain any common Ceph command line option, including any + * configuration parameter prefixed by '--' and replacing spaces with + * dashes or underscores. For example, the following options are equivalent: + * - --mon-host 10.0.0.1:6789 + * - --mon_host 10.0.0.1:6789 + * - -m 10.0.0.1:6789 + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc, + const char **argv); + + +/** + * Configure the cluster handle with command line arguments, returning + * any remainders. Same rados_conf_parse_argv, except for extra + * remargv argument to hold returns unrecognized arguments. 
+ * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @param remargv char* array for returned unrecognized arguments + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc, + const char **argv, + const char **remargv); +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre rados_connect() has not been called on the cluster handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cluster cluster handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var); + +/** + * Set a configuration option + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param option option to set + * @param value value of the option + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when the option is not a Ceph configuration option + */ +CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option, + const char *value); + +/** + * Get the value of a configuration option + * + * @param cluster configuration to read + * @param option which option to read + * @param buf where to write the configuration value + * @param len the size of buf in bytes + * @returns 0 on success, negative error code on failure + * @returns -ENAMETOOLONG if the buffer is too short to contain the + * requested value + */ +CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option, + 
char *buf, size_t len); + +/** @} config */ + +/** + * Read usage info about the cluster + * + * This tells you total space, space used, space available, and number + * of objects. These are not updated immediately when data is written, + * they are eventually consistent. + * + * @param cluster cluster to query + * @param result where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cluster_stat(rados_t cluster, + struct rados_cluster_stat_t *result); + +/** + * Get the fsid of the cluster as a hexadecimal string. + * + * The fsid is a unique id of an entire Ceph cluster. + * + * @param cluster where to get the fsid + * @param buf where to write the fsid + * @param len the size of buf in bytes (should be 37) + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the buffer is too short to contain the + * fsid + */ +CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len); + +/** + * Get/wait for the most recent osdmap + * + * @param cluster the cluster to shutdown + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster); + +/** + * @name Pools + * + * RADOS pools are separate namespaces for objects. Pools may have + * different crush rules associated with them, so they could have + * differing replication levels or placement strategies. RADOS + * permissions are also tied to pools - users can have different read, + * write, and execute permissions on a per-pool basis. + * + * @{ + */ + +/** + * List pools + * + * Gets a list of pool names as NULL-terminated strings. The pool + * names will be placed in the supplied buffer one after another. + * After the last pool name, there will be two 0 bytes in a row. + * + * If len is too short to fit all the pool name entries we need, we will fill + * as much as we can. + * + * Buf may be null to determine the buffer size needed to list all pools. 
+ * + * @param cluster cluster handle + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len); + +/** + * List inconsistent placement groups of the given pool + * + * Gets a list of inconsistent placement groups as NULL-terminated strings. + * The placement group names will be placed in the supplied buffer one after + * another. After the last name, there will be two 0 types in a row. + * + * If len is too short to fit all the placement group entries we need, we will + * fill as much as we can. + * + * @param cluster cluster handle + * @param pool pool ID + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool, + char *buf, size_t len); + +/** + * Get a configuration handle for a rados cluster handle + * + * This handle is valid only as long as the cluster handle is valid. 
+ * + * @param cluster cluster handle + * @returns config handle for this cluster + */ +CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster); + +/** + * Get a global id for current instance + * + * This id is a unique representation of current connection to the cluster + * + * @param cluster cluster handle + * @returns instance global id + */ +CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster); + +/** + * Gets the minimum compatible OSD version + * + * @param cluster cluster handle + * @param require_osd_release [out] minimum compatible OSD version + * based upon the current features + * @returns 0 on sucess, negative error code on failure + */ +CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster, + int8_t* require_osd_release); + +/** + * Gets the minimum compatible client version + * + * @param cluster cluster handle + * @param min_compat_client [out] minimum compatible client version + * based upon the current features + * @param require_min_compat_client [out] required minimum client version + * based upon explicit setting + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster, + int8_t* min_compat_client, + int8_t* require_min_compat_client); + +/** + * Create an io context + * + * The io context allows you to perform operations within a particular + * pool. For more details see rados_ioctx_t. + * + * @param cluster which cluster the pool is in + * @param pool_name name of the pool + * @param ioctx where to store the io context + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name, + rados_ioctx_t *ioctx); +CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id, + rados_ioctx_t *ioctx); + +/** + * The opposite of rados_ioctx_create + * + * This just tells librados that you no longer need to use the io context. 
+ * It may not be freed immediately if there are pending asynchronous + * requests on it, but you should not use an io context again after + * calling this function on it. + * + * @warning This does not guarantee any asynchronous + * writes have completed. You must call rados_aio_flush() + * on the io context before destroying it to do that. + * + * @warning If this ioctx is used by rados_watch, the caller needs to + * be sure that all registered watches are disconnected via + * rados_unwatch() and that rados_watch_flush() is called. This + * ensures that a racing watch callback does not make use of a + * destroyed ioctx. + * + * @param io the io context to dispose of + */ +CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io); + +/** + * Get configuration handle for a pool handle + * + * @param io pool handle + * @returns rados_config_t for this cluster + */ +CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io); + +/** + * Get the cluster handle used by this rados_ioctx_t + * Note that this is a weak reference, and should not + * be destroyed via rados_shutdown(). + * + * @param io the io context + * @returns the cluster handle for this io context + */ +CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io); + +/** + * Get pool usage statistics + * + * Fills in a rados_pool_stat_t after querying the cluster. 
+ * + * @param io determines which pool to query + * @param stats where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io, + struct rados_pool_stat_t *stats); + +/** + * Get the id of a pool + * + * @param cluster which cluster the pool is in + * @param pool_name which pool to look up + * @returns id of the pool + * @returns -ENOENT if the pool is not found + */ +CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster, + const char *pool_name); + +/** + * Get the name of a pool + * + * @param cluster which cluster the pool is in + * @param id the id of the pool + * @param buf where to store the pool name + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id, + char *buf, size_t maxlen); + +/** + * Create a pool with default settings + * + * The default crush rule is rule 0. + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name); + +/** + * Create a pool owned by a specific auid. + * + * DEPRECATED: auid support has been removed, and this call will be removed in a future + * release. 
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster,
+                                               const char *pool_name,
+                                               uint64_t auid)
+  __attribute__((deprecated));
+
+/**
+ * Create a pool with a specific CRUSH rule
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster,
+                                                     const char *pool_name,
+                                                     uint8_t crush_rule_num);
+
+/**
+ * Create a pool with a specific CRUSH rule and auid
+ *
+ * DEPRECATED: auid support has been removed and this call will be removed
+ * in a future release.
+ *
+ * This is a combination of rados_pool_create_with_crush_rule() and
+ * rados_pool_create_with_auid().
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster,
+                                              const char *pool_name,
+                                              uint64_t auid,
+                                              uint8_t crush_rule_num)
+  __attribute__((deprecated));
+
+/**
+ * Returns the pool that is the base tier for this pool.
+ *
+ * The return value is the ID of the pool that should be used to read from/write to.
+ * If tiering is not set up for the pool, returns \c pool.
+ * + * @param cluster the cluster the pool is in + * @param pool ID of the pool to query + * @param base_tier [out] base tier, or \c pool if tiering is not configured + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool, + int64_t* base_tier); + +/** + * Delete a pool and all data inside it + * + * The pool is removed from the cluster immediately, + * but the actual data is deleted in the background. + * + * @param cluster the cluster the pool is in + * @param pool_name which pool to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name); + +/** + * Attempt to change an io context's associated auid "owner" + * + * DEPRECATED: auid support has been removed and this call has no effect. + * + * Requires that you have write permission on both the current and new + * auid. + * + * @param io reference to the pool to change. + * @param auid the auid you wish the io to have. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid) + __attribute__((deprecated)); + + +/** + * Get the auid of a pool + * + * DEPRECATED: auid support has been removed and this call always reports + * CEPH_AUTH_UID_DEFAULT (-1). + + * @param io pool to query + * @param auid where to store the auid + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid) + __attribute__((deprecated)); + +/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Test whether the specified pool requires alignment or not. + * + * @param io pool to query + * @param req 1 if alignment is supported, 0 if not. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io, + int *req); + +/* deprecated, use rados_ioctx_pool_required_alignment2 instead */ +CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Get the alignment flavor of a pool + * + * @param io pool to query + * @param alignment where to store the alignment flavor + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io, + uint64_t *alignment); + +/** + * Get the pool id of the io context + * + * @param io the io context to query + * @returns the id of the pool the io context uses + */ +CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io); + +/** + * Get the pool name of the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} pools */ + +/** + * @name Object Locators + * + * @{ + */ + +/** + * Set the key for mapping objects to pgs within an io context. + * + * The key is used instead of the object name to determine which + * placement groups an object is put in. This affects all subsequent + * operations of the io context - until a different locator key is + * set, all objects in this io context will be placed in the same pg. 
+ * + * @param io the io context to change + * @param key the key to use as the object locator, or NULL to discard + * any previously set key + */ +CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io, + const char *key); + +/** + * Set the namespace for objects within an io context + * + * The namespace specification further refines a pool into different + * domains. The mapping of objects to pgs is also based on this + * value. + * + * @param io the io context to change + * @param nspace the name to use as the namespace, or NULL use the + * default namespace + */ +CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io, + const char *nspace); + +/** + * Get the namespace for objects within the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} obj_loc */ + +/** + * @name Listing Objects + * @{ + */ +/** + * Start listing objects in a pool + * + * @param io the pool to list from + * @param ctx the handle to store list context in + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io, + rados_list_ctx_t *ctx); + +/** + * Return hash position of iterator, rounded to the current PG + * + * @param ctx iterator marking where you are in the listing + * @returns current hash position, rounded to the current pg + */ +CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx); + +/** + * Reposition object iterator to a different hash position + * + * @param ctx iterator marking where you are in the listing + * @param pos hash position to move to + * @returns actual (rounded) position we moved to + */ +CEPH_RADOS_API uint32_t 
rados_nobjects_list_seek(rados_list_ctx_t ctx,
+                         uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+                                                        rados_object_list_cursor cursor);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+                                                  rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+                                            const char **entry,
+                                            const char **key,
+                                            const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes allow to list objects with \0 (the NUL character)
+ * in e.g. *entry. It is unusual to see such object names, but a bug
+ * in a client has raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_* + * + * @param ctx iterator marking where you are in the listing + * @param entry where to store the name of the entry + * @param key where to store the object locator (set to NULL to ignore) + * @param nspace where to store the object namespace (set to NULL to ignore) + * @param entry_size where to store the size of name of the entry + * @param key_size where to store the size of object locator (set to NULL to ignore) + * @param nspace_size where to store the size of object namespace (set to NULL to ignore) + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when there are no more objects to list + */ +CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx, + const char **entry, + const char **key, + const char **nspace, + size_t *entry_size, + size_t *key_size, + size_t *nspace_size); + +/** + * Close the object listing handle. + * + * This should be called when the handle is no longer needed. + * The handle should not be used after it has been closed. + * + * @param ctx the handle to close + */ +CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx); + +/** + * Get cursor handle pointing to the *beginning* of a pool. + * + * This is an opaque handle pointing to the start of a pool. It must + * be released with rados_object_list_cursor_free(). + * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin( + rados_ioctx_t io); + +/** + * Get cursor handle pointing to the *end* of a pool. + * + * This is an opaque handle pointing to the start of a pool. It must + * be released with rados_object_list_cursor_free(). 
+ * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io); + +/** + * Check if a cursor has reached the end of a pool + * + * @param io ioctx + * @param cur cursor + * @returns 1 if the cursor has reached the end of the pool, 0 otherwise + */ +CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Release a cursor + * + * Release a cursor. The handle may not be used after this point. + * + * @param io ioctx + * @param cur cursor + */ +CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Compare two cursor positions + * + * Compare two cursors, and indicate whether the first cursor precedes, + * matches, or follows the second. + * + * @param io ioctx + * @param lhs first cursor + * @param rhs second cursor + * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs + */ +CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io, + rados_object_list_cursor lhs, rados_object_list_cursor rhs); + +/** + * @return the number of items set in the results array + */ +CEPH_RADOS_API int rados_object_list(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t result_size, + const char *filter_buf, + const size_t filter_buf_len, + rados_object_list_item *results, + rados_object_list_cursor *next); + +CEPH_RADOS_API void rados_object_list_free( + const size_t result_size, + rados_object_list_item *results); + +/** + * Obtain cursors delineating a subset of a range. Use this + * when you want to split up the work of iterating over the + * global namespace. Expected use case is when you are iterating + * in parallel, with `m` workers, and each worker taking an id `n`. 
+ * + * @param io ioctx + * @param start start of the range to be sliced up (inclusive) + * @param finish end of the range to be sliced up (exclusive) + * @param n which of the m chunks you would like to get cursors for + * @param m how many chunks to divide start-finish into + * @param split_start cursor populated with start of the subrange (inclusive) + * @param split_finish cursor populated with end of the subrange (exclusive) + */ +CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t n, + const size_t m, + rados_object_list_cursor *split_start, + rados_object_list_cursor *split_finish); + + +/** @} Listing Objects */ + +/** + * @name Snapshots + * + * RADOS snapshots are based upon sequence numbers that form a + * snapshot context. They are pool-specific. The snapshot context + * consists of the current snapshot sequence number for a pool, and an + * array of sequence numbers at which snapshots were taken, in + * descending order. Whenever a snapshot is created or deleted, the + * snapshot sequence number for the pool is increased. To add a new + * snapshot, the new snapshot sequence number must be increased and + * added to the snapshot context. + * + * There are two ways to manage these snapshot contexts: + * -# within the RADOS cluster + * These are called pool snapshots, and store the snapshot context + * in the OSDMap. These represent a snapshot of all the objects in + * a pool. + * -# within the RADOS clients + * These are called self-managed snapshots, and push the + * responsibility for keeping track of the snapshot context to the + * clients. For every write, the client must send the snapshot + * context. In librados, this is accomplished with + * rados_selfmanaged_snap_set_write_ctx(). These are more + * difficult to manage, but are restricted to specific objects + * instead of applying to an entire pool. 
+ *
+ * @{
+ */
+
+/**
+ * Create a pool-wide snapshot
+ *
+ * @param io the pool to snapshot
+ * @param snapname the name of the snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io,
+                                           const char *snapname);
+
+/**
+ * Delete a pool snapshot
+ *
+ * @param io the pool to delete the snapshot from
+ * @param snapname which snapshot to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io,
+                                           const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+                                             const char *snapname);
+
+/**
+ * @warning Deprecated: Use rados_ioctx_snap_rollback() instead
+ */
+CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid,
+                                  const char *snapname)
+  __attribute__((deprecated));
+
+/**
+ * Set the snapshot from which reads are performed.
+ *
+ * Subsequent reads will return data as it was at the time of that
+ * snapshot.
+ *
+ * @param io the io context to change
+ * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no
+ *   snapshot (i.e. normal operation)
+ */
+CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io,
+                                              rados_snap_t snap);
+
+/**
+ * Allocate an ID for a self-managed snapshot
+
+ * Get a unique ID to put in the snapshot context to create a
+ * snapshot. A clone of an object is not created until a write with
+ * the new snapshot context is completed.
+ * + * @param io the pool in which the snapshot will exist + * @param snapid where to store the newly allocated snapshot ID + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io, + rados_snap_t *snapid); +CEPH_RADOS_API void +rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io, + rados_snap_t *snapid, + rados_completion_t completion); + +/** + * Remove a self-managed snapshot + * + * This increases the snapshot sequence number, which will cause + * snapshots to be removed lazily. + * + * @param io the pool in which the snapshot will exist + * @param snapid where to store the newly allocated snapshot ID + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io, + rados_snap_t snapid); +CEPH_RADOS_API void +rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io, + rados_snap_t snapid, + rados_completion_t completion); + +/** + * Rollback an object to a self-managed snapshot + * + * The contents of the object will be the same as + * when the snapshot was taken. + * + * @param io the pool in which the object is stored + * @param oid the name of the object to rollback + * @param snapid which snapshot to rollback to + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io, + const char *oid, + rados_snap_t snapid); + +/** + * Set the snapshot context for use when writing to objects + * + * This is stored in the io context, and applies to all future writes. 
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+                                                              rados_snap_t seq,
+                                                              rados_snap_t *snaps,
+                                                              int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE is returned if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+                                         int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+                                           rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+                                             char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t
where to store the result + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, + time_t *t); + +/** @} Snapshots */ + +/** + * @name Synchronous I/O + * Writes are replicated to a number of OSDs based on the + * configuration of the pool they are in. These write functions block + * until data is in memory on all replicas of the object they're + * writing to - they are equivalent to doing the corresponding + * asynchronous write, and the calling + * rados_ioctx_wait_for_complete(). For greater data safety, use the + * asynchronous functions and rados_aio_wait_for_safe(). + * + * @{ + */ + +/** + * Return the version of the last object read or written to. + * + * This exposes the internal version number of the last object read or + * written via this io context + * + * @param io the io context to check + * @returns last read or written object version + */ +CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io); + +/** + * Write *len* bytes from *buf* into the *oid* object, starting at + * offset *off*. The value of *len* must be <= UINT_MAX/2. + * + * @note This will never return a positive value not equal to len. + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid, + const char *buf, size_t len, uint64_t off); + +/** + * Write *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Write the same *data_len* bytes from *buf* multiple times into the + * *oid* object. *write_len* bytes are written in total, which must be + * a multiple of *data_len*. The value of *write_len* and *data_len* + * must be <= UINT_MAX/2. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Append *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * @param io the context to operate in + * @param oid the name of the object + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns number of bytes read on success, negative error code on + * failure + */ +CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf, + size_t len, uint64_t off); + +/** + * Compute checksum from object data + * + * The io context determines the snapshot to checksum, if any was set + * by rados_ioctx_snap_set_read(). The length of the init_value and + * resulting checksum are dependent upon the checksum type: + * + * XXHASH64: le64 + * XXHASH32: le32 + * CRC32C: le32 + * + * The checksum result is encoded the following manner: + * + * le32 num_checksum_chunks + * { + * leXX checksum for chunk (where XX = appropriate size for the checksum type) + * } * num_checksum_chunks + * + * @param io the context in which to perform the checksum + * @param oid the name of the object to checksum + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param len the number of bytes to checksum + * @param off the offset to start checksumming in the object + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result + * @param checksum_len the number of bytes available for the result + * @return negative error code on failure + */ +CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid, + rados_checksum_type_t type, + const char *init_value, size_t init_value_len, + size_t len, uint64_t off, size_t chunk_size, + char *pchecksum, size_t checksum_len); + +/** + * Delete an object + * + * @note This does not delete any snapshots of the object. 
+ * + * @param io the pool to delete the object from + * @param oid the name of the object to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid); + +/** + * Resize an object + * + * If this enlarges the object, the new area is logically filled with + * zeroes. If this shrinks the object, the excess data is removed. + * + * @param io the context in which to truncate + * @param oid the name of the object + * @param size the new size of the object in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid, + uint64_t size); + +/** + * Compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o name of the object + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o, + const char *cmp_buf, size_t cmp_len, + uint64_t off); + +/** + * @name Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Get the value of an extended attribute on an object. 
+ * + * @param io the context in which the attribute is read + * @param o name of the object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o, + const char *name, char *buf, size_t len); + +/** + * Set an extended attribute on an object. + * + * @param io the context in which xattr is set + * @param o name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o, + const char *name, const char *buf, + size_t len); + +/** + * Delete an extended attribute from an object. + * + * @param io the context in which to delete the xattr + * @param o the name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o, + const char *name); + +/** + * Start iterating over xattrs on an object. + * + * @post iter is a valid iterator + * + * @param io the context in which to list xattrs + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Get the next omap key/value pair on the object + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key is + * null-terminated, and val has length len. If the end of the list has + * been reached, key and val are NULL, and len is 0. key and val will + * not be accessible after rados_omap_get_end() is called on iter, so + * if they are needed after that they should be copied. + * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter, + char **key, + char **val, + size_t *len); + +/** + * Get the next omap key/value pair on the object. Note that it's + * perfectly safe to mix calls to rados_omap_get_next and + * rados_omap_get_next2. + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key has length + * keylen and val has length vallen. If the end of the list has + * been reached, key and val are NULL, and keylen and vallen is 0. + * key and val will not be accessible after rados_omap_get_end() + * is called on iter, so if they are needed after that they + * should be copied. 
+ * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param key_len where to store the number of bytes in key + * @param val_len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter, + char **key, + char **val, + size_t *key_len, + size_t *val_len); + +/** + * Return number of elements in the iterator + * + * @param iter the iterator of which to return the size + */ +CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter); + +/** + * Close the omap iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter); + +/** + * Get object size and most recent update time from the OSD. + * + * @param io ioctx + * @param o object name + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize, + time_t *pmtime); + +CEPH_RADOS_API int rados_stat2(rados_ioctx_t io, const char *o, uint64_t *psize, + struct timespec *pmtime); + +/** + * Execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. 
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns the length of the output, or
+ * -ERANGE if out_buf does not have enough space to store it (For methods that return data). For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+                              const char *cls, const char *method,
+                              const char *in_buf, size_t in_len, char *buf,
+                              size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is + * in memory on all replicas + * @param cb_safe the function to be called when the operation is on + * stable storage on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg, + rados_callback_t cb_complete, + rados_callback_t cb_safe, + rados_completion_t *pc); + +/** + * Constructs a completion to use with asynchronous operations + * + * The complete callback corresponds to operation being acked. + * + * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is committed + * on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion2(void *cb_arg, + rados_callback_t cb_complete, + rados_completion_t *pc); + +/** + * Block until an operation completes + * + * This means it is in memory on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c); + +/** + * Block until an operation is safe + * + * This means it is on stable storage on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c) + __attribute__((deprecated)); + +/** + * Has an asynchronous operation completed? 
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c)
+  __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+                                   rados_completion_t completion,
+                                   const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the append is safe and complete + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write an entire object + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. + * Queues the write_full and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the write_full is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write the same buffer multiple times + * + * Queues the writesame and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the writesame is safe and complete + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid, + rados_completion_t completion); + +/** + * Asynchronously read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @note only the 'complete' callback of the completion will be called. 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param completion what to do when the read is complete + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + char *buf, size_t len, uint64_t off); + +/** + * Block until all pending writes in an io context are safe + * + * This is not equivalent to calling rados_aio_wait_for_safe() on all + * write completions, since this waits for the associated callbacks to + * complete as well. + * + * @note BUG: always returns 0, should be void or accept a timeout + * + * @param io the context to flush + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io); + + +/** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * rados_aio_flush(). 
+ * + * @param io the context to flush + * @param completion what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io, + rados_completion_t completion); + + +/** + * Asynchronously get object stats (size/mtime) + * + * @param io ioctx + * @param o object name + * @param completion what to do when the stat is complete + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, time_t *pmtime); + +CEPH_RADOS_API int rados_aio_stat2(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, struct timespec *pmtime); + +/** + * Asynchronously compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o the name of the object to compare with + * @param completion what to do when the comparison is complete + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cmp_buf, + size_t cmp_len, + uint64_t off); + +/** + * Cancel async operation + * + * @param io ioctx + * @param completion completion handle + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io, + rados_completion_t completion); + +/** + * Asynchronously execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations 
on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. + * Classes in ceph.git can be found in src/cls subdirectories + * + * @param io the context in which to call the method + * @param o name of the object + * @param completion what to do when the exec completes + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param buf where to store output + * @param out_len length of buf in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cls, const char *method, + const char *in_buf, size_t in_len, + char *buf, size_t out_len); + +/** @} Asynchronous I/O */ + +/** + * @name Asynchronous Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Asynchronously get the value of an extended attribute on an object. + * + * @param io the context in which the attribute is read + * @param o name of the object + * @param completion what to do when the getxattr completes + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name, char *buf, size_t len); + +/** + * Asynchronously set an extended attribute on an object. 
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+                                      rados_completion_t completion,
+                                      const char *name, const char *buf,
+                                      size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+                                     rados_completion_t completion,
+                                     const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+                                       rados_completion_t completion,
+                                       rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ * + * See rados_watch() and rados_notify() for more details. + * + * @{ + */ + +/** + * @typedef rados_watchcb_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param opcode undefined + * @param ver version of the watched object + * @param arg application-specific data + * + * @note BUG: opcode is an internal detail that shouldn't be exposed + * @note BUG: ver is unused + */ +typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg); + +/** + * @typedef rados_watchcb2_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param arg opaque user-defined value provided to rados_watch2() + * @param notify_id an id for this notify event + * @param handle the watcher handle we are notifying + * @param notifier_id the unique client id for the notifier + * @param data payload from the notifier + * @param data_len length of payload buffer + */ +typedef void (*rados_watchcb2_t)(void *arg, + uint64_t notify_id, + uint64_t handle, + uint64_t notifier_id, + void *data, + size_t data_len); + +/** + * @typedef rados_watcherrcb_t + * + * Callback activated when we encounter an error with the watch session. + * This can happen when the location of the objects moves within the + * cluster and we fail to register our watch with the new object location, + * or when our connection with the object OSD is otherwise interrupted and + * we may have missed notify events. + * + * @param pre opaque user-defined value provided to rados_watch2() + * @param cookie the internal id assigned to the watch session + * @param err error code + */ + typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. 
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @note BUG: librados should provide a way for watchers to notice connection resets + * @note BUG: the ver parameter does not work, and -ERANGE will never be returned + * (See URL tracker.ceph.com/issues/2592) + * + * @param io the pool the object is in + * @param o the object to watch + * @param ver expected version of the object + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param arg application defined data to pass when watchcb is called + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the version of the object is greater than ver + */ +CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver, + uint64_t *cookie, + rados_watchcb_t watchcb, void *arg) + __attribute__((deprecated)); + + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to the + * primary OSD for a watched object, the watch will be removed after + * a timeout configured with osd_client_watch_timeout. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param timeout how many seconds the connection will keep after disconnection + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + uint32_t timeout, + void *arg); + +/** + * Asynchronous register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. 
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param completion what to do when operation has been attempted + * @param handle where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o, + rados_completion_t completion, uint64_t *handle, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Asynchronous register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after the number of seconds that configured in timeout parameter. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the connection will keep after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+                                    rados_completion_t completion, uint64_t *handle,
+                                    rados_watchcb2_t watchcb,
+                                    rados_watcherrcb_t watcherrcb,
+                                    uint32_t timeout,
+                                    void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2(). If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+  __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronous unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+                                     rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+                                const char *buf, int buf_len)
+  __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional. If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any). Clients that timed out are not included. Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them. The format:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param o the name of the object
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+                                    rados_completion_t completion,
+                                    const char *buf, int buf_len,
+                                    uint64_t timeout_ms, char **reply_buffer,
+                                    size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+                                 const char *buf, int buf_len,
+                                 uint64_t timeout_ms,
+                                 char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Decode a notify response
+ *
+ * Decode a notify response (from rados_aio_notify() call) into acks and
+ * timeout arrays.
+ *
+ * @param reply_buffer buffer from rados_aio_notify() call
+ * @param reply_buffer_len reply_buffer length
+ * @param acks pointer to struct notify_ack_t pointer
+ * @param nr_acks pointer to ack count
+ * @param timeouts pointer to notify_timeout_t pointer
+ * @param nr_timeouts pointer to timeout count
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_decode_notify_response(char *reply_buffer, size_t reply_buffer_len,
+                                                struct notify_ack_t **acks, size_t *nr_acks,
+                                                struct notify_timeout_t **timeouts, size_t *nr_timeouts);
+
+/**
+ * Free notify allocated buffer
+ *
+ * Release memory allocated by rados_decode_notify_response() call
+ *
+ * @param acks notify_ack_t struct (from rados_decode_notify_response())
+ * @param nr_acks ack count
+ * @param timeouts notify_timeout_t struct (from rados_decode_notify_response())
+ */
+CEPH_RADOS_API void rados_free_notify_response(struct notify_ack_t *acks, size_t nr_acks,
+                                               struct notify_timeout_t *timeouts);
+
+/**
+ * Acknowledge
receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+                                    uint64_t notify_id, uint64_t cookie,
+                                    const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty. It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call is nonblocking, and the completion will be called
+ * once all pending watch/notify callbacks have been executed and
+ * the queue is empty. It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o); + +/** + * Unpin an object in the cache tier + * + * After an object is unpinned in the cache tier, it can be flushed out + * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o); + +/** + * @name Hints + * + * @{ + */ + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. + * + * @param io the pool the object is in + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. 
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+                                         uint64_t expected_object_size,
+                                         uint64_t expected_write_size,
+                                         uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @note the ownership of a write operation is passed to the function
+ * performing the operation, so the same instance of @c rados_write_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op.
+ * @param write_op operation to add this action to + * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op, + int flags); + +/** + * Ensure that the object exists before writing + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op); + +/** + * Ensure that the object exists and that its internal version + * number is equal to "ver" before writing. "ver" should be a + * version number previously obtained with rados_get_last_version(). + * - If the object's version is greater than the asserted version + * then rados_write_op_operate will return -ERANGE instead of + * executing the op. + * - If the object's version is less than the asserted version + * then rados_write_op_operate will return -EOVERFLOW instead + * of executing the op. + * @param write_op operation to add this action to + * @param ver object version number + */ +CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver); + +/** + * Ensure that given object range (extent) satisfies comparison. + * + * @param write_op operation to add this action to + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @param prval returned result of comparison, 0 on success, negative error code + * on failure, (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op, + const char *cmp_buf, + size_t cmp_len, + uint64_t off, + int *prval); + +/** + * Ensure that given xattr satisfies comparison. 
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+                                            const char *name,
+                                            uint8_t comparison_operator,
+                                            const char *value,
+                                            size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value.
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+                                            const char *key,
+                                            uint8_t comparison_operator,
+                                            const char *val,
+                                            size_t val_len,
+                                            int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value.
+ * + * @param write_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param key_len length of key in bytes + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t key_len, + size_t val_len, + int *prval); + +/** + * Set an xattr + * @param write_op operation to add this action to + * @param name name of the xattr + * @param value buffer to set xattr to + * @param value_len length of buffer to set xattr to + */ +CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op, + const char *name, + const char *value, + size_t value_len); + +/** + * Remove an xattr + * @param write_op operation to add this action to + * @param name name of the xattr to remove + */ +CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op, + const char *name); + +/** + * Create the object + * @param write_op operation to add this action to + * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE or + LIBRADOS_CREATE_IDEMPOTENT + * will error if the object already exists. + * @param category category string (DEPRECATED, HAS NO EFFECT) + */ +CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op, + int exclusive, + const char* category); + +/** + * Write to offset + * @param write_op operation to add this action to + * @param offset offset to write to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op, + const char *buffer, + size_t len, + uint64_t offset); + +/** + * Write whole object, atomically replacing it. 
+ * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op, + const char *buffer, + size_t len); + +/** + * Write the same buffer multiple times + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param data_len length of buffer + * @param write_len total number of bytes to write, as a multiple of @c data_len + * @param offset offset to write to + */ +CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op, + const char *buffer, + size_t data_len, + size_t write_len, + uint64_t offset); + +/** + * Append to end of object. + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op, + const char *buffer, + size_t len); +/** + * Remove object + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op); + +/** + * Truncate an object + * @param write_op operation to add this action to + * @param offset Offset to truncate to + */ +CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op, + uint64_t offset); + +/** + * Zero part of an object + * @param write_op operation to add this action to + * @param offset Offset to zero + * @param len length to zero + */ +CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op, + uint64_t offset, + uint64_t len); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. 
+ * + * @param write_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + int *prval); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *lens, + size_t num); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param key_lens array of lengths corresponding to each key + * @param val_lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *key_lens, + const size_t *val_lens, + size_t num); + +/** + * Remove key/value pairs from an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to remove + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op, + char const* const* keys, + size_t keys_len); + +/** + * Remove key/value pairs from an object + * + * @param 
write_op operation to add this action to + * @param keys array of char arrays representing keys to remove + * @param key_lens array of size_t values representing length of each key + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op, + char const* const* keys, + const size_t* key_lens, + size_t keys_len); + + +/** + * Remove key/value pairs from an object whose keys are in the range + * [key_begin, key_end) + * + * @param write_op operation to add this action to + * @param key_begin the lower bound of the key range to remove + * @param key_begin_len length of key_begin + * @param key_end the upper bound of the key range to remove + * @param key_end_len length of key_end + */ +CEPH_RADOS_API void rados_write_op_omap_rm_range2(rados_write_op_t write_op, + const char *key_begin, + size_t key_begin_len, + const char *key_end, + size_t key_end_len); + +/** + * Remove all key/value pairs from an object + * + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags hints about future IO patterns + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + 
+/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + time_t *mtime, + int flags); +/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ + +CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + struct timespec *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + time_t *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + 
rados_completion_t completion,
+                                               const char *oid,
+                                               struct timespec *mtime,
+                                               int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @note the ownership of a read operation is passed to the function
+ * performing the operation, so the same instance of @c rados_read_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ *   then rados_read_op_operate will return -ERANGE instead of
+ *   executing the op.
+ * - If the object's version is less than the asserted version
+ *   then rados_read_op_operate will return -EOVERFLOW instead
+ *   of executing the op.
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ *  on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+                                         const char *cmp_buf,
+                                         size_t cmp_len,
+                                         uint64_t off,
+                                         int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+                                           const char *name,
+                                           uint8_t comparison_operator,
+                                           const char *value,
+                                           size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+                                            rados_xattrs_iter_t *iter,
+                                            int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value.
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+                                           const char *key,
+                                           uint8_t comparison_operator,
+                                           const char *val,
+                                           size_t val_len,
+                                           int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value.
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+                                            const char *key,
+                                            uint8_t comparison_operator,
+                                            const char *val,
+                                            size_t key_len,
+                                            size_t val_len,
+                                            int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+                                       uint64_t *psize,
+                                       time_t *pmtime,
+                                       int *prval);
+
+CEPH_RADOS_API void rados_read_op_stat2(rados_read_op_t read_op,
+                                        uint64_t *psize,
+                                        struct timespec *pmtime,
+                                        int *prval);
+/**
+ * Read bytes from offset into buffer.
+ *
+ * prlen will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the + * object. + * + * @param read_op operation to add this action to + * @param offset offset to read from + * @param len length of buffer + * @param buffer where to put the data + * @param bytes_read where to store the number of bytes read by this action + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op, + uint64_t offset, + size_t len, + char *buffer, + size_t *bytes_read, + int *prval); + +/** + * Compute checksum from object data + * + * @param read_op operation to add this action to + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param offset the offset to start checksumming in the object + * @param len the number of bytes to checksum + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result for this action + * @param checksum_len the number of bytes available for the result + * @param prval where to store the return value for this action + */ +CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op, + rados_checksum_type_t type, + const char *init_value, + size_t init_value_len, + uint64_t offset, size_t len, + size_t chunk_size, char *pchecksum, + size_t checksum_len, int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * The output buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. 
+ * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf where to put librados-allocated output buffer + * @param out_len length of out_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char **out_buf, + size_t *out_len, + int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * If the output buffer is too small, prval will + * be set to -ERANGE and used_len will be 0. + * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf user-provided buffer to read into + * @param out_len length of out_buf in bytes + * @param used_len where to store the number of bytes read into out_buf + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char *out_buf, + size_t out_len, + size_t *used_len, + int *prval); + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param keys array of pointers to null-terminated keys to get + * @param keys_len the number of strings in keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op, + char const* const* keys, + size_t keys_len, + rados_omap_iter_t *iter, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param keys array of pointers to keys to get + * @param num_keys the number of strings in keys + * @param key_lens array of size_t's describing each key len (in bytes) + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op, + char const* const* keys, + size_t num_keys, + const size_t* key_lens, + rados_omap_iter_t *iter, + int *prval); + +/** + * Perform a read operation synchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + const char *oid, + int flags); + +/** + * Perform a read operation asynchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + int flags); + +/** @} Object Operations */ + +/** + * Take an exclusive lock on an object. + * + * @param io the context to operate in + * @param oid the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. 
+ * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid, + const char * name, const char * cookie, + const char * desc, + struct timeval * duration, + uint8_t flags); + +/** + * Take a shared lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag The tag of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. + * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o, + const char * name, const char * cookie, + const char * tag, const char * desc, + struct timeval * duration, uint8_t flags); + +/** + * Release a shared or exclusive lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie); + +/** + * Asynchronous release a shared or exclusive lock on an object. 
+ * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @param completion what to do when operation has been attempted + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie, + rados_completion_t completion); + +/** + * List clients that have locked the named object lock and information about + * the lock. + * + * The number of bytes required in each buffer is put in the + * corresponding size out parameter. If any of the provided buffers + * are too short, -ERANGE is returned after these sizes are filled in. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param exclusive where to store whether the lock is exclusive (1) or shared (0) + * @param tag where to store the tag associated with the object lock + * @param tag_len number of bytes in tag buffer + * @param clients buffer in which locker clients are stored, separated by '\0' + * @param clients_len number of bytes in the clients buffer + * @param cookies buffer in which locker cookies are stored, separated by '\0' + * @param cookies_len number of bytes in the cookies buffer + * @param addrs buffer in which locker addresses are stored, separated by '\0' + * @param addrs_len number of bytes in the clients buffer + * @returns number of lockers on success, negative error code on failure + * @returns -ERANGE if any of the buffers are too short + */ +CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o, + const char *name, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len); + +/** + * Releases a shared or exclusive lock on an object, which was taken by the + * specified 
client. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param client the client currently holding the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + * @returns -EINVAL if the client cannot be parsed + */ +CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o, + const char *name, const char *client, + const char *cookie); + +/** + * Blocklists the specified client from the OSDs + * + * @param cluster cluster handle + * @param client_address client address + * @param expire_seconds number of seconds to blocklist (0 for default) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_blocklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds); +CEPH_RADOS_API int rados_blacklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds) + __attribute__((deprecated)); + +/** + * Gets addresses of the RADOS session, suitable for blocklisting. + * + * @param cluster cluster handle + * @param addrs the output string. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs); + +CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io) + __attribute__((deprecated)); + +CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io) + __attribute__((deprecated)); + +CEPH_RADOS_API void rados_set_pool_full_try(rados_ioctx_t io); + +CEPH_RADOS_API void rados_unset_pool_full_try(rados_ioctx_t io); + +/** + * Enable an application on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param force 0 if only single application per pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io, + const char *app_name, int force); + +/** + * List all enabled applications + * + * If the provided buffer is too short, the required length is filled in and + * -ERANGE is returned. Otherwise, the buffers are filled with the application + * names, with a '\0' after each. 
+ * + * @param io pool ioctx + * @param values buffer in which to store application names + * @param values_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values, + size_t *values_len); + +/** + * Get application metadata value from pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @param value result buffer + * @param value_len maximum len of value + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io, + const char *app_name, + const char *key, char *value, + size_t *value_len); + +/** + * Set application metadata on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @param value metadata key + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io, + const char *app_name, + const char *key, + const char *value); + +/** + * Remove application metadata from a pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io, + const char *app_name, + const char *key); + +/** + * List all metadata key/value pairs associated with an application. + * + * This iterates over all metadata, key_len and val_len are filled in + * with the number of bytes put into the keys and values buffers. + * + * If the provided buffers are too short, the required lengths are filled + * in and -ERANGE is returned. Otherwise, the buffers are filled with + * the keys and values of the metadata, with a '\0' after each. 
+ * + * @param io pool ioctx + * @param app_name application name + * @param keys buffer in which to store key names + * @param key_len number of bytes in keys buffer + * @param values buffer in which to store values + * @param vals_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io, + const char *app_name, + char *keys, size_t *key_len, + char *values, + size_t *vals_len); + +/** + * @name Mon/OSD/PG Commands + * + * These interfaces send commands relating to the monitor, OSD, or PGs. + * + * @{ + */ + +/** + * Send monitor command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. 
+ * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr tell command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name mgr name to target + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) 
+ * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command_target( + rados_t cluster, + const char *name, + const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send monitor command to a specific monitor. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name target monitor's name + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/** + * free a rados-allocated buffer + * + * Release memory allocated by librados calls like rados_mon_command(). 
+ * + * @param buf buffer pointer + */ +CEPH_RADOS_API void rados_buffer_free(char *buf); + +CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/* + * This is not a doxygen comment leadin, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback get each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. + * + * @param cluster cluster handle + * @param level minimum log level (debug, info, warn|warning, err|error) + * @param cb callback to run for each log message. It MUST NOT block + * nor call back into librados. + * @param arg void argument to pass to cb + * + * @returns 0 on success, negative code on error + */ +typedef void (*rados_log_callback_t)(void *arg, + const char *line, + const char *who, + uint64_t sec, uint64_t nsec, + uint64_t seq, const char *level, + const char *msg); + +/* + * This is not a doxygen comment leadin, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback get each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. 
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ *           nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+                                      const char *line,
+                                      const char *channel,
+                                      const char *who,
+                                      const char *name,
+                                      uint64_t sec, uint64_t nsec,
+                                      uint64_t seq, const char *level,
+                                      const char *msg);
+
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+                                     rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+                                      rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service. We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance. The encoding is a series of NULL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+  rados_t cluster,
+  const char *service,
+  const char *daemon,
+  const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ * + * @param cluster rados cluster handle + * @param status_dict status dict + */ +CEPH_RADOS_API int rados_service_update_status( + rados_t cluster, + const char *status_dict); + +/** @} Mon/OSD/PG commands */ + +/* + * These methods are no longer supported and return -ENOTSUP where possible. + */ +CEPH_RADOS_API int rados_objects_list_open( + rados_ioctx_t io, + rados_list_ctx_t *ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position( + rados_list_ctx_t ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_seek( + rados_list_ctx_t ctx, + uint32_t pos) __attribute__((deprecated)); +CEPH_RADOS_API int rados_objects_list_next( + rados_list_ctx_t ctx, + const char **entry, + const char **key) __attribute__((deprecated)); +CEPH_RADOS_API void rados_objects_list_close( + rados_list_ctx_t ctx) __attribute__((deprecated)); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/shared/rados/librados.hpp b/shared/rados/librados.hpp new file mode 100644 index 0000000000..cb8261af12 --- /dev/null +++ b/shared/rados/librados.hpp @@ -0,0 +1,1568 @@ +#ifndef __LIBRADOS_HPP +#define __LIBRADOS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include "buffer.h" + +#include "librados.h" +#include "librados_fwd.hpp" +#include "rados_types.hpp" + +namespace libradosstriper +{ + class RadosStriper; +} + +namespace neorados { class RADOS; } + +namespace librados { + +using ceph::bufferlist; + +struct AioCompletionImpl; +struct IoCtxImpl; +struct ListObjectImpl; +class NObjectIteratorImpl; +struct ObjListCtx; +class ObjectOperationImpl; +struct PlacementGroupImpl; +struct PoolAsyncCompletionImpl; + +typedef struct rados_cluster_stat_t cluster_stat_t; +typedef struct rados_pool_stat_t pool_stat_t; + +typedef void *list_ctx_t; +typedef uint64_t auid_t; +typedef void *config_t; + +typedef struct { + std::string client; + std::string cookie; + std::string address; +} locker_t; + +typedef 
std::map stats_map; + +typedef void *completion_t; +typedef void (*callback_t)(completion_t cb, void *arg); + +inline namespace v14_2_0 { + + class IoCtx; + class RadosClient; + + class CEPH_RADOS_API ListObject + { + public: + const std::string& get_nspace() const; + const std::string& get_oid() const; + const std::string& get_locator() const; + + ListObject(); + ~ListObject(); + ListObject( const ListObject&); + ListObject& operator=(const ListObject& rhs); + private: + ListObject(ListObjectImpl *impl); + + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& out, const ListObject& lop); + + ListObjectImpl *impl; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop); + + class CEPH_RADOS_API NObjectIterator; + + class CEPH_RADOS_API ObjectCursor + { + public: + ObjectCursor(); + ObjectCursor(const ObjectCursor &rhs); + explicit ObjectCursor(rados_object_list_cursor c); + ~ObjectCursor(); + ObjectCursor& operator=(const ObjectCursor& rhs); + bool operator<(const ObjectCursor &rhs) const; + bool operator==(const ObjectCursor &rhs) const; + void set(rados_object_list_cursor c); + + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + std::string to_str() const; + bool from_str(const std::string& s); + + protected: + rados_object_list_cursor c_cursor; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + class CEPH_RADOS_API NObjectIterator { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = ListObject; + using difference_type = std::ptrdiff_t; + using pointer = ListObject*; + using reference = ListObject&; + static const NObjectIterator __EndObjectIterator; + NObjectIterator(): impl(NULL) {} + ~NObjectIterator(); + NObjectIterator(const NObjectIterator &rhs); + NObjectIterator& operator=(const 
NObjectIterator& rhs); + + bool operator==(const NObjectIterator& rhs) const; + bool operator!=(const NObjectIterator& rhs) const; + const ListObject& operator*() const; + const ListObject* operator->() const; + NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions + NObjectIterator operator++(int); //< Postincrement; errors are thrown as exceptions + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + + /// get current hash position of the iterator, rounded to the current pg + uint32_t get_pg_hash_position() const; + + /// move the iterator to a given hash position. this may (will!) be rounded + /// to the nearest pg. errors are thrown as exceptions + uint32_t seek(uint32_t pos); + + /// move the iterator to a given cursor position. errors are thrown as exceptions + uint32_t seek(const ObjectCursor& cursor); + + /// get current cursor position + ObjectCursor get_cursor(); + + /** + * Configure PGLS filter to be applied OSD-side (requires caller + * to know/understand the format expected by the OSD) + */ + void set_filter(const bufferlist &bl); + + private: + NObjectIterator(ObjListCtx *ctx_); + void get_next(); + NObjectIteratorImpl *impl; + }; + + class CEPH_RADOS_API ObjectItem + { + public: + std::string oid; + std::string nspace; + std::string locator; + }; + + /// DEPRECATED; do not use + class CEPH_RADOS_API WatchCtx { + public: + virtual ~WatchCtx(); + virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0; + }; + + class CEPH_RADOS_API WatchCtx2 { + public: + virtual ~WatchCtx2(); + /** + * Callback activated when we receive a notify event. 
+ * + * @param notify_id unique id for this notify event + * @param cookie the watcher we are notifying + * @param notifier_id the unique client id of the notifier + * @param bl opaque notify payload (from the notifier) + */ + virtual void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + + /** + * Callback activated when we encounter an error with the watch. + * + * Errors we may see: + * -ENOTCONN : our watch was disconnected + * -ETIMEDOUT : our watch is still valid, but we may have missed + * a notify event. + * + * @param cookie the watcher with the problem + * @param err error + */ + virtual void handle_error(uint64_t cookie, int err) = 0; + }; + + struct CEPH_RADOS_API AioCompletion { + AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {} + ~AioCompletion(); + int set_complete_callback(void *cb_arg, callback_t cb); + int set_safe_callback(void *cb_arg, callback_t cb) + __attribute__ ((deprecated)); + int wait_for_complete(); + int wait_for_safe() __attribute__ ((deprecated)); + int wait_for_complete_and_cb(); + int wait_for_safe_and_cb() __attribute__ ((deprecated)); + bool is_complete(); + bool is_safe() __attribute__ ((deprecated)); + bool is_complete_and_cb(); + bool is_safe_and_cb() __attribute__ ((deprecated)); + int get_return_value(); + int get_version() __attribute__ ((deprecated)); + uint64_t get_version64(); + void release(); + AioCompletionImpl *pc; + }; + + struct CEPH_RADOS_API PoolAsyncCompletion { + PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {} + ~PoolAsyncCompletion(); + int set_callback(void *cb_arg, callback_t cb); + int wait(); + bool is_complete(); + int get_return_value(); + void release(); + PoolAsyncCompletionImpl *pc; + }; + + /** + * These are per-op flags which may be different among + * ops added to an ObjectOperation. 
+ */ + enum ObjectOperationFlags { + OP_EXCL = LIBRADOS_OP_FLAG_EXCL, + OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK, + OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM, + OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL, + OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED, + OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED, + OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE, + }; + + class CEPH_RADOS_API ObjectOperationCompletion { + public: + virtual ~ObjectOperationCompletion() {} + virtual void handle_completion(int r, bufferlist& outbl) = 0; + }; + + /** + * These flags apply to the ObjectOperation as a whole. + * + * Prior to octopus BALANCE_READS and LOCALIZE_READS should only + * be used when reading from data you're certain won't change, like + * a snapshot, or where eventual consistency is ok. Since octopus + * (get_min_compatible_osd() >= CEPH_RELEASE_OCTOPUS) both are safe + * for general use. + * + * ORDER_READS_WRITES will order reads the same way writes are + * ordered (e.g., waiting for degraded objects). In particular, it + * will make a write followed by a read sequence be preserved. + * + * IGNORE_CACHE will skip the caching logic on the OSD that normally + * handles promotion of objects between tiers. This allows an operation + * to operate (or read) the cached (or uncached) object, even if it is + * not coherent. + * + * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and + * process the op directly on the destination pool. This is useful + * for CACHE_FLUSH and CACHE_EVICT operations. 
+ */ + enum ObjectOperationGlobalFlags { + OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG, + OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS, + OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS, + OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES, + OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE, + OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS, + OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY, + // send requests to cluster despite the cluster or pool being + // marked full; ops will either succeed (e.g., delete) or return + // EDQUOT or ENOSPC + OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY, + // mainly for delete + OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE, + OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT, + OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP, + // enable/allow return value and per-op return code/buffers + OPERATION_RETURNVEC = LIBRADOS_OPERATION_RETURNVEC, + }; + + /* + * Alloc hint flags for the alloc_hint operation. + */ + enum AllocHintFlags { + ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + ALLOC_HINT_FLAG_RANDOM_READ = 8, + ALLOC_HINT_FLAG_APPEND_ONLY = 16, + ALLOC_HINT_FLAG_IMMUTABLE = 32, + ALLOC_HINT_FLAG_SHORTLIVED = 64, + ALLOC_HINT_FLAG_LONGLIVED = 128, + ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, + }; + + /* + * ObjectOperation : compound object operation + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectOperation + { + public: + ObjectOperation(); + virtual ~ObjectOperation(); + + ObjectOperation(const ObjectOperation&) = delete; + ObjectOperation& operator=(const ObjectOperation&) = delete; + + /** + * Move constructor. + * \warning A moved from ObjectOperation is invalid and may not be used for + * any purpose. 
This is a hard contract violation and will + * kill your program. + */ + ObjectOperation(ObjectOperation&&); + ObjectOperation& operator =(ObjectOperation&&); + + size_t size(); + void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated)); + //flag mean ObjectOperationFlags + void set_op_flags2(int flags); + + void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval); + void cmpxattr(const char *name, uint8_t op, const bufferlist& val); + void cmpxattr(const char *name, uint8_t op, uint64_t v); + void exec(const char *cls, const char *method, bufferlist& inbl); + void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval); + void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion); + /** + * Guard operation with a check that object version == ver + * + * @param ver [in] version to check + */ + void assert_version(uint64_t ver); + + /** + * Guard operation with a check that the object already exists + */ + void assert_exists(); + + /** + * get key/value pairs for specified keys + * + * @param assertions [in] comparison assertions + * @param prval [out] place error code in prval upon completion + * + * assertions has the form of mappings from keys to (comparison rval, assertion) + * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ]. + * + * That is, to assert that the value at key 'foo' is greater than 'bar': + * + * ObjectReadOperation op; + * int r; + * map > assertions; + * bufferlist bar(string('bar')); + * assertions['foo'] = make_pair(bar, CEPH_OSD_CMP_XATTR_OP_GT); + * op.omap_cmp(assertions, &r); + */ + void omap_cmp( + const std::map > &assertions, + int *prval); + + protected: + ObjectOperationImpl* impl; + friend class IoCtx; + friend class Rados; + }; + + /* + * ObjectWriteOperation : compound object write operation + * Batch multiple object operations into a single request, to be applied + * atomically. 
+   */
+  class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+  {
+  protected:
+    time_t *unused;
+  public:
+    ObjectWriteOperation() : unused(NULL) {}
+    ~ObjectWriteOperation() override {}
+
+    ObjectWriteOperation(ObjectWriteOperation&&) = default;
+    ObjectWriteOperation& operator =(ObjectWriteOperation&&) = default;
+
+    void mtime(time_t *pt);
+    void mtime2(struct timespec *pts);
+
+    void create(bool exclusive);
+    void create(bool exclusive,
+                const std::string& category); ///< NOTE: category is unused
+
+    void write(uint64_t off, const bufferlist& bl);
+    void write_full(const bufferlist& bl);
+    void writesame(uint64_t off, uint64_t write_len,
+                   const bufferlist& bl);
+    void append(const bufferlist& bl);
+    void remove();
+    void truncate(uint64_t off);
+    void zero(uint64_t off, uint64_t len);
+    void rmxattr(const char *name);
+    void setxattr(const char *name, const bufferlist& bl);
+    void setxattr(const char *name, const bufferlist&& bl);
+    void tmap_update(const bufferlist& cmdbl);
+    void tmap_put(const bufferlist& bl);
+    void selfmanaged_snap_rollback(uint64_t snapid);
+
+    /**
+     * Rollback an object to the specified snapshot id
+     *
+     * Used with pool snapshots
+     *
+     * @param snapid [in] snapshot id specified
+     */
+    void snap_rollback(uint64_t snapid);
+
+    /**
+     * set keys and values according to map
+     *
+     * @param map [in] keys and values to set
+     */
+    void omap_set(const std::map<std::string, bufferlist> &map);
+
+    /**
+     * set header
+     *
+     * @param bl [in] header to set
+     */
+    void omap_set_header(const bufferlist &bl);
+
+    /**
+     * Clears omap contents
+     */
+    void omap_clear();
+
+    /**
+     * Clears keys in to_rm
+     *
+     * @param to_rm [in] keys to remove
+     */
+    void omap_rm_keys(const std::set<std::string> &to_rm);
+
+    /**
+     * Copy an object
+     *
+     * Copies an object from another location. The operation is atomic in that
+     * the copy either succeeds in its entirety or fails (e.g., because the
+     * source object was modified while the copy was in progress).
+ * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t src_fadvise_flags); + + /** + * Copy an object + * + * Copies an object from another location. The operation is atomic in that + * the copy either succeeds in its entirety or fails (e.g., because the + * source object was modified while the copy was in progress). Instead of + * copying truncate_seq and truncate_size from the source object it receives + * these values as parameters. + * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param truncate_seq truncate sequence for the destination object + * @param truncate_size truncate size for the destination object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from2(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t truncate_seq, + uint64_t truncate_size, uint32_t src_fadvise_flags); + + /** + * undirty an object + * + * Clear an objects dirty flag + */ + void undirty(); + + /** + * Set allocation hint for an object + * + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags flags () + */ + void set_alloc_hint(uint64_t expected_object_size, + uint64_t expected_write_size); + void set_alloc_hint2(uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + /** + * Pin/unpin an object in cache tier + * + * @returns 0 on success, negative error code on failure + */ + void cache_pin(); + void cache_unpin(); + + /** + * Extensible tier + * + * Set redirect target + */ + void set_redirect(const std::string& tgt_obj, const 
IoCtx& tgt_ioctx, + uint64_t tgt_version, int flag = 0); + void tier_promote(); + void unset_manifest(); + + friend class IoCtx; + }; + + /* + * ObjectReadOperation : compound object operation that return value + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation + { + public: + ObjectReadOperation() {} + ~ObjectReadOperation() override {} + + ObjectReadOperation(ObjectReadOperation&&) = default; + ObjectReadOperation& operator =(ObjectReadOperation&&) = default; + + void stat(uint64_t *psize, time_t *pmtime, int *prval); + void stat2(uint64_t *psize, struct timespec *pts, int *prval); + void getxattr(const char *name, bufferlist *pbl, int *prval); + void getxattrs(std::map *pattrs, int *prval); + void read(size_t off, uint64_t len, bufferlist *pbl, int *prval); + void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl, + uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl, + int *prval); + + /** + * see aio_sparse_read() + */ + void sparse_read(uint64_t off, uint64_t len, std::map *m, + bufferlist *data_bl, int *prval, + uint64_t truncate_size = 0, + uint32_t truncate_seq = 0); + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list no keys smaller than start_after + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals( + const std::string &start_after, + uint64_t max_return, + std::map *out_vals, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list no keys 
smaller than start_after + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals2( + const std::string &start_after, + uint64_t max_return, + std::map *out_vals, + bool *pmore, + int *prval); + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param filter_prefix [in] list only keys beginning with filter_prefix + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals( + const std::string &start_after, + const std::string &filter_prefix, + uint64_t max_return, + std::map *out_vals, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_vals2: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param filter_prefix [in] list only keys beginning with filter_prefix + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param pmore [out] pointer to bool indicating whether there are more keys + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals2( + const std::string &start_after, + const std::string &filter_prefix, + uint64_t max_return, + std::map *out_vals, + bool *pmore, + int *prval); + + + /** + * omap_get_keys: keys from the object omap + * + * Get up to max_return keys beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param 
max_return [in] list no more than max_return keys + * @param out_keys [out] place returned values in out_keys on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_keys(const std::string &start_after, + uint64_t max_return, + std::set *out_keys, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_keys2: keys from the object omap + * + * Get up to max_return keys beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param max_return [in] list no more than max_return keys + * @param out_keys [out] place returned values in out_keys on completion + * @param pmore [out] pointer to bool indicating whether there are more keys + * @param prval [out] place error code in prval upon completion + */ + void omap_get_keys2(const std::string &start_after, + uint64_t max_return, + std::set *out_keys, + bool *pmore, + int *prval); + + /** + * omap_get_header: get header from object omap + * + * @param header [out] place header here upon completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_header(bufferlist *header, int *prval); + + /** + * get key/value pairs for specified keys + * + * @param keys [in] keys to get + * @param map [out] place key/value pairs found here on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals_by_keys(const std::set &keys, + std::map *map, + int *prval); + + /** + * list_watchers: Get list watchers of object + * + * @param out_watchers [out] place returned values in out_watchers on completion + * @param prval [out] place error code in prval upon completion + */ + void list_watchers(std::list *out_watchers, int *prval); + + /** + * list snapshot clones associated with a logical object + * + * This will include a record for each version of the object, + * include the "HEAD" (which will have a cloneid of SNAP_HEAD). 
+ * Each clone includes a vector of snap ids for which it is + * defined to exist. + * + * NOTE: this operation must be submitted from an IoCtx with a + * read snapid of SNAP_DIR for reliable results. + * + * @param out_snaps [out] pointer to resulting snap_set_t + * @param prval [out] place error code in prval upon completion + */ + void list_snaps(snap_set_t *out_snaps, int *prval); + + /** + * query dirty state of an object + * + * @param isdirty [out] pointer to resulting bool + * @param prval [out] place error code in prval upon completion + */ + void is_dirty(bool *isdirty, int *prval); + + /** + * flush a cache tier object to backing tier; will block racing + * updates. + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promotion. + */ + void cache_flush(); + + /** + * Flush a cache tier object to backing tier; will EAGAIN if we race + * with an update. Must be used with the SKIPRWLOCKS flag. + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promotion. + */ + void cache_try_flush(); + + /** + * evict a clean cache tier object + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promote on the OSD (that is then evicted). 
+ */ + void cache_evict(); + + /** + * Extensible tier + * + * set_chunk: make a chunk pointing a part of the source object at the target + * object + * + * @param src_offset [in] source offset to indicate the start position of + * a chunk in the source object + * @param src_length [in] source length to set the length of the chunk + * @param tgt_oid [in] target object's id to set a chunk + * @param tgt_offset [in] the start position of the target object + * @param flag [in] flag for the source object + * + */ + void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx, + std::string tgt_oid, uint64_t tgt_offset, int flag = 0); + /** + * flush a manifest tier object to backing tier, performing deduplication; + * will block racing updates. + * + * Invoking tier_flush() implicitly makes a manifest object even if + * the target object is not manifest. + */ + void tier_flush(); + /** + * evict a manifest tier object to backing tier; will block racing + * updates. + */ + void tier_evict(); + }; + + /* IoCtx : This is a context in which we can perform I/O. + * It includes a Pool, + * + * Typical use (error checking omitted): + * + * IoCtx p; + * rados.ioctx_create("my_pool", p); + * p->stat(&stats); + * ... etc ... + * + * NOTE: be sure to call watch_flush() prior to destroying any IoCtx + * that is used for watch events to ensure that racing callbacks + * have completed. 
+ */ + class CEPH_RADOS_API IoCtx + { + public: + IoCtx(); + static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool); + IoCtx(const IoCtx& rhs); + IoCtx& operator=(const IoCtx& rhs); + IoCtx(IoCtx&& rhs) noexcept; + IoCtx& operator=(IoCtx&& rhs) noexcept; + + ~IoCtx(); + + bool is_valid() const; + + // Close our pool handle + void close(); + + // deep copy + void dup(const IoCtx& rhs); + + // set pool auid + int set_auid(uint64_t auid_) + __attribute__ ((deprecated)); + + // set pool auid + int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + + // get pool auid + int get_auid(uint64_t *auid_) + __attribute__ ((deprecated)); + + uint64_t get_instance_id() const; + + std::string get_pool_name(); + + bool pool_requires_alignment(); + int pool_requires_alignment2(bool * req); + uint64_t pool_required_alignment(); + int pool_required_alignment2(uint64_t * alignment); + + // create an object + int create(const std::string& oid, bool exclusive); + int create(const std::string& oid, bool exclusive, + const std::string& category); ///< category is unused + + /** + * write bytes to an object at a specified offset + * + * NOTE: this call steals the contents of @param bl. + */ + int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off); + /** + * append bytes to an object + * + * NOTE: this call steals the contents of @param bl. + */ + int append(const std::string& oid, bufferlist& bl, size_t len); + /** + * replace object contents with provided data + * + * NOTE: this call steals the contents of @param bl. 
+ */ + int write_full(const std::string& oid, bufferlist& bl); + int writesame(const std::string& oid, bufferlist& bl, + size_t write_len, uint64_t off); + int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off); + int checksum(const std::string& o, rados_checksum_type_t type, + const bufferlist &init_value_bl, size_t len, uint64_t off, + size_t chunk_size, bufferlist *pbl); + int remove(const std::string& oid); + int remove(const std::string& oid, int flags); + int trunc(const std::string& oid, uint64_t size); + int mapext(const std::string& o, uint64_t off, size_t len, std::map& m); + int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl); + int sparse_read(const std::string& o, std::map& m, bufferlist& bl, size_t len, uint64_t off); + int getxattr(const std::string& oid, const char *name, bufferlist& bl); + int getxattrs(const std::string& oid, std::map& attrset); + int setxattr(const std::string& oid, const char *name, bufferlist& bl); + int rmxattr(const std::string& oid, const char *name); + int stat(const std::string& oid, uint64_t *psize, time_t *pmtime); + int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts); + int exec(const std::string& oid, const char *cls, const char *method, + bufferlist& inbl, bufferlist& outbl); + /** + * modify object tmap based on encoded update sequence + * + * NOTE: this call steals the contents of @param bl + */ + int tmap_update(const std::string& oid, bufferlist& cmdbl); + + int omap_get_vals(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::map *out_vals); + int omap_get_vals2(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::map *out_vals, + bool *pmore); + int omap_get_vals(const std::string& oid, + const std::string& start_after, + const std::string& filter_prefix, + uint64_t max_return, + std::map *out_vals); + int omap_get_vals2(const std::string& oid, + const std::string& start_after, + const 
std::string& filter_prefix, + uint64_t max_return, + std::map *out_vals, + bool *pmore); + int omap_get_keys(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::set *out_keys); + int omap_get_keys2(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::set *out_keys, + bool *pmore); + int omap_get_header(const std::string& oid, + bufferlist *bl); + int omap_get_vals_by_keys(const std::string& oid, + const std::set& keys, + std::map *vals); + int omap_set(const std::string& oid, + const std::map& map); + int omap_set_header(const std::string& oid, + const bufferlist& bl); + int omap_clear(const std::string& oid); + int omap_rm_keys(const std::string& oid, + const std::set& keys); + + void snap_set_read(snap_t seq); + int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector& snaps); + + // Create a snapshot with a given name + int snap_create(const char *snapname); + + // Look up a snapshot by name. + // Returns 0 on success; error code otherwise + int snap_lookup(const char *snapname, snap_t *snap); + + // Gets a timestamp for a snap + int snap_get_stamp(snap_t snapid, time_t *t); + + // Gets the name of a snap + int snap_get_name(snap_t snapid, std::string *s); + + // Remove a snapshot from this pool + int snap_remove(const char *snapname); + + int snap_list(std::vector *snaps); + + int snap_rollback(const std::string& oid, const char *snapname); + + // Deprecated name kept for backward compatibility - same as snap_rollback() + int rollback(const std::string& oid, const char *snapname) + __attribute__ ((deprecated)); + + int selfmanaged_snap_create(uint64_t *snapid); + void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c); + + int selfmanaged_snap_remove(uint64_t snapid); + void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c); + + int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid); + + // Advisory locking on rados objects. 
+ int lock_exclusive(const std::string &oid, const std::string &name, + const std::string &cookie, + const std::string &description, + struct timeval * duration, uint8_t flags); + + int lock_shared(const std::string &oid, const std::string &name, + const std::string &cookie, const std::string &tag, + const std::string &description, + struct timeval * duration, uint8_t flags); + + int unlock(const std::string &oid, const std::string &name, + const std::string &cookie); + + int break_lock(const std::string &oid, const std::string &name, + const std::string &client, const std::string &cookie); + + int list_lockers(const std::string &oid, const std::string &name, + int *exclusive, + std::string *tag, + std::list *lockers); + + + /// Start enumerating objects for a pool. Errors are thrown as exceptions. + NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist()); + /// Start enumerating objects for a pool starting from a hash position. + /// Errors are thrown as exceptions. + NObjectIterator nobjects_begin(uint32_t start_hash_position, + const bufferlist &filter=bufferlist()); + /// Start enumerating objects for a pool starting from cursor. Errors are + /// thrown as exceptions. 
+ NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor, + const bufferlist &filter=bufferlist()); + /// Iterator indicating the end of a pool + const NObjectIterator& nobjects_end() const; + + /// Get cursor for pool beginning + ObjectCursor object_list_begin(); + + /// Get cursor for pool end + ObjectCursor object_list_end(); + + /// Check whether a cursor is at the end of a pool + bool object_list_is_end(const ObjectCursor &oc); + + /// List some objects between two cursors + int object_list(const ObjectCursor &start, const ObjectCursor &finish, + const size_t result_count, + const bufferlist &filter, + std::vector *result, + ObjectCursor *next); + + /// Generate cursors that include the N out of Mth slice of the pool + void object_list_slice( + const ObjectCursor start, + const ObjectCursor finish, + const size_t n, + const size_t m, + ObjectCursor *split_start, + ObjectCursor *split_finish); + + /** + * List available hit set objects + * + * @param uint32_t [in] hash position to query + * @param c [in] completion + * @param pls [out] list of available intervals + */ + int hit_set_list(uint32_t hash, AioCompletion *c, + std::list< std::pair > *pls); + + /** + * Retrieve hit set for a given hash, and time + * + * @param hash [in] hash position + * @param c [in] completion + * @param stamp [in] time interval that falls within the hit set's interval + * @param pbl [out] buffer to store the result in + */ + int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp, + bufferlist *pbl); + + uint64_t get_last_version(); + + int aio_read(const std::string& oid, AioCompletion *c, + bufferlist *pbl, size_t len, uint64_t off); + /** + * Asynchronously read from an object at a particular snapshot + * + * This is the same as normal aio_read, except that it chooses + * the snapshot to read from from its arguments instead of the + * internal IoCtx state. 
+ * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param pbl where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @param snapid the id of the snapshot to read from + * @returns 0 on success, negative error code on failure + */ + int aio_read(const std::string& oid, AioCompletion *c, + bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid); + int aio_sparse_read(const std::string& oid, AioCompletion *c, + std::map *m, bufferlist *data_bl, + size_t len, uint64_t off); + /** + * Asynchronously read existing extents from an object at a + * particular snapshot + * + * This is the same as normal aio_sparse_read, except that it chooses + * the snapshot to read from from its arguments instead of the + * internal IoCtx state. + * + * m will be filled in with a map of extents in the object, + * mapping offsets to lengths (in bytes) within the range + * requested. The data for all of the extents are stored + * back-to-back in offset order in data_bl. 
+ * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param m where to store the map of extents + * @param data_bl where to store the data + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @param snapid the id of the snapshot to read from + * @returns 0 on success, negative error code on failure + */ + int aio_sparse_read(const std::string& oid, AioCompletion *c, + std::map *m, bufferlist *data_bl, + size_t len, uint64_t off, uint64_t snapid); + /** + * Asynchronously compare an on-disk object range with a buffer + * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param off object byte offset at which to start the comparison + * @param cmp_bl buffer containing bytes to be compared with object contents + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ + int aio_cmpext(const std::string& oid, + librados::AioCompletion *c, + uint64_t off, + bufferlist& cmp_bl); + int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t len, uint64_t off); + int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t len); + int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl); + int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t write_len, uint64_t off); + + /** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param oid the name of the object + * @param c what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than SNAP_HEAD + */ + int aio_remove(const std::string& oid, AioCompletion *c); + int aio_remove(const std::string& oid, AioCompletion *c, int flags); + + /** + * Wait for all currently pending aio writes to be safe. + * + * @returns 0 on success, negative error code on failure + */ + int aio_flush(); + + /** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * aio_flush(). + * + * @param c what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ + int aio_flush_async(AioCompletion *c); + int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl); + int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map& attrset); + int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl); + int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name); + int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime); + int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts); + + /** + * Cancel aio operation + * + * @param c completion handle + * @returns 0 on success, negative error code on failure + */ + int aio_cancel(AioCompletion *c); + + int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method, + bufferlist& inbl, bufferlist *outbl); + + /* + * asynchronous version of unlock + */ + int aio_unlock(const std::string &oid, const std::string &name, + const std::string &cookie, AioCompletion *c); + + // compound object operations + int operate(const std::string& oid, ObjectWriteOperation *op); + int operate(const std::string& oid, ObjectWriteOperation *op, int flags); + int operate(const std::string& 
oid, ObjectReadOperation *op, bufferlist *pbl); + int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl, int flags); + int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op); + int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags); + /** + * Schedule an async write operation with explicit snapshot parameters + * + * This is the same as the first aio_operate(), except that it + * gets the snapshot context from its arguments instead of the + * IoCtx internal state. + * + * @param oid the object to operate on + * @param c what to do when the operation is complete and safe + * @param op which operations to perform + * @param seq latest selfmanaged snapshot sequence number for this object + * @param snaps currently existing selfmanaged snapshot ids for this object + * @returns 0 on success, negative error code on failure + */ + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector& snaps); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector& snaps, + const blkin_trace_info *trace_info); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector& snaps, int flags, + const blkin_trace_info *trace_info); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, bufferlist *pbl); + + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, snap_t snapid, int flags, + bufferlist *pbl) + __attribute__ ((deprecated)); + + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, int flags, + bufferlist *pbl); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, int flags, + bufferlist *pbl, const blkin_trace_info *trace_info); + + // watch/notify + int watch2(const std::string& o, uint64_t *handle, 
+ librados::WatchCtx2 *ctx); + int watch3(const std::string& o, uint64_t *handle, + librados::WatchCtx2 *ctx, uint32_t timeout); + int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle, + librados::WatchCtx2 *ctx); + int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle, + librados::WatchCtx2 *ctx, uint32_t timeout); + int unwatch2(uint64_t handle); + int aio_unwatch(uint64_t handle, AioCompletion *c); + /** + * Send a notify event to watchers + * + * Upon completion the pbl bufferlist reply payload will be + * encoded like so: + * + * le32 num_acks + * { + * le64 gid global id for the client (for client.1234 that's 1234) + * le64 cookie cookie for the client + * le32 buflen length of reply message buffer + * u8 * buflen payload + * } * num_acks + * le32 num_timeouts + * { + * le64 gid global id for the client + * le64 cookie cookie for the client + * } * num_timeouts + * + * + */ + int notify2(const std::string& o, ///< object + bufferlist& bl, ///< optional broadcast payload + uint64_t timeout_ms, ///< timeout (in ms) + bufferlist *pbl); ///< reply buffer + int aio_notify(const std::string& o, ///< object + AioCompletion *c, ///< completion when notify completes + bufferlist& bl, ///< optional broadcast payload + uint64_t timeout_ms, ///< timeout (in ms) + bufferlist *pbl); ///< reply buffer + /* + * Decode a notify response into acks and timeout vectors. + */ + void decode_notify_response(bufferlist &bl, + std::vector *acks, + std::vector *timeouts); + + int list_watchers(const std::string& o, std::list *out_watchers); + int list_snaps(const std::string& o, snap_set_t *out_snaps); + void set_notify_timeout(uint32_t timeout); + + /// acknowledge a notify we received. + void notify_ack(const std::string& o, ///< watched object + uint64_t notify_id, ///< notify id + uint64_t cookie, ///< our watch handle + bufferlist& bl); ///< optional reply payload + + /*** + * check on watch validity + * + * Check if a watch is valid. 
If so, return the number of + * milliseconds since we last confirmed its liveness. If there is + * a known error, return it. + * + * If there is an error, the watch is no longer valid, and should + * be destroyed with unwatch(). The user is still interested in + * the object, a new watch should be created with watch(). + * + * @param cookie watch handle + * @returns ms since last confirmed valid, or error + */ + int watch_check(uint64_t cookie); + + // old, deprecated versions + int watch(const std::string& o, uint64_t ver, uint64_t *cookie, + librados::WatchCtx *ctx) __attribute__ ((deprecated)); + int notify(const std::string& o, uint64_t ver, bufferlist& bl) + __attribute__ ((deprecated)); + int unwatch(const std::string& o, uint64_t cookie) + __attribute__ ((deprecated)); + + /** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it + * was submitted with a OP_FAILOK flag set) and is not guaranteed + * to do anything on the backend. 
+ * + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ + int set_alloc_hint(const std::string& o, + uint64_t expected_object_size, + uint64_t expected_write_size); + int set_alloc_hint2(const std::string& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + // assert version for next sync operations + void set_assert_version(uint64_t ver); + + /** + * Pin/unpin an object in cache tier + * + * @param o the name of the object + * @returns 0 on success, negative error code on failure + */ + int cache_pin(const std::string& o); + int cache_unpin(const std::string& o); + + std::string get_pool_name() const; + + void locator_set_key(const std::string& key); + void set_namespace(const std::string& nspace); + std::string get_namespace() const; + + int64_t get_id(); + + // deprecated versions + uint32_t get_object_hash_position(const std::string& oid) + __attribute__ ((deprecated)); + uint32_t get_object_pg_hash_position(const std::string& oid) + __attribute__ ((deprecated)); + + int get_object_hash_position2(const std::string& oid, uint32_t *hash_position); + int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position); + + config_t cct(); + + void set_osdmap_full_try() + __attribute__ ((deprecated)); + void unset_osdmap_full_try() + __attribute__ ((deprecated)); + + bool get_pool_full_try(); + void set_pool_full_try(); + void unset_pool_full_try(); + + int application_enable(const std::string& app_name, bool force); + int application_enable_async(const std::string& app_name, + bool force, PoolAsyncCompletion *c); + int application_list(std::set *app_names); + int application_metadata_get(const std::string& app_name, + const std::string &key, + std::string *value); + int application_metadata_set(const std::string& 
app_name, + const std::string &key, + const std::string& value); + int application_metadata_remove(const std::string& app_name, + const std::string &key); + int application_metadata_list(const std::string& app_name, + std::map *values); + + private: + /* You can only get IoCtx instances from Rados */ + IoCtx(IoCtxImpl *io_ctx_impl_); + + friend class Rados; // Only Rados can use our private constructor to create IoCtxes. + friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl + friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl + friend class ObjectReadOperation; // set_chunk needs to see our IoCtxImpl + + IoCtxImpl *io_ctx_impl; + }; + + struct CEPH_RADOS_API PlacementGroup { + PlacementGroup(); + PlacementGroup(const PlacementGroup&); + ~PlacementGroup(); + bool parse(const char*); + std::unique_ptr impl; + }; + + CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&); + + class CEPH_RADOS_API Rados + { + public: + static void version(int *major, int *minor, int *extra); + + Rados(); + explicit Rados(IoCtx& ioctx); + ~Rados(); + static void from_rados_t(rados_t cluster, Rados &rados); + + int init(const char * const id); + int init2(const char * const name, const char * const clustername, + uint64_t flags); + int init_with_context(config_t cct_); + config_t cct(); + int connect(); + void shutdown(); + int watch_flush(); + int aio_watch_flush(AioCompletion*); + int conf_read_file(const char * const path) const; + int conf_parse_argv(int argc, const char ** argv) const; + int conf_parse_argv_remainder(int argc, const char ** argv, + const char ** remargv) const; + int conf_parse_env(const char *env) const; + int conf_set(const char *option, const char *value); + int conf_get(const char *option, std::string &val); + + int service_daemon_register( + const std::string& service, ///< service name (e.g., 'rgw') + const std::string& name, ///< daemon name (e.g., 'gwfoo') + const std::map& 
metadata); ///< static metadata about daemon + int service_daemon_update_status( + std::map&& status); + + int pool_create(const char *name); + int pool_create(const char *name, uint64_t auid) + __attribute__ ((deprecated)); + int pool_create(const char *name, uint64_t auid, uint8_t crush_rule) + __attribute__ ((deprecated)); + int pool_create_with_rule(const char *name, uint8_t crush_rule); + int pool_create_async(const char *name, PoolAsyncCompletion *c); + int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c); + int pool_get_base_tier(int64_t pool, int64_t* base_tier); + int pool_delete(const char *name); + int pool_delete_async(const char *name, PoolAsyncCompletion *c); + int64_t pool_lookup(const char *name); + int pool_reverse_lookup(int64_t id, std::string *name); + + uint64_t get_instance_id(); + + int get_min_compatible_osd(int8_t* require_osd_release); + int get_min_compatible_client(int8_t* min_compat_client, + int8_t* require_min_compat_client); + + int mon_command(std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int mgr_command(std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int osd_command(int osdid, std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + + int ioctx_create(const char *name, IoCtx &pioctx); + int ioctx_create2(int64_t pool_id, IoCtx &pioctx); + + // Features useful for test cases + void test_blocklist_self(bool set); + + /* pool info */ + int pool_list(std::list& v); + int pool_list2(std::list >& v); + int get_pool_stats(std::list& v, + 
stats_map& result); + /// deprecated; use simpler form. categories no longer supported. + int get_pool_stats(std::list& v, + std::map& stats); + /// deprecated; categories no longer supported + int get_pool_stats(std::list& v, + std::string& category, + std::map& stats); + /// check if pool has selfmanaged snaps + bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname); + + int cluster_stat(cluster_stat_t& result); + int cluster_fsid(std::string *fsid); + + /** + * List inconsistent placement groups in the given pool + * + * @param pool_id the pool id + * @param pgs [out] the inconsistent PGs + */ + int get_inconsistent_pgs(int64_t pool_id, + std::vector* pgs); + /** + * List the inconsistent objects found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param objects [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN i + * the current interval begin epoch is different. 
+ */ + int get_inconsistent_objects(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector* objects, + uint32_t* interval); + /** + * List the inconsistent snapsets found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param snapsets [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN i + * the current interval begin epoch is different. + */ + int get_inconsistent_snapsets(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector* snapset, + uint32_t* interval); + + /// get/wait for the most recent osdmap + int wait_for_latest_osdmap(); + + int blocklist_add(const std::string& client_address, + uint32_t expire_seconds); + + std::string get_addrs() const; + + /* + * pool aio + * + * It is up to the caller to release the completion handler, even if the pool_create_async() + * and/or pool_delete_async() fails and does not send the async request + */ + static PoolAsyncCompletion *pool_async_create_completion(); + + // -- aio -- + static AioCompletion *aio_create_completion(); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete, + callback_t cb_safe) + __attribute__ ((deprecated)); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete); + + friend std::ostream& operator<<(std::ostream &oss, const Rados& r); + private: + friend class neorados::RADOS; + + // We don't allow assignment or copying + Rados(const Rados& rhs); + const Rados& operator=(const Rados& rhs); + RadosClient *client; + }; + +} // namespace v14_2_0 +} // namespace 
librados + +#endif + diff --git a/shared/rados/librados_fwd.hpp b/shared/rados/librados_fwd.hpp new file mode 100644 index 0000000000..396f3a8387 --- /dev/null +++ b/shared/rados/librados_fwd.hpp @@ -0,0 +1,34 @@ +#ifndef __LIBRADOS_FWD_HPP +#define __LIBRADOS_FWD_HPP + +struct blkin_trace_info; + +namespace libradosstriper { + +class RadosStriper; + +} // namespace libradosstriper + +namespace librados { +inline namespace v14_2_0 { + +class AioCompletion; +class IoCtx; +class ListObject; +class NObjectIterator; +class ObjectCursor; +class ObjectItem; +class ObjectOperation; +class ObjectOperationCompletion; +class ObjectReadOperation; +class ObjectWriteOperation; +class PlacementGroup; +class PoolAsyncCompletion; +class Rados; +class WatchCtx; +class WatchCtx2; + +} // inline namespace v14_2_0 +} // namespace librados + +#endif // __LIBRADOS_FWD_HPP diff --git a/shared/rados/page.h b/shared/rados/page.h new file mode 100644 index 0000000000..db6e20585c --- /dev/null +++ b/shared/rados/page.h @@ -0,0 +1,18 @@ +#ifndef CEPH_PAGE_H +#define CEPH_PAGE_H + +namespace ceph { + // these are in common/page.cc + extern unsigned _page_size; + extern unsigned long _page_mask; + extern unsigned _page_shift; +} + +#endif + + +#define CEPH_PAGE_SIZE ceph::_page_size +#define CEPH_PAGE_MASK ceph::_page_mask +#define CEPH_PAGE_SHIFT ceph::_page_shift + + diff --git a/shared/rados/rados_types.h b/shared/rados/rados_types.h new file mode 100644 index 0000000000..d308341ec5 --- /dev/null +++ b/shared/rados/rados_types.h @@ -0,0 +1,41 @@ +#ifndef CEPH_RADOS_TYPES_H +#define CEPH_RADOS_TYPES_H + +#include + +/** + * @struct obj_watch_t + * One item from list_watchers + */ +struct obj_watch_t { + /// Address of the Watcher + char addr[256]; + /// Watcher ID + int64_t watcher_id; + /// Cookie + uint64_t cookie; + /// Timeout in Seconds + uint32_t timeout_seconds; +}; + +struct notify_ack_t { + uint64_t notifier_id; + uint64_t cookie; + char *payload; + uint64_t payload_len; +}; + 
+struct notify_timeout_t { + uint64_t notifier_id; + uint64_t cookie; +}; + +/** + * + * Pass as nspace argument to rados_ioctx_set_namespace() + * before calling rados_nobjects_list_open() to return + * all objects in all namespaces. + */ +#define LIBRADOS_ALL_NSPACES "\001" + +#endif diff --git a/shared/rados/rados_types.hpp b/shared/rados/rados_types.hpp new file mode 100644 index 0000000000..84023579b3 --- /dev/null +++ b/shared/rados/rados_types.hpp @@ -0,0 +1,341 @@ +#ifndef CEPH_RADOS_TYPES_HPP +#define CEPH_RADOS_TYPES_HPP + +#include +#include +#include +#include +#include + +#include "buffer.h" +#include "rados_types.h" + +namespace librados { + +typedef uint64_t snap_t; + +enum { + SNAP_HEAD = (uint64_t)(-2), + SNAP_DIR = (uint64_t)(-1) +}; + +struct clone_info_t { + snap_t cloneid; + std::vector snaps; // ascending + std::vector< std::pair > overlap; // with next newest + uint64_t size; + clone_info_t() : cloneid(0), size(0) {} +}; + +struct snap_set_t { + std::vector clones; // ascending + snap_t seq; // newest snapid seen by the object + snap_set_t() : seq(0) {} +}; + +struct object_id_t { + std::string name; + std::string nspace; + std::string locator; + snap_t snap = 0; + object_id_t() = default; + object_id_t(const std::string& name, + const std::string& nspace, + const std::string& locator, + snap_t snap) + : name(name), + nspace(nspace), + locator(locator), + snap(snap) + {} +}; + +struct err_t { + enum : uint64_t { + SHARD_MISSING = 1 << 1, + SHARD_STAT_ERR = 1 << 2, + SHARD_READ_ERR = 1 << 3, + DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old + DATA_DIGEST_MISMATCH_INFO = 1 << 9, + OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old + OMAP_DIGEST_MISMATCH_INFO = 1 << 10, + SIZE_MISMATCH_OI = 1 << 11, // Old + SIZE_MISMATCH_INFO = 1 << 11, + SHARD_EC_HASH_MISMATCH = 1 << 12, + SHARD_EC_SIZE_MISMATCH = 1 << 13, + OI_ATTR_MISSING = 1 << 14, // Old + INFO_MISSING = 1 << 14, + OI_ATTR_CORRUPTED = 1 << 15, // Old + INFO_CORRUPTED = 1 << 15, + SS_ATTR_MISSING = 1 << 
16, // Old + SNAPSET_MISSING = 1 << 16, + SS_ATTR_CORRUPTED = 1 << 17, // Old + SNAPSET_CORRUPTED = 1 << 17, + OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old + OBJ_SIZE_INFO_MISMATCH = 1 << 18, + HINFO_MISSING = 1 << 19, + HINFO_CORRUPTED = 1 << 20 + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED; + static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH; + bool has_shard_missing() const { + return errors & SHARD_MISSING; + } + bool has_stat_error() const { + return errors & SHARD_STAT_ERR; + } + bool has_read_error() const { + return errors & SHARD_READ_ERR; + } + bool has_data_digest_mismatch_oi() const { // Compatibility + return errors & DATA_DIGEST_MISMATCH_OI; + } + bool has_data_digest_mismatch_info() const { + return errors & DATA_DIGEST_MISMATCH_INFO; + } + bool has_omap_digest_mismatch_oi() const { // Compatibility + return errors & OMAP_DIGEST_MISMATCH_OI; + } + bool has_omap_digest_mismatch_info() const { + return errors & OMAP_DIGEST_MISMATCH_INFO; + } + bool has_size_mismatch_oi() const { // Compatibility + return errors & SIZE_MISMATCH_OI; + } + bool has_size_mismatch_info() const { + return errors & SIZE_MISMATCH_INFO; + } + bool has_ec_hash_error() const { + return errors & SHARD_EC_HASH_MISMATCH; + } + bool has_ec_size_error() const { + return errors & SHARD_EC_SIZE_MISMATCH; + } + bool has_oi_attr_missing() const { // Compatibility + return errors & OI_ATTR_MISSING; + } + bool has_info_missing() const { + return errors & INFO_MISSING; + } + bool has_oi_attr_corrupted() const { // Compatibility + return errors & OI_ATTR_CORRUPTED; + } + bool has_info_corrupted() const { + return errors & INFO_CORRUPTED; + } 
+ bool has_ss_attr_missing() const { // Compatibility + return errors & SS_ATTR_MISSING; + } + bool has_snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool has_ss_attr_corrupted() const { // Compatibility + return errors & SS_ATTR_CORRUPTED; + } + bool has_snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_obj_size_oi_mismatch() const { // Compatibility + return errors & OBJ_SIZE_OI_MISMATCH; + } + bool has_obj_size_info_mismatch() const { + return errors & OBJ_SIZE_INFO_MISMATCH; + } + bool has_hinfo_missing() const { + return errors & HINFO_MISSING; + } + bool has_hinfo_corrupted() const { + return errors & HINFO_CORRUPTED; + } +}; + +struct shard_info_t : err_t { + std::map attrs; + uint64_t size = -1; + bool omap_digest_present = false; + uint32_t omap_digest = 0; + bool data_digest_present = false; + uint32_t data_digest = 0; + bool selected_oi = false; + bool primary = false; +}; + +struct osd_shard_t { + int32_t osd; + int8_t shard; +}; + +inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) { + if (lhs.osd < rhs.osd) + return true; + else if (lhs.osd > rhs.osd) + return false; + else + return lhs.shard < rhs.shard; +} + +struct obj_err_t { + enum : uint64_t { + OBJECT_INFO_INCONSISTENCY = 1 << 1, + // XXX: Can an older rados binary work if these bits stay the same? 
+ DATA_DIGEST_MISMATCH = 1 << 4, + OMAP_DIGEST_MISMATCH = 1 << 5, + SIZE_MISMATCH = 1 << 6, + ATTR_VALUE_MISMATCH = 1 << 7, + ATTR_NAME_MISMATCH = 1 << 8, + SNAPSET_INCONSISTENCY = 1 << 9, + HINFO_INCONSISTENCY = 1 << 10, + SIZE_TOO_LARGE = 1 << 11, + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH + |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE; + static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH; + bool has_object_info_inconsistency() const { + return errors & OBJECT_INFO_INCONSISTENCY; + } + bool has_data_digest_mismatch() const { + return errors & DATA_DIGEST_MISMATCH; + } + bool has_omap_digest_mismatch() const { + return errors & OMAP_DIGEST_MISMATCH; + } + bool has_size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool has_attr_value_mismatch() const { + return errors & ATTR_VALUE_MISMATCH; + } + bool has_attr_name_mismatch() const { + return errors & ATTR_NAME_MISMATCH; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_snapset_inconsistency() const { + return errors & SNAPSET_INCONSISTENCY; + } + bool has_hinfo_inconsistency() const { + return errors & HINFO_INCONSISTENCY; + } + bool has_size_too_large() const { + return errors & SIZE_TOO_LARGE; + } +}; + +struct inconsistent_obj_t : obj_err_t { + inconsistent_obj_t() = default; + inconsistent_obj_t(const object_id_t& object) + : object{object}, version(0) + {} + object_id_t object; + uint64_t version; // XXX: Redundant with object info attr + std::map shards; + err_t union_shards; +}; + +struct inconsistent_snapset_t { + inconsistent_snapset_t() = default; + inconsistent_snapset_t(const object_id_t& head) + : object{head} + {} + enum { + SNAPSET_MISSING = 1 << 0, + 
SNAPSET_CORRUPTED = 1 << 1, + CLONE_MISSING = 1 << 2, + SNAP_ERROR = 1 << 3, + HEAD_MISMATCH = 1 << 4, // Unused + HEADLESS_CLONE = 1 << 5, + SIZE_MISMATCH = 1 << 6, + OI_MISSING = 1 << 7, // Old + INFO_MISSING = 1 << 7, + OI_CORRUPTED = 1 << 8, // Old + INFO_CORRUPTED = 1 << 8, + EXTRA_CLONES = 1 << 9, + }; + uint64_t errors = 0; + object_id_t object; + // Extra clones + std::vector clones; + std::vector missing; + ceph::bufferlist ss_bl; + + bool ss_attr_missing() const { // Compatibility + return errors & SNAPSET_MISSING; + } + bool snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool ss_attr_corrupted() const { // Compatibility + return errors & SNAPSET_CORRUPTED; + } + bool snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool clone_missing() const { + return errors & CLONE_MISSING; + } + bool snapset_mismatch() const { // Compatibility + return errors & SNAP_ERROR; + } + bool snapset_error() const { + return errors & SNAP_ERROR; + } + bool head_mismatch() const { // Compatibility + return false; + } + bool headless() const { + return errors & HEADLESS_CLONE; + } + bool size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool oi_attr_missing() const { // Compatibility + return errors & OI_MISSING; + } + bool info_missing() const { + return errors & INFO_MISSING; + } + bool oi_attr_corrupted() const { // Compatibility + return errors & OI_CORRUPTED; + } + bool info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool extra_clones() const { + return errors & EXTRA_CLONES; + } +}; + +/** + * @var all_nspaces + * Pass as nspace argument to IoCtx::set_namespace() + * before calling nobjects_begin() to iterate + * through all objects in all namespaces. 
+ */ +const std::string all_nspaces(LIBRADOS_ALL_NSPACES); + +struct notify_ack_t { + uint64_t notifier_id; + uint64_t cookie; + ceph::bufferlist payload_bl; +}; + +struct notify_timeout_t { + uint64_t notifier_id; + uint64_t cookie; +}; +} +#endif diff --git a/shared/rbd/features.h b/shared/rbd/features.h new file mode 100644 index 0000000000..31c73b38f7 --- /dev/null +++ b/shared/rbd/features.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_FEATURES_H +#define CEPH_RBD_FEATURES_H + +#define RBD_FEATURE_LAYERING (1ULL<<0) +#define RBD_FEATURE_STRIPINGV2 (1ULL<<1) +#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) +#define RBD_FEATURE_OBJECT_MAP (1ULL<<3) +#define RBD_FEATURE_FAST_DIFF (1ULL<<4) +#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) +#define RBD_FEATURE_JOURNALING (1ULL<<6) +#define RBD_FEATURE_DATA_POOL (1ULL<<7) +#define RBD_FEATURE_OPERATIONS (1ULL<<8) +#define RBD_FEATURE_MIGRATING (1ULL<<9) +#define RBD_FEATURE_NON_PRIMARY (1ULL<<10) +#define RBD_FEATURE_DIRTY_CACHE (1ULL<<11) + +#define RBD_FEATURES_DEFAULT (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN) + +#define RBD_FEATURE_NAME_LAYERING "layering" +#define RBD_FEATURE_NAME_STRIPINGV2 "striping" +#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK "exclusive-lock" +#define RBD_FEATURE_NAME_OBJECT_MAP "object-map" +#define RBD_FEATURE_NAME_FAST_DIFF "fast-diff" +#define RBD_FEATURE_NAME_DEEP_FLATTEN "deep-flatten" +#define RBD_FEATURE_NAME_JOURNALING "journaling" +#define RBD_FEATURE_NAME_DATA_POOL "data-pool" +#define RBD_FEATURE_NAME_OPERATIONS "operations" +#define RBD_FEATURE_NAME_MIGRATING "migrating" +#define RBD_FEATURE_NAME_NON_PRIMARY "non-primary" +#define RBD_FEATURE_NAME_DIRTY_CACHE "dirty-cache" + +/// features that make an image inaccessible for read or write by +/// clients that don't understand them +#define 
RBD_FEATURES_INCOMPATIBLE (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that make an image unwritable by clients that don't understand them +#define RBD_FEATURES_RW_INCOMPATIBLE (RBD_FEATURES_INCOMPATIBLE | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING | \ + RBD_FEATURE_NON_PRIMARY) + +#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING | \ + RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that may be dynamically enabled or disabled +#define RBD_FEATURES_MUTABLE (RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +#define RBD_FEATURES_MUTABLE_INTERNAL (RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that may be dynamically disabled +#define RBD_FEATURES_DISABLE_ONLY (RBD_FEATURE_DEEP_FLATTEN) + +/// features that only work when used with a single client +/// using the image for writes +#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that will be implicitly enabled +#define RBD_FEATURES_IMPLICIT_ENABLE (RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING | \ + RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that cannot be controlled by the user +#define 
RBD_FEATURES_INTERNAL (RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING) + +#define RBD_OPERATION_FEATURE_CLONE_PARENT (1ULL<<0) +#define RBD_OPERATION_FEATURE_CLONE_CHILD (1ULL<<1) +#define RBD_OPERATION_FEATURE_GROUP (1ULL<<2) +#define RBD_OPERATION_FEATURE_SNAP_TRASH (1ULL<<3) + +#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent" +#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD "clone-child" +#define RBD_OPERATION_FEATURE_NAME_GROUP "group" +#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH "snap-trash" + +/// all valid operation features +#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \ + RBD_OPERATION_FEATURE_CLONE_CHILD | \ + RBD_OPERATION_FEATURE_GROUP | \ + RBD_OPERATION_FEATURE_SNAP_TRASH) + +#endif diff --git a/shared/rbd/librbd.h b/shared/rbd/librbd.h new file mode 100644 index 0000000000..7ae20e4dd5 --- /dev/null +++ b/shared/rbd/librbd.h @@ -0,0 +1,1549 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_LIBRBD_H +#define CEPH_LIBRBD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif +#include +#include +#include +#include "../rados/librados.h" +#include "features.h" + +#define LIBRBD_VER_MAJOR 1 +#define LIBRBD_VER_MINOR 18 +#define LIBRBD_VER_EXTRA 0 + +#define LIBRBD_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA) + +#define LIBRBD_SUPPORTS_AIO_FLUSH 1 +#define LIBRBD_SUPPORTS_AIO_OPEN 1 +#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE 1 +#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE_IOVEC 1 +#define LIBRBD_SUPPORTS_LOCKING 1 +#define LIBRBD_SUPPORTS_INVALIDATE 1 +#define LIBRBD_SUPPORTS_IOVEC 1 +#define LIBRBD_SUPPORTS_WATCH 0 +#define LIBRBD_SUPPORTS_WRITESAME 1 +#define LIBRBD_SUPPORTS_WRITE_ZEROES 1 +#define LIBRBD_SUPPORTS_ENCRYPTION 1 +#define LIBRBD_SUPPORTS_ENCRYPTION_LOAD2 1 + +#if __GNUC__ >= 4 + #define CEPH_RBD_API __attribute__ ((visibility ("default"))) + #define CEPH_RBD_DEPRECATED __attribute__((deprecated)) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#else + #define CEPH_RBD_API + #define CEPH_RBD_DEPRECATED +#endif + +#define RBD_FLAG_OBJECT_MAP_INVALID (1<<0) +#define RBD_FLAG_FAST_DIFF_INVALID (1<<1) + +#define RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID "" + +typedef void *rbd_image_t; +typedef void *rbd_image_options_t; +typedef void *rbd_pool_stats_t; + +typedef void *rbd_completion_t; +typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg); + +typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr); + +typedef void (*rbd_update_callback_t)(void *arg); + +typedef enum { + RBD_SNAP_NAMESPACE_TYPE_USER = 0, + RBD_SNAP_NAMESPACE_TYPE_GROUP = 1, + RBD_SNAP_NAMESPACE_TYPE_TRASH = 2, + RBD_SNAP_NAMESPACE_TYPE_MIRROR = 3, +} rbd_snap_namespace_type_t; + 
+typedef struct { + char *id; + char *name; +} rbd_image_spec_t; + +typedef struct { + int64_t pool_id; + char *pool_name; + char *pool_namespace; + char *image_id; + char *image_name; + bool trash; +} rbd_linked_image_spec_t; + +typedef struct { + uint64_t id; + rbd_snap_namespace_type_t namespace_type; + char *name; +} rbd_snap_spec_t; + +typedef struct { + uint64_t id; + uint64_t size; + const char *name; +} rbd_snap_info_t; + +typedef struct { + const char *pool_name; + const char *image_name; + const char *image_id; + bool trash; +} rbd_child_info_t; + +#define RBD_MAX_IMAGE_NAME_SIZE 96 +#define RBD_MAX_BLOCK_NAME_SIZE 24 + +#define RBD_SNAP_CREATE_SKIP_QUIESCE (1 << 0) +#define RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR (1 << 1) + +#define RBD_SNAP_REMOVE_UNPROTECT (1 << 0) +#define RBD_SNAP_REMOVE_FLATTEN (1 << 1) +#define RBD_SNAP_REMOVE_FORCE (RBD_SNAP_REMOVE_UNPROTECT | RBD_SNAP_REMOVE_FLATTEN) + +/** + * These types used to in set_image_notification to indicate the type of event + * socket passed in. 
+ */ +enum { + EVENT_TYPE_PIPE = 1, + EVENT_TYPE_EVENTFD = 2 +}; + +typedef struct { + uint64_t size; + uint64_t obj_size; + uint64_t num_objs; + int order; + char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]; /* deprecated */ + int64_t parent_pool; /* deprecated */ + char parent_name[RBD_MAX_IMAGE_NAME_SIZE]; /* deprecated */ +} rbd_image_info_t; + +typedef enum { + RBD_MIRROR_MODE_DISABLED, /* mirroring is disabled */ + RBD_MIRROR_MODE_IMAGE, /* mirroring enabled on a per-image basis */ + RBD_MIRROR_MODE_POOL /* mirroring enabled on all journaled images */ +} rbd_mirror_mode_t; + +typedef enum { + RBD_MIRROR_PEER_DIRECTION_RX = 0, + RBD_MIRROR_PEER_DIRECTION_TX = 1, + RBD_MIRROR_PEER_DIRECTION_RX_TX = 2 +} rbd_mirror_peer_direction_t; + +typedef struct { + char *uuid; + char *cluster_name; + char *client_name; +} rbd_mirror_peer_t CEPH_RBD_DEPRECATED; + +typedef struct { + char *uuid; + rbd_mirror_peer_direction_t direction; + char *site_name; + char *mirror_uuid; + char *client_name; + time_t last_seen; +} rbd_mirror_peer_site_t; + +#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_MON_HOST "mon_host" +#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_KEY "key" + +typedef enum { + RBD_MIRROR_IMAGE_MODE_JOURNAL = 0, + RBD_MIRROR_IMAGE_MODE_SNAPSHOT = 1, +} rbd_mirror_image_mode_t; + +typedef enum { + RBD_MIRROR_IMAGE_DISABLING = 0, + RBD_MIRROR_IMAGE_ENABLED = 1, + RBD_MIRROR_IMAGE_DISABLED = 2 +} rbd_mirror_image_state_t; + +typedef struct { + char *global_id; + rbd_mirror_image_state_t state; + bool primary; +} rbd_mirror_image_info_t; + +typedef enum { + MIRROR_IMAGE_STATUS_STATE_UNKNOWN = 0, + MIRROR_IMAGE_STATUS_STATE_ERROR = 1, + MIRROR_IMAGE_STATUS_STATE_SYNCING = 2, + MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY = 3, + MIRROR_IMAGE_STATUS_STATE_REPLAYING = 4, + MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY = 5, + MIRROR_IMAGE_STATUS_STATE_STOPPED = 6, +} rbd_mirror_image_status_state_t; + +typedef struct { + char *name; + rbd_mirror_image_info_t info; + rbd_mirror_image_status_state_t 
state; + char *description; + time_t last_update; + bool up; +} rbd_mirror_image_status_t CEPH_RBD_DEPRECATED; + +typedef struct { + char *mirror_uuid; + rbd_mirror_image_status_state_t state; + char *description; + time_t last_update; + bool up; +} rbd_mirror_image_site_status_t; + +typedef struct { + char *name; + rbd_mirror_image_info_t info; + uint32_t site_statuses_count; + rbd_mirror_image_site_status_t *site_statuses; +} rbd_mirror_image_global_status_t; + +typedef enum { + RBD_GROUP_IMAGE_STATE_ATTACHED, + RBD_GROUP_IMAGE_STATE_INCOMPLETE +} rbd_group_image_state_t; + +typedef struct { + char *name; + int64_t pool; + rbd_group_image_state_t state; +} rbd_group_image_info_t; + +typedef struct { + char *name; + int64_t pool; +} rbd_group_info_t; + +typedef enum { + RBD_GROUP_SNAP_STATE_INCOMPLETE, + RBD_GROUP_SNAP_STATE_COMPLETE +} rbd_group_snap_state_t; + +typedef struct { + char *name; + rbd_group_snap_state_t state; +} rbd_group_snap_info_t; + +typedef struct { + int64_t group_pool; + char *group_name; + char *group_snap_name; +} rbd_snap_group_namespace_t; + +typedef enum { + RBD_SNAP_MIRROR_STATE_PRIMARY, + RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED, + RBD_SNAP_MIRROR_STATE_NON_PRIMARY, + RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED +} rbd_snap_mirror_state_t; + +typedef struct { + rbd_snap_mirror_state_t state; + size_t mirror_peer_uuids_count; + char *mirror_peer_uuids; + bool complete; + char *primary_mirror_uuid; + uint64_t primary_snap_id; + uint64_t last_copied_object_number; +} rbd_snap_mirror_namespace_t; + +typedef enum { + RBD_LOCK_MODE_EXCLUSIVE = 0, + RBD_LOCK_MODE_SHARED = 1, +} rbd_lock_mode_t; + +CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra); + +/* image options */ +enum { + RBD_IMAGE_OPTION_FORMAT = 0, + RBD_IMAGE_OPTION_FEATURES = 1, + RBD_IMAGE_OPTION_ORDER = 2, + RBD_IMAGE_OPTION_STRIPE_UNIT = 3, + RBD_IMAGE_OPTION_STRIPE_COUNT = 4, + RBD_IMAGE_OPTION_JOURNAL_ORDER = 5, + RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6, + 
RBD_IMAGE_OPTION_JOURNAL_POOL = 7, + RBD_IMAGE_OPTION_FEATURES_SET = 8, + RBD_IMAGE_OPTION_FEATURES_CLEAR = 9, + RBD_IMAGE_OPTION_DATA_POOL = 10, + RBD_IMAGE_OPTION_FLATTEN = 11, + RBD_IMAGE_OPTION_CLONE_FORMAT = 12, + RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE = 13, +}; + +typedef enum { + RBD_TRASH_IMAGE_SOURCE_USER = 0, + RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1, + RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2, + RBD_TRASH_IMAGE_SOURCE_REMOVING = 3, + RBD_TRASH_IMAGE_SOURCE_USER_PARENT = 4, +} rbd_trash_image_source_t; + +typedef struct { + char *id; + char *name; + rbd_trash_image_source_t source; + time_t deletion_time; + time_t deferment_end_time; +} rbd_trash_image_info_t; + +typedef struct { + char *addr; + int64_t id; + uint64_t cookie; +} rbd_image_watcher_t; + +typedef enum { + RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1, + RBD_IMAGE_MIGRATION_STATE_ERROR = 0, + RBD_IMAGE_MIGRATION_STATE_PREPARING = 1, + RBD_IMAGE_MIGRATION_STATE_PREPARED = 2, + RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3, + RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4, + RBD_IMAGE_MIGRATION_STATE_ABORTING = 5, +} rbd_image_migration_state_t; + +typedef struct { + int64_t source_pool_id; + char *source_pool_namespace; + char *source_image_name; + char *source_image_id; + int64_t dest_pool_id; + char *dest_pool_namespace; + char *dest_image_name; + char *dest_image_id; + rbd_image_migration_state_t state; + char *state_description; +} rbd_image_migration_status_t; + +typedef enum { + RBD_CONFIG_SOURCE_CONFIG = 0, + RBD_CONFIG_SOURCE_POOL = 1, + RBD_CONFIG_SOURCE_IMAGE = 2, +} rbd_config_source_t; + +typedef struct { + char *name; + char *value; + rbd_config_source_t source; +} rbd_config_option_t; + +typedef enum { + RBD_POOL_STAT_OPTION_IMAGES, + RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, + RBD_POOL_STAT_OPTION_TRASH_IMAGES, + RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES, + 
RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS +} rbd_pool_stat_option_t; + +/* rbd_write_zeroes / rbd_aio_write_zeroes flags */ +enum { + RBD_WRITE_ZEROES_FLAG_THICK_PROVISION = (1U<<0), /* fully allocated zeroed extent */ +}; + +typedef enum { + RBD_ENCRYPTION_FORMAT_LUKS1 = 0, + RBD_ENCRYPTION_FORMAT_LUKS2 = 1, + RBD_ENCRYPTION_FORMAT_LUKS = 2 +} rbd_encryption_format_t; + +typedef enum { + RBD_ENCRYPTION_ALGORITHM_AES128 = 0, + RBD_ENCRYPTION_ALGORITHM_AES256 = 1 +} rbd_encryption_algorithm_t; + +typedef void *rbd_encryption_options_t; + +typedef struct { + rbd_encryption_format_t format; + rbd_encryption_options_t opts; + size_t opts_size; +} rbd_encryption_spec_t; + +typedef struct { + rbd_encryption_algorithm_t alg; + const char* passphrase; + size_t passphrase_size; +} rbd_encryption_luks1_format_options_t; + +typedef struct { + rbd_encryption_algorithm_t alg; + const char* passphrase; + size_t passphrase_size; +} rbd_encryption_luks2_format_options_t; + +typedef struct { + const char* passphrase; + size_t passphrase_size; +} rbd_encryption_luks_format_options_t; + +CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts); +CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts); +CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts, + int optname, const char* optval); +CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts, + int optname, uint64_t optval); +CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts, + int optname, char* optval, + size_t maxlen); +CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts, + int optname, uint64_t* optval); +CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts, + int optname, bool* is_set); +CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname); +CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts); +CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts); + +/* 
helpers */ +CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image); +CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images, + size_t num_images); +CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image); +CEPH_RBD_API void rbd_linked_image_spec_list_cleanup( + rbd_linked_image_spec_t *images, size_t num_images); +CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap); + +/* images */ +CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images, + size_t *max_images); + +CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size, + int *order); +CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size, + uint64_t features, int *order); +/** + * create new rbd image + * + * The stripe_unit must be a factor of the object size (1 << order). + * The stripe_count can be one (no intra-object striping) or greater + * than one. The RBD_FEATURE_STRIPINGV2 must be specified if the + * stripe_unit != the object size and the stripe_count is != 1. + * + * @param io ioctx + * @param name image name + * @param size image size in bytes + * @param features initial feature bits + * @param order object/block size, as a power of two (object size == 1 << order) + * @param stripe_unit stripe unit size, in bytes. 
+ * @param stripe_count number of objects to stripe over before looping + * @return 0 on success, or negative error code + */ +CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size, + uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); +CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size, + rbd_image_options_t opts); +CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order); +CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count); +CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, rbd_image_options_t c_opts); +CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name); +CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname, + const char *destname); + +CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name, + uint64_t delay); +CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id, + rbd_trash_image_info_t *info); +CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info); +CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io, + rbd_trash_image_info_t *trash_entries, + size_t *num_entries); +CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries, + size_t num_entries); +CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold); +CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts, + float threshold, librbd_progress_fn_t cb, + void* cbdata); +CEPH_RBD_API int 
rbd_trash_remove(rados_ioctx_t io, const char *id, bool force); +CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io, + const char *id, + bool force, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id, + const char *name); + +/* migration */ +CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx, + const char *image_name, + rados_ioctx_t dest_ioctx, + const char *dest_image_name, + rbd_image_options_t opts); +CEPH_RBD_API int rbd_migration_prepare_import( + const char *source_spec, rados_ioctx_t dest_ioctx, + const char *dest_image_name, rbd_image_options_t opts); +CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx, + const char *image_name, + rbd_image_migration_status_t *status, + size_t status_size); +CEPH_RBD_API void rbd_migration_status_cleanup( + rbd_image_migration_status_t *status); + +/* pool mirroring */ +CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster, + char *name, size_t *max_len); +CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster, + const char *name); + +CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx, + rbd_mirror_mode_t *mirror_mode); +CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx, + rbd_mirror_mode_t mirror_mode); + +CEPH_RBD_API int 
rbd_mirror_uuid_get(rados_ioctx_t io_ctx, + char *uuid, size_t *max_len); + +CEPH_RBD_API int rbd_mirror_peer_bootstrap_create( + rados_ioctx_t io_ctx, char *token, size_t *max_len); +CEPH_RBD_API int rbd_mirror_peer_bootstrap_import( + rados_ioctx_t io_ctx, rbd_mirror_peer_direction_t direction, + const char *token); + +CEPH_RBD_API int rbd_mirror_peer_site_add( + rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length, + rbd_mirror_peer_direction_t direction, const char *site_name, + const char *client_name); +CEPH_RBD_API int rbd_mirror_peer_site_set_name( + rados_ioctx_t io_ctx, const char *uuid, const char *site_name); +CEPH_RBD_API int rbd_mirror_peer_site_set_client_name( + rados_ioctx_t io_ctx, const char *uuid, const char *client_name); +CEPH_RBD_API int rbd_mirror_peer_site_set_direction( + rados_ioctx_t io_ctx, const char *uuid, + rbd_mirror_peer_direction_t direction); +CEPH_RBD_API int rbd_mirror_peer_site_remove( + rados_ioctx_t io_ctx, const char *uuid); +CEPH_RBD_API int rbd_mirror_peer_site_list( + rados_ioctx_t io_ctx, rbd_mirror_peer_site_t *peers, int *max_peers); +CEPH_RBD_API void rbd_mirror_peer_site_list_cleanup( + rbd_mirror_peer_site_t *peers, int max_peers); +CEPH_RBD_API int rbd_mirror_peer_site_get_attributes( + rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len, + char *values, size_t *max_value_len, size_t *key_value_count); +CEPH_RBD_API int rbd_mirror_peer_site_set_attributes( + rados_ioctx_t p, const char *uuid, const char *keys, const char *values, + size_t key_value_count); + +CEPH_RBD_API int rbd_mirror_image_global_status_list( + rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_global_status_t *images, size_t *len); +CEPH_RBD_API void rbd_mirror_image_global_status_list_cleanup( + char **image_ids, rbd_mirror_image_global_status_t *images, size_t len); + +/* rbd_mirror_peer_ commands are deprecated to rbd_mirror_peer_site_ + * equivalents */ +CEPH_RBD_API int 
rbd_mirror_peer_add( + rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length, + const char *cluster_name, const char *client_name) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_remove( + rados_ioctx_t io_ctx, const char *uuid) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_list( + rados_ioctx_t io_ctx, rbd_mirror_peer_t *peers, int *max_peers) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_mirror_peer_list_cleanup( + rbd_mirror_peer_t *peers, int max_peers) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_set_client( + rados_ioctx_t io_ctx, const char *uuid, const char *client_name) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_set_cluster( + rados_ioctx_t io_ctx, const char *uuid, const char *cluster_name) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_get_attributes( + rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len, + char *values, size_t *max_value_len, size_t *key_value_count) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_set_attributes( + rados_ioctx_t p, const char *uuid, const char *keys, const char *values, + size_t key_value_count) + CEPH_RBD_DEPRECATED; + +/* rbd_mirror_image_status_list_ commands are deprecard to + * rbd_mirror_image_global_status_list_ commands */ + +CEPH_RBD_API int rbd_mirror_image_status_list( + rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_status_t *images, size_t *len) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_mirror_image_status_list_cleanup( + char **image_ids, rbd_mirror_image_status_t *images, size_t len) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_mirror_image_status_summary( + rados_ioctx_t io_ctx, rbd_mirror_image_status_state_t *states, int *counts, + size_t *maxlen); + +CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx, + const char *start_id, + size_t max, char **image_ids, + char **instance_ids, + size_t *len); +CEPH_RBD_API void 
rbd_mirror_image_instance_id_list_cleanup(char **image_ids, + char **instance_ids, + size_t len); +CEPH_RBD_API int rbd_mirror_image_info_list( + rados_ioctx_t io_ctx, rbd_mirror_image_mode_t *mode_filter, + const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_mode_t *mode_entries, + rbd_mirror_image_info_t *info_entries, size_t *num_entries); +CEPH_RBD_API void rbd_mirror_image_info_list_cleanup( + char **image_ids, rbd_mirror_image_info_t *info_entries, + size_t num_entries); + +/* pool metadata */ +CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key, + char *value, size_t *val_len); +CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t io_ctx, const char *key, + const char *value); +CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx, + const char *key); +CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start, + uint64_t max, char *keys, + size_t *key_len, char *values, + size_t *vals_len); + +CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx, + rbd_config_option_t *options, + int *max_options); +CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options, + int max_options); + +CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name); + +CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); + +/** + * Open an image in read-only mode. + * + * This is intended for use by clients that cannot write to a block + * device due to cephx restrictions. There will be no watch + * established on the header object, since a watch is a write. 
This + * means the metadata reported about this image (parents, snapshots, + * size, etc.) may become stale. This should not be used for + * long-running operations, unless you can be sure that one of these + * properties changing is safe. + * + * Attempting to write to a read-only image will return -EROFS. + * + * @param io ioctx to determine the pool the image is in + * @param name image name + * @param image where to store newly opened image handle + * @param snap_name name of snapshot to open at, or NULL for no snapshot + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_features_to_string(uint64_t features, char *str_features, + size_t *size); +CEPH_RBD_API int rbd_features_from_string(const char *str_features, uint64_t *features); +CEPH_RBD_API int rbd_close(rbd_image_t image); +CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c); +CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size); +CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info, + size_t infosize); +CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old); +CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size); +CEPH_RBD_API int 
rbd_get_features(rbd_image_t image, uint64_t *features); +CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features, + uint8_t enabled); +CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features); +CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit); +CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image, + uint64_t *stripe_count); + +CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image, + struct timespec *timestamp); +CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image, + struct timespec *timestamp); +CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image, + struct timespec *timestamp); + +CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap); +CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len); +CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len); +CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image, + char *prefix, size_t prefix_len); +CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image); + +CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image, + char *parent_poolname, size_t ppoolnamelen, + char *parent_name, size_t pnamelen, + char *parent_snapname, + size_t psnapnamelen) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image, + char *parent_poolname, + size_t ppoolnamelen, + char *parent_name, size_t pnamelen, + char *parent_id, size_t pidlen, + char *parent_snapname, + size_t psnapnamelen) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_get_parent(rbd_image_t image, + rbd_linked_image_spec_t *parent_image, + rbd_snap_spec_t *parent_snap); + +CEPH_RBD_API int rbd_get_migration_source_spec(rbd_image_t image, + char* source_spec, + size_t* max_len); + +CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags); +CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info, + size_t group_info_size); +CEPH_RBD_API int 
rbd_set_image_notification(rbd_image_t image, int fd, int type); + +/* exclusive lock feature */ +CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner); +CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode); +CEPH_RBD_API int rbd_lock_release(rbd_image_t image); +CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image, + rbd_lock_mode_t *lock_mode, + char **lock_owners, + size_t *max_lock_owners); +CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners, + size_t lock_owner_count); +CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode, + const char *lock_owner); + +/* object map feature */ +CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata); + +CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx, + const char *destname); +CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest); +CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, rbd_image_options_t dest_opts); +CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, rbd_image_options_t dest_opts, + size_t sparse_size); +CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, void *cbdata, + size_t sparse_size); + +/* deep copy */ +CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char 
*destname, + rbd_image_options_t dest_opts); +CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image, + rados_ioctx_t dest_io_ctx, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, + void *cbdata); + +/* encryption */ + +/* + * Format the image using the encryption spec specified by + * (format, opts, opts_size) tuple. + * + * For a flat (i.e. non-cloned) image, the new encryption is loaded + * implicitly, calling rbd_encryption_load() afterwards is not needed. + * If existing encryption is already loaded, it is automatically + * replaced with the new encryption. + * + * For a cloned image, the new encryption must be loaded explicitly. + * Existing encryption (if any) must not be loaded. + */ +CEPH_RBD_API int rbd_encryption_format(rbd_image_t image, + rbd_encryption_format_t format, + rbd_encryption_options_t opts, + size_t opts_size); +/* + * Load the encryption spec specified by (format, opts, opts_size) + * tuple for the image and all ancestor images. If an ancestor image + * which does not match any encryption format known to librbd is + * encountered, it - along with remaining ancestor images - is + * interpreted as plaintext. + */ +CEPH_RBD_API int rbd_encryption_load(rbd_image_t image, + rbd_encryption_format_t format, + rbd_encryption_options_t opts, + size_t opts_size); +/* + * Load encryption specs. The first spec in the passed array is + * applied to the image itself, the second spec is applied to its + * ancestor image, the third spec is applied to the ancestor of + * that ancestor image and so on. + * + * If not enough specs are passed, the last spec is reused exactly as + * in rbd_encryption_load(). If an ancestor image for which the last + * spec is being reused turns out to not match any encryption format + * known to librbd, it - along with remaining ancestor images - is + * interpreted as plaintext. 
+ */ +CEPH_RBD_API int rbd_encryption_load2(rbd_image_t image, + const rbd_encryption_spec_t *specs, + size_t spec_count); + +/* snapshots */ +CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps, + int *max_snaps); +CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps); +CEPH_RBD_API int rbd_snap_exists(rbd_image_t image, const char *snapname, bool *exists); +CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_create2(rbd_image_t image, const char *snap_name, + uint32_t flags, librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, const char *snap_name, + uint32_t flags, librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id); +CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image, + const char *snapname, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname, + const char* dstsnapsname); +/** + * Prevent a snapshot from being deleted until it is unprotected. + * + * @param snap_name which snapshot to protect + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if snap is already protected + */ +CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name); +/** + * Allow a snaphshot to be deleted. + * + * @param snap_name which snapshot to unprotect + * @returns 0 on success, negative error code on failure + * @returns -EINVAL if snap is not protected + */ +CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name); +/** + * Determine whether a snapshot is protected. 
+ * + * @param snap_name which snapshot query + * @param is_protected where to store the result (0 or 1) + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name, + int *is_protected); +/** + * Get the current snapshot limit for an image. If no limit is set, + * UINT64_MAX is returned. + * + * @param limit pointer where the limit will be stored on success + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit); + +/** + * Set a limit for the number of snapshots that may be taken of an image. + * + * @param limit the maximum number of snapshots allowed in the future. + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit); + +/** + * Get the timestamp of a snapshot for an image. + * + * @param snap_id the snap id of a snapshot of input image. + * @param timestamp the timestamp of input snapshot. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp); + +CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id); +CEPH_RBD_API int rbd_snap_get_name(rbd_image_t image, uint64_t snap_id, char *snapname, size_t *name_len); +CEPH_RBD_API int rbd_snap_get_id(rbd_image_t image, const char *snapname, uint64_t *snap_id); + +CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image, + uint64_t snap_id, + rbd_snap_namespace_type_t *namespace_type); +CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image, + uint64_t snap_id, + rbd_snap_group_namespace_t *group_snap, + size_t group_snap_size); +CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap, + size_t group_snap_size); +CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image, + uint64_t snap_id, + char* original_name, + size_t max_length); +CEPH_RBD_API int rbd_snap_get_mirror_namespace( + rbd_image_t image, uint64_t snap_id, + rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size); +CEPH_RBD_API int rbd_snap_mirror_namespace_cleanup( + rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size); + +CEPH_RBD_API int rbd_flatten(rbd_image_t image); + +CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image, + librbd_progress_fn_t cb, + void *cbdata); + +CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size); + +CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image, + size_t sparse_size, + librbd_progress_fn_t cb, + void *cbdata); + +/** + * List all images that are cloned from the image at the + * snapshot that is set via rbd_snap_set(). + * + * This iterates over all pools, so it should be run by a user with + * read access to all of them. 
pools_len and images_len are filled in + * with the number of bytes put into the pools and images buffers. + * + * If the provided buffers are too short, the required lengths are + * still filled in, but the data is not and -ERANGE is returned. + * Otherwise, the buffers are filled with the pool and image names + * of the children, with a '\0' after each. + * + * @param image which image (and implicitly snapshot) to list clones of + * @param pools buffer in which to store pool names + * @param pools_len number of bytes in pools buffer + * @param images buffer in which to store image names + * @param images_len number of bytes in images buffer + * @returns number of children on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools, + size_t *pools_len, char *images, + size_t *images_len) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_list_children2(rbd_image_t image, + rbd_child_info_t *children, + int *max_children) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children, + size_t num_children) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_list_children3(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images); + +CEPH_RBD_API int rbd_list_descendants(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images); + +/** + * @defgroup librbd_h_locking Advisory Locking + * + * An rbd image may be locking exclusively, or shared, to facilitate + * e.g. live migration where the image may be open in two places at once. + * These locks are intended to guard against more than one client + * writing to an image without coordination. They don't need to + * be used for snapshots, since snapshots are read-only. + * + * Currently locks only guard against locks being acquired. 
+ * They do not prevent anything else. + * + * A locker is identified by the internal rados client id of the + * holder and a user-defined cookie. This (client id, cookie) pair + * must be unique for each locker. + * + * A shared lock also has a user-defined tag associated with it. Each + * additional shared lock must specify the same tag or lock + * acquisition will fail. This can be used by e.g. groups of hosts + * using a clustered filesystem on top of an rbd image to make sure + * they're accessing the correct image. + * + * @{ + */ +/** + * List clients that have locked the image and information about the lock. + * + * The number of bytes required in each buffer is put in the + * corresponding size out parameter. If any of the provided buffers + * are too short, -ERANGE is returned after these sizes are filled in. + * + * @param exclusive where to store whether the lock is exclusive (1) or shared (0) + * @param tag where to store the tag associated with the image + * @param tag_len number of bytes in tag buffer + * @param clients buffer in which locker clients are stored, separated by '\0' + * @param clients_len number of bytes in the clients buffer + * @param cookies buffer in which locker cookies are stored, separated by '\0' + * @param cookies_len number of bytes in the cookies buffer + * @param addrs buffer in which locker addresses are stored, separated by '\0' + * @param addrs_len number of bytes in the clients buffer + * @returns number of lockers on success, negative error code on failure + * @returns -ERANGE if any of the buffers are too short + */ +CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len); + +/** + * Take an exclusive lock on the image. 
+ * + * @param image the image to lock + * @param cookie user-defined identifier for this instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie); + +/** + * Take a shared lock on the image. + * + * Other clients may also take a shared lock, as lock as they use the + * same tag. + * + * @param image the image to lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag user-defined identifier for this shared use of the lock + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie, + const char *tag); + +/** + * Release a shared or exclusive lock on the image. + * + * @param image the image to unlock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie); + +/** + * Release a shared or exclusive lock that was taken by the specified client. 
+ * + * @param image the image to unlock + * @param client the entity holding the lock (as given by rbd_list_lockers()) + * @param cookie user-defined identifier for the instance of the lock to break + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client, + const char *cookie); + +/** @} locking */ + +/* I/O */ +CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len, + char *buf); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len, + char *buf, int op_flags); +/* DEPRECATED; use rbd_read_iterate2 */ +CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); + +/** + * iterate read over an image + * + * Reads each region of the image and calls the callback. If the + * buffer pointer passed to the callback is NULL, the given extent is + * defined to be zeros (a hole). Normally the granularity for the + * callback is the image stripe size. + * + * @param image image to read + * @param ofs offset to start from + * @param len bytes of source image to cover + * @param cb callback for each region + * @returns 0 success, error otherwise + */ +CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); +/** + * get difference between two versions of an image + * + * This will return the differences between two versions of an image + * via a callback, which gets the offset and length and a flag + * indicating whether the extent exists (1), or is known/defined to + * be zeros (a hole, 0). 
If the source snapshot name is NULL, we + * interpret that as the beginning of time and return all allocated + * regions of the image. The end version is whatever is currently + * selected for the image handle (either a snapshot or the writeable + * head). + * + * @param fromsnapname start snapshot name, or NULL + * @param ofs start offset + * @param len len in bytes of region to report on + * @param include_parent 1 if full history diff should include parent + * @param whole_object 1 if diff extents should cover whole object + * @param cb callback to call for each allocated region + * @param arg argument to pass to the callback + * @returns 0 on success, or negative error code on error + */ +CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), + void *arg); +CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + uint8_t include_parent, uint8_t whole_object, + int (*cb)(uint64_t, size_t, int, void *), + void *arg); +CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, int op_flags); +CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len); +CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, size_t data_len, + int op_flags); +CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, + size_t len, int zero_flags, + int op_flags); +CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs, + size_t len, const char *cmp_buf, + const char *buf, + uint64_t *mismatch_off, + int op_flags); + +CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, + const char *buf, 
rbd_completion_t c); + +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len, + const char *buf, rbd_completion_t c, + int op_flags); +CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, + const char *buf, size_t data_len, + rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, + size_t len, rbd_completion_t c, + int zero_flags, int op_flags); +CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image, + uint64_t off, size_t len, + const char *cmp_buf, + const char *buf, + rbd_completion_t c, + uint64_t *mismatch_off, + int op_flags); +CEPH_RBD_API ssize_t rbd_aio_compare_and_writev(rbd_image_t image, + uint64_t off, + const struct iovec *cmp_iov, + int cmp_iovcnt, + const struct iovec *iov, + int iovcnt, + rbd_completion_t c, + uint64_t *mismatch_off, + int op_flags); + +CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg, + rbd_callback_t complete_cb, + rbd_completion_t *c); +CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c); +CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c); +CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c); +CEPH_RBD_API void 
*rbd_aio_get_arg(rbd_completion_t c); +CEPH_RBD_API void rbd_aio_release(rbd_completion_t c); +CEPH_RBD_API int rbd_flush(rbd_image_t image); +/** + * Start a flush if caching is enabled. Get a callback when + * the currently pending writes are on disk. + * + * @param image the image to flush writes to + * @param c what to call when flushing is complete + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c); + +/** + * Drop any cached data for an image + * + * @param image the image to invalidate cached data for + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image); + +CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp); + +CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len); +CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value); +CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key); +/** + * List all metadatas associated with this image. + * + * This iterates over all metadatas, key_len and val_len are filled in + * with the number of bytes put into the keys and values buffers. + * + * If the provided buffers are too short, the required lengths are + * still filled in, but the data is not and -ERANGE is returned. + * Otherwise, the buffers are filled with the keys and values + * of the image, with a '\0' after each. 
+ * + * @param image which image (and implicitly snapshot) to list metadata of + * @param start metadata key to begin listing after + * (use the empty string to start at the beginning) + * @param max the maximum number of keys to list (if 0 means no limit) + * @param keys buffer in which to store metadata keys + * @param key_len number of bytes in keys buffer + * @param values buffer in which to store metadata values + * @param vals_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max, + char *keys, size_t *key_len, char *values, size_t *vals_len); + +// RBD image mirroring support functions +CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image) CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_image_enable2(rbd_image_t image, + rbd_mirror_image_mode_t mode); +CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force); +CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force); +CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_create_snapshot(rbd_image_t image, + uint64_t *snap_id); +CEPH_RBD_API int rbd_mirror_image_create_snapshot2(rbd_image_t image, + uint32_t flags, + uint64_t *snap_id); +CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size); +CEPH_RBD_API void rbd_mirror_image_get_info_cleanup( + rbd_mirror_image_info_t *mirror_image_info); +CEPH_RBD_API int rbd_mirror_image_get_mode(rbd_image_t image, + rbd_mirror_image_mode_t *mode); + +CEPH_RBD_API int rbd_mirror_image_get_global_status( + rbd_image_t image, + rbd_mirror_image_global_status_t *mirror_image_global_status, + size_t status_size); +CEPH_RBD_API void 
rbd_mirror_image_global_status_cleanup( + rbd_mirror_image_global_status_t *mirror_image_global_status); + +CEPH_RBD_API int rbd_mirror_image_get_status( + rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status, + size_t status_size) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image, + char *instance_id, + size_t *id_max_length); +CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_mode(rbd_image_t image, + rbd_mirror_image_mode_t *mode, + rbd_completion_t c); + +CEPH_RBD_API int rbd_aio_mirror_image_get_global_status( + rbd_image_t image, + rbd_mirror_image_global_status_t *mirror_global_image_status, + size_t status_size, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_status( + rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status, + size_t status_size, rbd_completion_t c) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_aio_mirror_image_create_snapshot(rbd_image_t image, + uint32_t flags, + uint64_t *snap_id, + rbd_completion_t c); + +// RBD groups support functions +CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name); +CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name); +CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size); +CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name, + const char *dest_name); +CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info, + size_t group_info_size); + +/** + * Register an image metadata change watcher. 
+ * + * @param image the image to watch + * @param handle where to store the internal id assigned to this watch + * @param watch_cb what to do when a notify is received on this image + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle, + rbd_update_callback_t watch_cb, void *arg); + +/** + * Unregister an image watcher. + * + * @param image the image to unwatch + * @param handle which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle); + +/** + * List any watchers of an image. + * + * Watchers will be allocated and stored in the passed watchers array. If there + * are more watchers than max_watchers, -ERANGE will be returned and the number + * of watchers will be stored in max_watchers. + * + * The caller should call rbd_watchers_list_cleanup when finished with the list + * of watchers. + * + * @param image the image to list watchers for. + * @param watchers an array to store watchers in. + * @param max_watchers capacity of the watchers array. + * @returns 0 on success, negative error code on failure. + * @returns -ERANGE if there are too many watchers for the passed array. + * @returns the number of watchers in max_watchers. 
+ */ +CEPH_RBD_API int rbd_watchers_list(rbd_image_t image, + rbd_image_watcher_t *watchers, + size_t *max_watchers); + +CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers, + size_t num_watchers); + +CEPH_RBD_API int rbd_config_image_list(rbd_image_t image, + rbd_config_option_t *options, + int *max_options); +CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options, + int max_options); + +CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name); +CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name); +CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_id); +CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t *num_entries); +CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t num_entries); + +CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_create2(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + uint32_t flags); +CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p, + const char *group_name, + const char *old_snap_name, + const char *new_snap_name); +CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t *num_entries); +CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t num_entries); 
+CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + librbd_progress_fn_t cb, + void *cbdata); + +CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io, + const char *namespace_name); +CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io, + const char *namespace_name); +CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names, + size_t *size); +CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io, + const char *namespace_name, + bool *exists); + +CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force); + +CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats); +CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats); +CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats, + int stat_option, + uint64_t* stat_val); +CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats); + +/** + * Register a quiesce/unquiesce watcher. + * + * @param image the image to watch + * @param quiesce_cb what to do when librbd wants to quiesce + * @param unquiesce_cb what to do when librbd wants to unquiesce + * @param arg opaque value to pass to the callbacks + * @param handle where to store the internal id assigned to this watch + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_quiesce_watch(rbd_image_t image, + rbd_update_callback_t quiesce_cb, + rbd_update_callback_t unquiesce_cb, + void *arg, uint64_t *handle); + +/** + * Notify quiesce is complete + * + * @param image the image to notify + * @param handle which watch is complete + * @param r the return code + */ +CEPH_RBD_API void rbd_quiesce_complete(rbd_image_t image, uint64_t handle, + int r); + +/** + * Unregister a quiesce/unquiesce watcher. 
+ * + * @param image the image to unwatch + * @param handle which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_quiesce_unwatch(rbd_image_t image, uint64_t handle); + +#if __GNUC__ >= 4 + #pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_LIBRBD_H */ diff --git a/shared/rbd/librbd.hpp b/shared/rbd/librbd.hpp new file mode 100644 index 0000000000..5d307cdedf --- /dev/null +++ b/shared/rbd/librbd.hpp @@ -0,0 +1,869 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __LIBRBD_HPP +#define __LIBRBD_HPP + +#include +#include +#include +#include +#include "../rados/buffer.h" +#include "../rados/librados.hpp" +#include "librbd.h" + +#if __GNUC__ >= 4 + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +namespace librbd { + + using librados::IoCtx; + + class Image; + class ImageOptions; + class PoolStats; + typedef void *image_ctx_t; + typedef void *completion_t; + typedef void (*callback_t)(completion_t cb, void *arg); + + typedef struct { + std::string id; + std::string name; + } image_spec_t; + + typedef struct { + int64_t pool_id; + std::string pool_name; + std::string pool_namespace; + std::string image_id; + std::string image_name; + bool trash; + } linked_image_spec_t; + + typedef rbd_snap_namespace_type_t snap_namespace_type_t; + + typedef struct { + uint64_t id; + snap_namespace_type_t namespace_type; + std::string name; + } snap_spec_t; + + typedef struct { + uint64_t id; + uint64_t size; + std::string name; + } snap_info_t; + + typedef struct { + 
int64_t group_pool; + std::string group_name; + std::string group_snap_name; + } snap_group_namespace_t; + + typedef rbd_snap_mirror_state_t snap_mirror_state_t; + + typedef struct { + snap_mirror_state_t state; + std::set mirror_peer_uuids; + bool complete; + std::string primary_mirror_uuid; + uint64_t primary_snap_id; + uint64_t last_copied_object_number; + } snap_mirror_namespace_t; + + typedef struct { + std::string client; + std::string cookie; + std::string address; + } locker_t; + + typedef rbd_mirror_peer_direction_t mirror_peer_direction_t; + + typedef struct { + std::string uuid; + std::string cluster_name; + std::string client_name; + } mirror_peer_t CEPH_RBD_DEPRECATED; + + typedef struct { + std::string uuid; + mirror_peer_direction_t direction; + std::string site_name; + std::string mirror_uuid; + std::string client_name; + time_t last_seen; + } mirror_peer_site_t; + + typedef rbd_mirror_image_mode_t mirror_image_mode_t; + typedef rbd_mirror_image_state_t mirror_image_state_t; + + typedef struct { + std::string global_id; + mirror_image_state_t state; + bool primary; + } mirror_image_info_t; + + typedef rbd_mirror_image_status_state_t mirror_image_status_state_t; + + typedef struct { + std::string name; + mirror_image_info_t info; + mirror_image_status_state_t state; + std::string description; + time_t last_update; + bool up; + } mirror_image_status_t CEPH_RBD_DEPRECATED; + + typedef struct { + std::string mirror_uuid; + mirror_image_status_state_t state; + std::string description; + time_t last_update; + bool up; + } mirror_image_site_status_t; + + typedef struct { + std::string name; + mirror_image_info_t info; + std::vector site_statuses; + } mirror_image_global_status_t; + + typedef rbd_group_image_state_t group_image_state_t; + + typedef struct { + std::string name; + int64_t pool; + group_image_state_t state; + } group_image_info_t; + + typedef struct { + std::string name; + int64_t pool; + } group_info_t; + + typedef rbd_group_snap_state_t 
group_snap_state_t; + + typedef struct { + std::string name; + group_snap_state_t state; + } group_snap_info_t; + + typedef rbd_image_info_t image_info_t; + + class CEPH_RBD_API ProgressContext + { + public: + virtual ~ProgressContext(); + virtual int update_progress(uint64_t offset, uint64_t total) = 0; + }; + + typedef struct { + std::string id; + std::string name; + rbd_trash_image_source_t source; + time_t deletion_time; + time_t deferment_end_time; + } trash_image_info_t; + + typedef struct { + std::string pool_name; + std::string image_name; + std::string image_id; + bool trash; + } child_info_t; + + typedef struct { + std::string addr; + int64_t id; + uint64_t cookie; + } image_watcher_t; + + typedef rbd_image_migration_state_t image_migration_state_t; + + typedef struct { + int64_t source_pool_id; + std::string source_pool_namespace; + std::string source_image_name; + std::string source_image_id; + int64_t dest_pool_id; + std::string dest_pool_namespace; + std::string dest_image_name; + std::string dest_image_id; + image_migration_state_t state; + std::string state_description; + } image_migration_status_t; + + typedef rbd_config_source_t config_source_t; + + typedef struct { + std::string name; + std::string value; + config_source_t source; + } config_option_t; + + typedef rbd_encryption_format_t encryption_format_t; + typedef rbd_encryption_algorithm_t encryption_algorithm_t; + typedef rbd_encryption_options_t encryption_options_t; + typedef rbd_encryption_spec_t encryption_spec_t; + + typedef struct { + encryption_algorithm_t alg; + std::string passphrase; + } encryption_luks1_format_options_t; + + typedef struct { + encryption_algorithm_t alg; + std::string passphrase; + } encryption_luks2_format_options_t; + + typedef struct { + std::string passphrase; + } encryption_luks_format_options_t; + +class CEPH_RBD_API RBD +{ +public: + RBD(); + ~RBD(); + + // This must be dynamically allocated with new, and + // must be released with release(). 
+ // Do not use delete. + struct AioCompletion { + void *pc; + AioCompletion(void *cb_arg, callback_t complete_cb); + bool is_complete(); + int wait_for_complete(); + ssize_t get_return_value(); + void *get_arg(); + void release(); + }; + + void version(int *major, int *minor, int *extra); + + int open(IoCtx& io_ctx, Image& image, const char *name); + int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname); + int open_by_id(IoCtx& io_ctx, Image& image, const char *id); + int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname); + int aio_open(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname, RBD::AioCompletion *c); + int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname, RBD::AioCompletion *c); + // see librbd.h + int open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname); + int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname); + int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname, RBD::AioCompletion *c); + int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname, RBD::AioCompletion *c); + int features_to_string(uint64_t features, std::string *str_features); + int features_from_string(const std::string str_features, uint64_t *features); + + int list(IoCtx& io_ctx, std::vector& names) + CEPH_RBD_DEPRECATED; + int list2(IoCtx& io_ctx, std::vector* images); + + int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order); + int create2(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order); + int create3(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); + int create4(IoCtx& io_ctx, const char *name, uint64_t size, + ImageOptions& opts); + int clone(IoCtx& p_ioctx, const char *p_name, const char 
*p_snapname, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order); + int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order, uint64_t stripe_unit, int stripe_count); + int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, ImageOptions& opts); + int remove(IoCtx& io_ctx, const char *name); + int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx); + int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname); + + int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay); + int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info); + int trash_list(IoCtx &io_ctx, std::vector &entries); + int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold); + int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold, + ProgressContext &pctx); + int trash_remove(IoCtx &io_ctx, const char *image_id, bool force); + int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id, + bool force, ProgressContext &pctx); + int trash_restore(IoCtx &io_ctx, const char *id, const char *name); + + // Migration + int migration_prepare(IoCtx& io_ctx, const char *image_name, + IoCtx& dest_io_ctx, const char *dest_image_name, + ImageOptions& opts); + int migration_prepare_import(const char *source_spec, IoCtx& dest_io_ctx, + const char *dest_image_name, ImageOptions& opts); + int migration_execute(IoCtx& io_ctx, const char *image_name); + int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_abort(IoCtx& io_ctx, const char *image_name); + int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_commit(IoCtx& io_ctx, const char *image_name); + int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name, + 
ProgressContext &prog_ctx); + int migration_status(IoCtx& io_ctx, const char *image_name, + image_migration_status_t *status, size_t status_size); + + // RBD pool mirroring support functions + int mirror_site_name_get(librados::Rados& rados, std::string* site_name); + int mirror_site_name_set(librados::Rados& rados, + const std::string& site_name); + + int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode); + int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode); + + int mirror_uuid_get(IoCtx& io_ctx, std::string* mirror_uuid); + + int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token); + int mirror_peer_bootstrap_import(IoCtx& io_ctx, + mirror_peer_direction_t direction, + const std::string &token); + + int mirror_peer_site_add(IoCtx& io_ctx, std::string *uuid, + mirror_peer_direction_t direction, + const std::string &site_name, + const std::string &client_name); + int mirror_peer_site_set_name(IoCtx& io_ctx, const std::string& uuid, + const std::string &site_name); + int mirror_peer_site_set_client_name(IoCtx& io_ctx, const std::string& uuid, + const std::string &client_name); + int mirror_peer_site_set_direction(IoCtx& io_ctx, const std::string& uuid, + mirror_peer_direction_t direction); + int mirror_peer_site_remove(IoCtx& io_ctx, const std::string& uuid); + int mirror_peer_site_list(IoCtx& io_ctx, + std::vector *peers); + int mirror_peer_site_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map *key_vals); + int mirror_peer_site_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map& key_vals); + + int mirror_image_global_status_list( + IoCtx& io_ctx, const std::string &start_id, size_t max, + std::map *images); + int mirror_image_status_summary(IoCtx& io_ctx, + std::map *states); + int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id, + size_t max, std::map *sevice_ids); + int mirror_image_info_list(IoCtx& io_ctx, mirror_image_mode_t *mode_filter, + const 
std::string &start_id, size_t max, + std::map> *entries); + + /// mirror_peer_ commands are deprecated to mirror_peer_site_ equivalents + int mirror_peer_add(IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name) + CEPH_RBD_DEPRECATED; + int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid) + CEPH_RBD_DEPRECATED; + int mirror_peer_list(IoCtx& io_ctx, std::vector *peers) + CEPH_RBD_DEPRECATED; + int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name) + CEPH_RBD_DEPRECATED; + int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid, + const std::string &cluster_name) + CEPH_RBD_DEPRECATED; + int mirror_peer_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map *key_vals) + CEPH_RBD_DEPRECATED; + int mirror_peer_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map& key_vals) + CEPH_RBD_DEPRECATED; + + /// mirror_image_status_list command is deprecated to + /// mirror_image_global_status_list + + int mirror_image_status_list( + IoCtx& io_ctx, const std::string &start_id, size_t max, + std::map *images) + CEPH_RBD_DEPRECATED; + + // RBD groups support functions + int group_create(IoCtx& io_ctx, const char *group_name); + int group_remove(IoCtx& io_ctx, const char *group_name); + int group_list(IoCtx& io_ctx, std::vector *names); + int group_rename(IoCtx& io_ctx, const char *src_group_name, + const char *dest_group_name); + + int group_image_add(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_name); + int group_image_remove(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_name); + int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_id); + int group_image_list(IoCtx& io_ctx, const char *group_name, + std::vector *images, + size_t group_image_info_size); + + int group_snap_create(IoCtx& io_ctx, const char 
*group_name, + const char *snap_name); + int group_snap_create2(IoCtx& io_ctx, const char *group_name, + const char *snap_name, uint32_t flags); + int group_snap_remove(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_rename(IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, const char *new_snap_name); + int group_snap_list(IoCtx& group_ioctx, const char *group_name, + std::vector *snaps, + size_t group_snap_info_size); + int group_snap_rollback(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name, + const char *snap_name, + ProgressContext& pctx); + + int namespace_create(IoCtx& ioctx, const char *namespace_name); + int namespace_remove(IoCtx& ioctx, const char *namespace_name); + int namespace_list(IoCtx& io_ctx, std::vector* namespace_names); + int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists); + + int pool_init(IoCtx& io_ctx, bool force); + int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats); + + int pool_metadata_get(IoCtx &io_ctx, const std::string &key, + std::string *value); + int pool_metadata_set(IoCtx &io_ctx, const std::string &key, + const std::string &value); + int pool_metadata_remove(IoCtx &io_ctx, const std::string &key); + int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max, + std::map *pairs); + + int config_list(IoCtx& io_ctx, std::vector *options); + +private: + /* We don't allow assignment or copying */ + RBD(const RBD& rhs); + const RBD& operator=(const RBD& rhs); +}; + +class CEPH_RBD_API ImageOptions { +public: + ImageOptions(); + ImageOptions(rbd_image_options_t opts); + ImageOptions(const ImageOptions &imgopts); + ~ImageOptions(); + + int set(int optname, const std::string& optval); + int set(int optname, uint64_t optval); + int get(int optname, std::string* optval) const; + int get(int optname, uint64_t* optval) const; + int is_set(int 
optname, bool* is_set); + int unset(int optname); + void clear(); + bool empty() const; + +private: + friend class RBD; + friend class Image; + + rbd_image_options_t opts; +}; + +class CEPH_RBD_API PoolStats { +public: + PoolStats(); + ~PoolStats(); + + PoolStats(const PoolStats&) = delete; + PoolStats& operator=(const PoolStats&) = delete; + + int add(rbd_pool_stat_option_t option, uint64_t* opt_val); + +private: + friend class RBD; + + rbd_pool_stats_t pool_stats; +}; + +class CEPH_RBD_API UpdateWatchCtx { +public: + virtual ~UpdateWatchCtx() {} + /** + * Callback activated when we receive a notify event. + */ + virtual void handle_notify() = 0; +}; + +class CEPH_RBD_API QuiesceWatchCtx { +public: + virtual ~QuiesceWatchCtx() {} + /** + * Callback activated when we want to quiesce. + */ + virtual void handle_quiesce() = 0; + + /** + * Callback activated when we want to unquiesce. + */ + virtual void handle_unquiesce() = 0; +}; + +class CEPH_RBD_API Image +{ +public: + Image(); + ~Image(); + + int close(); + int aio_close(RBD::AioCompletion *c); + + int resize(uint64_t size); + int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx); + int resize_with_progress(uint64_t size, ProgressContext& pctx); + int stat(image_info_t &info, size_t infosize); + int get_name(std::string *name); + int get_id(std::string *id); + std::string get_block_name_prefix(); + int64_t get_data_pool_id(); + int parent_info(std::string *parent_poolname, std::string *parent_name, + std::string *parent_snapname) + CEPH_RBD_DEPRECATED; + int parent_info2(std::string *parent_poolname, std::string *parent_name, + std::string *parent_id, std::string *parent_snapname) + CEPH_RBD_DEPRECATED; + int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap); + + int get_migration_source_spec(std::string* source_spec); + + int old_format(uint8_t *old); + int size(uint64_t *size); + int get_group(group_info_t *group_info, size_t group_info_size); + int features(uint64_t 
*features); + int update_features(uint64_t features, bool enabled); + int get_op_features(uint64_t *op_features); + int overlap(uint64_t *overlap); + int get_flags(uint64_t *flags); + int set_image_notification(int fd, int type); + + /* exclusive lock feature */ + int is_exclusive_lock_owner(bool *is_owner); + int lock_acquire(rbd_lock_mode_t lock_mode); + int lock_release(); + int lock_get_owners(rbd_lock_mode_t *lock_mode, + std::list *lock_owners); + int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner); + + /* object map feature */ + int rebuild_object_map(ProgressContext &prog_ctx); + + int check_object_map(ProgressContext &prog_ctx); + + int copy(IoCtx& dest_io_ctx, const char *destname); + int copy2(Image& dest); + int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts); + int copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts, + size_t sparse_size); + int copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ProgressContext &prog_ctx); + int copy_with_progress2(Image& dest, ProgressContext &prog_ctx); + int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx); + int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, + size_t sparse_size); + + /* deep copy */ + int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts); + int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx); + + /* encryption */ + int encryption_format(encryption_format_t format, encryption_options_t opts, + size_t opts_size); + int encryption_load(encryption_format_t format, encryption_options_t opts, + size_t opts_size); + int encryption_load2(const encryption_spec_t *specs, size_t spec_count); + + /* striping */ + uint64_t get_stripe_unit() const; + uint64_t get_stripe_count() const; + + int get_create_timestamp(struct timespec 
*timestamp); + int get_access_timestamp(struct timespec *timestamp); + int get_modify_timestamp(struct timespec *timestamp); + + int flatten(); + int flatten_with_progress(ProgressContext &prog_ctx); + + int sparsify(size_t sparse_size); + int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx); + /** + * Returns a pair of poolname, imagename for each clone + * of this image at the currently set snapshot. + */ + int list_children(std::set > *children) + CEPH_RBD_DEPRECATED; + /** + * Returns a structure of poolname, imagename, imageid and trash flag + * for each clone of this image at the currently set snapshot. + */ + int list_children2(std::vector *children) + CEPH_RBD_DEPRECATED; + int list_children3(std::vector *images); + int list_descendants(std::vector *images); + + /* advisory locking (see librbd.h for details) */ + int list_lockers(std::list *lockers, + bool *exclusive, std::string *tag); + int lock_exclusive(const std::string& cookie); + int lock_shared(const std::string& cookie, const std::string& tag); + int unlock(const std::string& cookie); + int break_lock(const std::string& client, const std::string& cookie); + + /* snapshots */ + int snap_list(std::vector& snaps); + /* DEPRECATED; use snap_exists2 */ + bool snap_exists(const char *snapname) CEPH_RBD_DEPRECATED; + int snap_exists2(const char *snapname, bool *exists); + int snap_create(const char *snapname); + int snap_create2(const char *snapname, uint32_t flags, ProgressContext& pctx); + int snap_remove(const char *snapname); + int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx); + int snap_remove_by_id(uint64_t snap_id); + int snap_rollback(const char *snap_name); + int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx); + int snap_protect(const char *snap_name); + int snap_unprotect(const char *snap_name); + int snap_is_protected(const char *snap_name, bool *is_protected); + int snap_set(const char *snap_name); + int 
snap_set_by_id(uint64_t snap_id); + int snap_get_name(uint64_t snap_id, std::string *snap_name); + int snap_get_id(const std::string snap_name, uint64_t *snap_id); + int snap_rename(const char *srcname, const char *dstname); + int snap_get_limit(uint64_t *limit); + int snap_set_limit(uint64_t limit); + int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp); + int snap_get_namespace_type(uint64_t snap_id, + snap_namespace_type_t *namespace_type); + int snap_get_group_namespace(uint64_t snap_id, + snap_group_namespace_t *group_namespace, + size_t snap_group_namespace_size); + int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name); + int snap_get_mirror_namespace( + uint64_t snap_id, snap_mirror_namespace_t *mirror_namespace, + size_t snap_mirror_namespace_size); + + /* I/O */ + ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + int64_t read_iterate(uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), void *arg); + int read_iterate2(uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), void *arg); + /** + * get difference between two versions of an image + * + * This will return the differences between two versions of an image + * via a callback, which gets the offset and length and a flag + * indicating whether the extent exists (1), or is known/defined to + * be zeros (a hole, 0). If the source snapshot name is NULL, we + * interpret that as the beginning of time and return all allocated + * regions of the image. The end version is whatever is currently + * selected for the image handle (either a snapshot or the writeable + * head). 
+ * + * @param fromsnapname start snapshot name, or NULL + * @param ofs start offset + * @param len len in bytes of region to report on + * @param include_parent true if full history diff should include parent + * @param whole_object 1 if diff extents should cover whole object + * @param cb callback to call for each allocated region + * @param arg argument to pass to the callback + * @returns 0 on success, or negative error code on error + */ + int diff_iterate(const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), void *arg); + int diff_iterate2(const char *fromsnapname, + uint64_t ofs, uint64_t len, + bool include_parent, bool whole_object, + int (*cb)(uint64_t, size_t, int, void *), void *arg); + + ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + + int discard(uint64_t ofs, uint64_t len); + ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags); + ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags); + + /** + * compare and write from/to image + * + * Compare data in compare bufferlist to data at offset in image. + * len bytes of the compare bufferlist are compared, i.e. the compare + * bufferlist has to be at least len bytes long. + * If the compare is successful len bytes from the write bufferlist + * are written to the image, i.e. the write bufferlist also has to be + * at least len bytes long. + * If the compare is unsuccessful no data is written and the + * offset in the bufferlist where the compare first differed + * is returned through mismatch_off. 
+ * + * @param off offset in image + * @param len length of compare, length of write + * @param cmp_bl bufferlist to compare from + * @param bl bufferlist to write to image if compare succeeds + * @param c aio completion to notify when compare and write is complete + * @param mismatch_off (out) offset in bufferlist where compare first differed + * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ + ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl, + ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags); + + int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + + int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c); + int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c, + int zero_flags, int op_flags); + + int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl, + ceph::bufferlist& bl, RBD::AioCompletion *c, + uint64_t *mismatch_off, int op_flags); + + /** + * read async from image + * + * The target bufferlist is populated with references to buffers + * that contain the data for the given extent of the image. + * + * NOTE: If caching is enabled, the bufferlist will directly + * reference buffers in the cache to avoid an unnecessary data copy. + * As a result, if the user intends to modify the buffer contents + * directly, they should make a copy first (unconditionally, or when + * the reference count on the underlying buffer is more than 1). 
+ * + * @param off offset in image + * @param len length of read + * @param bl bufferlist to read into + * @param c aio completion to notify when read is complete + */ + int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + + int flush(); + /** + * Start a flush if caching is enabled. Get a callback when + * the currently pending writes are on disk. + * + * @param image the image to flush writes to + * @param c what to call when flushing is complete + * @returns 0 on success, negative error code on failure + */ + int aio_flush(RBD::AioCompletion *c); + + /** + * Drop any cached data for this image + * + * @returns 0 on success, negative error code on failure + */ + int invalidate_cache(); + + int poll_io_events(RBD::AioCompletion **comps, int numcomp); + + int metadata_get(const std::string &key, std::string *value); + int metadata_set(const std::string &key, const std::string &value); + int metadata_remove(const std::string &key); + /** + * Returns a pair of key/value for this image + */ + int metadata_list(const std::string &start, uint64_t max, std::map *pairs); + + // RBD image mirroring support functions + int mirror_image_enable() CEPH_RBD_DEPRECATED; + int mirror_image_enable2(mirror_image_mode_t mode); + int mirror_image_disable(bool force); + int mirror_image_promote(bool force); + int mirror_image_demote(); + int mirror_image_resync(); + int mirror_image_create_snapshot(uint64_t *snap_id); + int mirror_image_create_snapshot2(uint32_t flags, uint64_t *snap_id); + int mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size); + int mirror_image_get_mode(mirror_image_mode_t *mode); + int mirror_image_get_global_status( + mirror_image_global_status_t *mirror_image_global_status, + size_t status_size); + int 
mirror_image_get_status( + mirror_image_status_t *mirror_image_status, size_t status_size) + CEPH_RBD_DEPRECATED; + int mirror_image_get_instance_id(std::string *instance_id); + int aio_mirror_image_promote(bool force, RBD::AioCompletion *c); + int aio_mirror_image_demote(RBD::AioCompletion *c); + int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size, RBD::AioCompletion *c); + int aio_mirror_image_get_mode(mirror_image_mode_t *mode, + RBD::AioCompletion *c); + int aio_mirror_image_get_global_status( + mirror_image_global_status_t *mirror_image_global_status, + size_t status_size, RBD::AioCompletion *c); + int aio_mirror_image_get_status( + mirror_image_status_t *mirror_image_status, size_t status_size, + RBD::AioCompletion *c) + CEPH_RBD_DEPRECATED; + int aio_mirror_image_create_snapshot(uint32_t flags, uint64_t *snap_id, + RBD::AioCompletion *c); + + int update_watch(UpdateWatchCtx *ctx, uint64_t *handle); + int update_unwatch(uint64_t handle); + + int list_watchers(std::list &watchers); + + int config_list(std::vector *options); + + int quiesce_watch(QuiesceWatchCtx *ctx, uint64_t *handle); + int quiesce_unwatch(uint64_t handle); + void quiesce_complete(uint64_t handle, int r); + +private: + friend class RBD; + + Image(const Image& rhs); + const Image& operator=(const Image& rhs); + + image_ctx_t ctx; +}; + +} // namespace librbd + +#if __GNUC__ >= 4 + #pragma GCC diagnostic pop +#endif + +#endif // __LIBRBD_HPP