diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b0b1a8..45dc8fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,27 +15,75 @@ project(Fenix C)
set(FENIX_VERSION_MAJOR 1)
set(FENIX_VERSION_MINOR 0)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
option(BUILD_EXAMPLES "Builds example programs from the examples directory" OFF)
option(BUILD_TESTING "Builds tests and test modes of files" OFF)
+option(BUILD_DOCS "Builds documentation if doxygen is found" ON)
+option(DOCS_ONLY "Only build documentation" OFF)
#Solves an issue with some system environments putting their MPI headers before
#the headers CMake includes.
option(CRAYPE_INC_FIX "Adds detected MPI headers directly to this project" ON)
-find_package(MPI REQUIRED)
-add_subdirectory(src)
+if(NOT DOCS_ONLY)
+ find_package(MPI REQUIRED)
+ add_subdirectory(src)
+
+ include(CTest)
+ list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi")
-include(CTest)
-list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi")
+ if(BUILD_EXAMPLES)
+ add_subdirectory(examples)
+ endif()
+
+ if(BUILD_TESTING)
+ add_subdirectory(test)
+ endif()
-if(BUILD_EXAMPLES)
- add_subdirectory(examples)
endif()
-if(BUILD_TESTING)
- add_subdirectory(test)
+if(BUILD_DOCS)
+ find_package(Doxygen)
+ if(DOXYGEN_FOUND)
+ list(APPEND DOXYGEN_EXAMPLE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/examples)
+ list(APPEND DOXYGEN_EXAMPLE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/doc/examples)
+ list(APPEND DOXYGEN_IMAGE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/doc/img)
+ set(DOXYGEN_USE_MDFILE_AS_MAINPAGE doc/Introduction.md)
+ set(DOXYGEN_TOC_INCLUDE_HEADINGS 0)
+ set(DOXYGEN_DISABLE_INDEX YES)
+ set(DOXYGEN_GENERATE_TREEVIEW YES)
+ set(DOXYGEN_FULL_SIDEBAR NO)
+ set(DOXYGEN_HTML_EXTRA_STYLESHEET ${CMAKE_CURRENT_SOURCE_DIR}/doc/DoxygenStyle.css)
+ set(DOXYGEN_LAYOUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/doc/DoxygenLayout.xml)
+ set(DOXYGEN_OUTPUT_DIRECTORY doc)
+
+ set(DOXYGEN_GENERATE_MAN YES)
+ set(DOXYGEN_GENERATE_HTML YES)
+ set(DOXYGEN_GENERATE_LATEX YES)
+
+ set(DOXYGEN_QUIET YES)
+ set(DOXYGEN_WARN_IF_UNDOCUMENTED NO)
+ set(DOXYGEN_WARN_IF_DOC_ERROR YES)
+ set(DOXYGEN_WARN_NO_PARAMDOC YES)
+ set(DOXYGEN_SHOW_INCLUDE_FILES NO)
+ set(DOXYGEN_WARN_IF_UNDOC_ENUM_VAL NO)
+ list(APPEND DOXYGEN_ALIASES "returnstatus=@return FENIX_SUCCESS if successful, any [return code](@ref ReturnCodes) otherwise.")
+ list(APPEND DOXYGEN_ALIASES "unimplemented=@qualifier UNIMPLEMENTED @brief @htmlonly @endhtmlonly UNIMPLEMENTED @htmlonly @endhtmlonly")
+
+ doxygen_add_docs(doc
+ doc/Introduction.md doc/fake_init.h include src
+ ALL
+ COMMENT "Generate Fenix documentation")
+ message(STATUS "Run `make doc` to build documentation")
+
+ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/doc DESTINATION .) # relative to the install prefix, so `cmake --install --prefix` and CPack can relocate it
+
+ else()
+ message(STATUS "Doxygen not found, `make doc` disabled")
+ endif()
endif()
diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml
new file mode 100644
index 0000000..535b044
--- /dev/null
+++ b/doc/DoxygenLayout.xml
@@ -0,0 +1,269 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/doc/DoxygenStyle.css b/doc/DoxygenStyle.css
new file mode 100644
index 0000000..770b1c6
--- /dev/null
+++ b/doc/DoxygenStyle.css
@@ -0,0 +1,41 @@
+/*Move qualifiers (e.g. collective, unimplemented) to being above function name instead of bottom right*/
+/* It's too easy to miss as-is, especially the unimplemented tag.*/
+table.mlabels {
+ direction: rtl;
+ writing-mode: vertical-rl;
+}
+/*Undo the weird writing-mode changes at each mlabels table member*/
+table.mlabels td.mlabels-right {
+ writing-mode: horizontal-tb;
+ text-align: left;
+ width: auto;
+}
+table.mlabels td.mlabels-left {
+ writing-mode: horizontal-tb;
+ text-align: left;
+ width: auto;
+}
+/*Undo the table direction change in the subtable of function parameters*/
+table.mlabels table.memname {
+ float: left;
+ direction: ltr;
+}
+
+/*Make the qualifier labels slightly larger, and bold.*/
+table.mlabels td.mlabels-right span.mlabel {
+ font-weight: bold;
+ font-size: 12px;
+}
+
+
+/*
+ * Hide the "UNIMPLEMENTED" tag within the function's detailed description
+ * It's visible already.
+*/
+div.memdoc span.mlabel {
+ display: none;
+}
+
+table.params {
+ word-break: break-all; /* let long parameter names break mid-word; break-all is a word-break value, not a word-wrap one */
+}
diff --git a/doc/Introduction.md b/doc/Introduction.md
new file mode 100644
index 0000000..1e8d68b
--- /dev/null
+++ b/doc/Introduction.md
@@ -0,0 +1,51 @@
+Fenix is a software library compatible with the Message Passing
+Interface (MPI) to support fault recovery without application
+shutdown. Fenix has two components: process recovery and data
+recovery. Process recovery is used to repair communicators whose
+ranks suffered failure detected by the MPI runtime. Data recovery
+is an optional feature that can be used to implement a
+high-performance in-memory checkpoint/restart mechanism.
+
+Below is a brief overview of these two components, but see the
+[Process Recovery](@ref ProcessRecovery) and [Data Recovery](@ref DataRecovery)
+topics for more details.
+
+## Process Recovery
+
+The core feature of process recovery is creation of a resilient
+communicator that will automatically repair itself. This recovery
+is achieved by setting aside some number of ranks as *spare ranks*.
+When a failure is detected, the spare ranks are used to replace
+the failed ranks.
+
+The exact process of recovery is subject to some nuances of the OpenMPI
+ULFM specification, which Fenix is implemented on top of. For example,
+messages may have locally succeeded while failing on other participating
+ranks.
+
+![An example process flow diagram for recovery using Fenix](fenix_process_flow.png){html: width=300px}
+
+The default recovery pattern is to perform a `longjmp` to the location of
+#Fenix_Init following communicator repairs. This emulates the typical offline
+checkpoint/restart pattern, but without the need to restart the application.
+However, `longjmp` has some nebulous behavior in many applications. Fenix also
+supports a non-jumping recovery pattern. This is more predictable across compilers
+and optimizations, but requires checking the return value of every MPI call to
+detect failed operations (though communicator repair is still automatic). A
+good practice for C++ applications is to use the non-jumping pattern, but add
+a Fenix error-handler callback to throw an exception on failure.
+
+## Data Recovery
+
+Fenix provides its own redundant data storage API to facilitate
+data recovery along with process recovery, but the user can choose
+other data recovery options to meet a variety of application needs.
+For example, data could be recovered by approximately interpolating
+values from unaffected, topologically neighboring ranks instead of
+by reading stored redundant data. In addition, the user may decide
+to use external libraries such as
+[VeloC](https://veloc.readthedocs.io/en/latest/).
+
+> Any Fenix function without a return type, e.g. #Fenix_Init, may be
+> implemented via macros, in which case it cannot be used to resolve
+> function pointers.
diff --git a/doc/examples/DataRecovery.md b/doc/examples/DataRecovery.md
new file mode 100644
index 0000000..e2f8ddf
--- /dev/null
+++ b/doc/examples/DataRecovery.md
@@ -0,0 +1,114 @@
+Fenix provides options for redundant storage of application data
+to facilitate application data recovery in a transparent manner.
+Fenix contains functions to control consistency of collections of
+such data, as well as their level of persistence. Functions with
+the prefix \c Fenix\_Data\_ perform store, versioning, restore,
+and other relevant operations and form the Fenix data recovery API.
+The user can select a specific set of application data, identified
+by its location in memory, label it using [Fenix_Data_member_create](@ref Fenix_Data_member_create),
+and copy it into Fenix's redundant storage space through
+[Fenix_Data_member(i)store(v)](@ref Fenix_Data_member_store) at a
+point in time. Subsequently, #Fenix_Data_commit finalizes all
+preceding Fenix store operations involving this data group and
+assigns a unique time stamp to the resulting data *snapshot*,
+marking the data as potentially recoverable after a loss of ranks.
+Individual pieces of data can then be restored whenever they are
+needed with #Fenix_Data_member_restore, for example after a failure
+occurs. We note that Fenix's data storage and recovery facility
+aims primarily to support in-memory recovery.
+
+Populating redundant data storage using Fenix may involve the
+dispersion of data created by one rank to other ranks within the
+system, making the store operation semantically a collective
+operation. However, Fenix does not require store operations to be
+globally synchronizing. For example, execution of
+ #Fenix_Data_member_store for a particular collection of data
+could potentially be finished in some ranks, but not yet in others.
+And if certain ranks nominally participating in the storage
+operations have no actual data movement responsibility, Fenix is
+allowed to let them exit the operation immediately. Consequently,
+Fenix data storage functions should not be used for synchronization
+purposes.
+
+Multiple distinct pieces (members) of data assigned to Fenix-managed
+redundant storage, can be associated with a specific instance of
+a Fenix *data group* to form a semantic unit. Committing such a
+group ensures that the data involved is available for recovery.
+
+## Data Groups
+
+-----
+A Fenix *data group* provides dual functionality. First, it serves
+as a container for a set of data objects (*members*) that are
+committed together, and hence provides transaction semantics.
+Second, it recognizes that #Fenix_Data_member_store is an operation
+carried out collectively by a group of ranks, but not necessarily
+by all active ranks in the MPI environment. Hence, it adopts the
+convenient MPI vehicle of \c communicators to indicate the subset
+of ranks involved. Data groups are composed of members that
+describe the actual application data and the redundancy policy
+to be used for securely storing the members.
+
+Data groups can and should be recreated after each failure (i.e. do not
+conditionally skip the creation after initialization).
+
+See #Fenix_Data_group_create
+for creating a data group.
+
+## Data Redundancy Policies
+
+-----
+Fenix internally uses an extensible system for defining data
+policies to keep the door open to easily adding new data policies
+and configuring them on a per-data-group basis. We currently
+support a single, configurable, memory-based policy.
+
+### In Memory Redundancy Policy (IMR)
+
+IMR is referenced with the FENIX_DATA_POLICY_IN_MEMORY_RAID definition,
+and takes as input an array of integers with the following usage:
+
+* Mode: (1 or 5) Chooses storage mimicking the given RAID style.
+* Separation: Sets the rank separation for groups used to store redundant data.
+ Users should choose a separation that attempts to ensure the ranks
+ chosen for grouping are not colocated on nodes/racks to minimize the
+ chance of multiple ranks in a group failing at the same time.
+* GroupSize: For Mode 5 only, sets the size of the parity groups, minimum 3.
+
+The policy is designed to localize recovery as much as possible. Communication
+amongst group members is required (as failure during recovery operations
+can lead to inconsistent beliefs about which ranks have recovered data),
+but groups without recovering ranks may then all recover locally rather
+than communicating further. Groups need not wait for ranks outside of
+their group to enter or exit recovery.
+
+* **Mode 1**: Groups ranks into dyadically paired partners of Rank N and
+ Rank (N+Separation). For odd-size communicators, a single
+ group of size 3 will also be formed from the first, middle, and last
+ ranks. Each rank stores a copy of its own data and a copy of
+ its partner's. For groups of three, partner data storage is
+ chained. Should both partners fail (or any two for groups of
+ three) before recovery operations have completed, data will be
+ unrecoverable.
+
+ **Memory Usage**: Each rank stores a copy of its own data and of its
+ partner's data for each timestamp, where checkpoint depth D
+ stores D+1 checkpoints. Therefore for data size M,
+ (D+1)*M*2 bytes are used.
+
+ **Computation**: None.
+
+* **Mode 5**: Groups ranks into parity groups of size GroupSize.
+ Groups are formed of Rank N, N+Separation, N+2*Separation.
+ If any two ranks in a group fail before recovery operations
+ have completed, data will be unrecoverable.
+
+ **Memory Usage**: Each rank stores a copy of its own data and
+ M/(GroupSize-1) parity bytes per timestamp. Therefore,
+ (D+1)*M*(GroupSize/(GroupSize-1)) bytes are used.
+
+ **Computation**: O(M) parity bit calculations.
+
+These options enable users to trade reliability and computation for memory
+space, which may be necessary for applications with large memory usage.
+
diff --git a/doc/examples/IMR.md b/doc/examples/IMR.md
new file mode 100644
index 0000000..76f852a
--- /dev/null
+++ b/doc/examples/IMR.md
@@ -0,0 +1,6 @@
+# In Memory Redundancy (IMR) {#md_IMR}
+
+Fenix supports one data storage policy, IMR,
+which stores data through either a RAID-1-like
+buddy rank mechanism or a RAID-5-like parity
+mechanism.
diff --git a/doc/examples/ProcessRecovery.md b/doc/examples/ProcessRecovery.md
new file mode 100644
index 0000000..e2d8e1f
--- /dev/null
+++ b/doc/examples/ProcessRecovery.md
@@ -0,0 +1,19 @@
+Functions and types for process recovery.
+
+* Only communicators derived from the communicator returned by
+Fenix_Init are eligible for reconstruction.
+After communicators have been repaired, they contain the same
+number of ranks as before the failure occurred, unless the user
+did not allocate sufficient redundant resources (*spare ranks*)
+and instructed Fenix not to create new ranks. In this case
+communicators will still be repaired, but will contain fewer
+ranks than before the failure occurred.
+
+* To ease adoption of MPI fault tolerance, Fenix automatically
+captures any errors resulting from MPI library calls that are a
+result of a damaged communicator (other errors reported by the
+MPI runtime are ignored by Fenix and are returned to the
+application, for handling by the application writer). In other
+words, programmers do not need to replace calls to the MPI library
+with calls to Fenix (for example, *Fenix_Send* instead of
+*MPI_Send*).
diff --git a/doc/fake_init.h b/doc/fake_init.h
new file mode 100644
index 0000000..a9afa16
--- /dev/null
+++ b/doc/fake_init.h
@@ -0,0 +1,4 @@
+//!@weakgroup ProcessRecovery
+//!@{
+void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error);
+//!@}
diff --git a/doc/img/fenix_process_flow.png b/doc/img/fenix_process_flow.png
new file mode 100644
index 0000000..e94029a
Binary files /dev/null and b/doc/img/fenix_process_flow.png differ
diff --git a/include/fenix.h b/include/fenix.h
index 1a283bf..b875ed7 100644
--- a/include/fenix.h
+++ b/include/fenix.h
@@ -66,10 +66,21 @@ extern "C" {
#include "fenix_data_subset.h"
#include "fenix_process_recovery.h"
+/**
+ * @file
+ * @brief Contains all API function calls and Fenix types.
+ * This is the only header file a user should include.
+ */
+
+/**
+ * @defgroup ReturnCodes Return Codes
+ * @brief All possible return codes from Fenix functions.
+ * @{
+ */
#define FENIX_SUCCESS 0
#define FENIX_ERROR_UNINITIALIZED -9
#define FENIX_ERROR_NOCATEGORY -10
-#define FENIX_ERROR_CALLBACK_NOT_REGISTERD -11
+#define FENIX_ERROR_CALLBACK_NOT_REGISTERED -11
#define FENIX_ERROR_GROUP_CREATE -12
#define FENIX_ERROR_MEMBER_CREATE -13
#define FENIX_ERROR_COMMIT_BARRIER -133
@@ -91,39 +102,110 @@ extern "C" {
#define FENIX_ERROR_CANCELLED -50
#define FENIX_WARNING_SPARE_RANKS_DEPLETED 100
#define FENIX_WARNING_PARTIAL_RESTORE 101
+/**@}*/
-#define FENIX_DATA_GROUP_WORLD_ID 10
-#define FENIX_GROUP_ID_MAX 11
-#define FENIX_TIME_STAMP_MAX 12
-#define FENIX_DATA_MEMBER_ALL 15
-#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11
-#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12
-#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13
-#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14
-#define FENIX_DATA_SNAPSHOT_LATEST -1
-#define FENIX_DATA_SNAPSHOT_ALL 16
-#define FENIX_DATA_SUBSET_CREATED 2
-
+//!@internal @brief Agreement code for error handler
#define FENIX_ERRHANDLER_LOC 1
+//!@internal @brief Agreement code for data commit barrier
#define FENIX_DATA_COMMIT_BARRIER_LOC 2
-#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
+/**
+ * @defgroup ProcessRecovery Process Recovery
+ * @details @include{doc} ProcessRecovery.md
+ * @{
+ */
+
+/**
+ * @brief All possible roles returned by Fenix_Init
+ *
+ * Describes the current process's state in reference
+ * to process recovery.
+ *
+ * It is important to note that FENIX_ROLE_RECOVERED_RANK
+ * is only guaranteed to be the value after a single failure,
+ * so users ought not use the role to directly ensure a valid
+ * state if they desire to be resilient to failures during their
+ * failure recovery process.
+ */
typedef enum {
+ //!No failures have occurred yet
FENIX_ROLE_INITIAL_RANK = 0,
+ //!This rank was a spare before the most recent failure, or was just spawned
FENIX_ROLE_RECOVERED_RANK = 1,
+ //!This rank was not a spare before the most recent failure
FENIX_ROLE_SURVIVOR_RANK = 2
} Fenix_Rank_role;
-typedef struct {
- MPI_Request mpi_send_req;
- MPI_Request mpi_recv_req;
-} Fenix_Request;
-
-extern const Fenix_Data_subset FENIX_DATA_SUBSET_FULL;
-extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY;
-
+/**
+ * @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error);
+ * @brief Build a resilient communicator and set the restart point.
+ *
+ * This function must be called by all ranks in \c comm, after MPI initialization. All calling ranks must
+ * pass the same values for the parameters \c comm, \c spare_ranks, \c spawn, and \c info. \c Fenix_Init
+ * must be called exactly once by each rank. This function is used (1) to activate the Fenix library, (2)
+ * to specify extra resources in case of rank failure, and (3) to create a logical resumption point in case
+ * of rank failure.
+ *
+ * For C, the program may rely on the state of any variables defined and set before the call to \c Fenix_Init.
+ * But note that the code executed before \c Fenix_Init is executed by all ranks in the system (including spare
+ * ranks, see below). For C++, the state of objects declared before \c Fenix_Init but within the same scope as
+ * \c Fenix_Init is compiler-dependent, and it is recommended to place \c Fenix_Init within a subscope excluding
+ * any variables expected to not be destructed.
+ *
+ * It is recommended to access argc and argv only after executing \c Fenix_Init, since command line arguments
+ * passed to this function that apply to Fenix may be removed by \c Fenix_Init.
+ *
+ * \c Fenix_Init is blocking in the following sense. If it is entered for the first time via a regular, explicit
+ * function call, it must be entered by all ranks in communicator \c comm. If it is entered after an error
+ * intercepted by Fenix (it is the default execution resumption point, see _info below), no ranks are allowed
+ * to exit from it until all *non-failed* ranks have returned control to it. **Note**: Typically control is
+ * returned automatically through revocation of the resilient communicator, which means ranks which have long
+ * delays between MPI function calls or ranks which only use communicators unaffected by failure may lead to
+ * long delays between a failure and its recovery.
+ *
+ * Ranks to be used as spare ranks by Fenix will be available to the application only before \c Fenix_Init,
+ * or after they are used to replace a failed rank, in which case they turn into active ranks. This document
+ * refers to the latter as \c RECOVERED ranks (see #Fenix_Rank_role). Note that all spare
+ * ranks that have not been used to recover from failures (and, therefore, are still reserved by Fenix and kept
+ * inside \c Fenix_Init) will automatically call \c MPI_Finalize and exit when all active ranks have entered the
+ * #Fenix_Finalize call.
+ *
+ * No Fenix functions may be called before \c Fenix_Init, except #Fenix_Initialized.
+ *
+ * @param[out] role The current role of this rank (see #Fenix_Rank_role)
+ * @param[in] comm The base communicator to construct a resilient communicator from,
+ * which must include any spare ranks (see below) the user deems necessary.
+ * MPI_COMM_WORLD is a valid value, but MPI_COMM_SELF is not.
+ * @param[out] newcomm Resilient output communicator, managed by Fenix and derived
+ * from comm, to be used by the application instead of comm.
+ * @param[inout] argc Pointer to application main's argc parameter
+ * @param[inout] argv Pointer to application main's argv parameter
+ * @param[in] spare_ranks The number of ranks in comm that are exempted by Fenix
+ * in the construction of the resilient communicator by Fenix_Init. These ranks
+ * are kept in reserve to substitute for failed ranks. Failed ranks in resilient
+ * communicators are replaced by spare or spawned ranks.
+ * @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace
+ * failed ranks when spares are unavailable.
+ * @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL
+ * Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon
+ * rank failure for all active (non-spare) ranks in any resilient communicators, not only for
+ * those ranks in communicators that failed. The following values associated with the
+ * "FENIX_RESUME_MODE" key are supported:
+ * - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init.
+ * - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled
+ * as normal, but return the error code as well. Applications should typically
+ * either check for return codes or assign an error callback through Fenix.
+ * @param[out] error The return status of \c Fenix_Init
+ * Used to signal that a non-fatal error or special condition was encountered in the execution of
+ * Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by
+ * Fenix_Init. If spawning is explicitly disabled (_spawn equals false) and spare ranks have been
+ * depleted, Fenix will repair resilience communicators by shrinking them and will report such
+ * shrinkage in this return parameter through the value FENIX_WARNING_SPARE_RANKS_DEPLETED.
+ */
+
+//!@internal
#define Fenix_Init(_role, _comm, _newcomm, _argc, _argv, _spare_ranks, \
_spawn, _info, _error) \
{ \
@@ -137,96 +219,461 @@ extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY;
__fenix_postinit( _error ); \
}
-int Fenix_Initialized(int *);
+/**
+ * @brief Sets flag to true if Fenix_Init has been called, else false.
+ * @param[out] flag Pointer to the flag to be set.
+ * @returnstatus
+ */
+int Fenix_Initialized(int *flag);
+
+/**
+ * @brief Register a callback to be invoked after failure process recovery.
+ *
+ * This function registers a callback to be invoked after a failure has been recovered by Fenix,
+ * and right before resuming application execution (e.g. returning from #Fenix_Init by default).
+ * If this function is called more than once, the different callbacks will be called in the
+ * reverse order that they were registered (i.e. as a callback stack).
+ *
+ * Callback functions are passed the newly-repaired resilient communicator, the error code returned
+ * by MPI in the communication action which caused a failure recovery, and the user-provided \c void*
+ * callback data.
+ *
+ * Callbacks will only be invoked by survivor ranks, since spare ranks or respawned ranks had no way
+ * to register them before a failure.
+ *
+ * @param[in] recover the callback function to register.
+ * @param[in] callback_data The user-provided data which will be passed to the callback.
+ *
+ * @returnstatus
+ */
int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *),
void *callback_data);
+
+//!@unimplemented Returns the number of ranks with a given #Fenix_Rank_role
int Fenix_get_number_of_ranks_with_role(int, int *);
+//!@unimplemented Returns the #Fenix_Rank_role for a given rank
int Fenix_get_role(MPI_Comm comm, int rank, int *role);
+/**
+ * @brief Get the list of ranks that failed in the most recent failure.
+ * @param[out] fail_list Set to a list of failed ranks.
+ * @return The number of failed ranks.
+ */
+int Fenix_Process_fail_list(int** fail_list);
+
+/**
+ * @brief Check a pre-recovery request without error
+ * @param[in] request The request to check
+ * @param[out] status The status of the request
+ * @return True if the request was cancelled or has unknown completion status,
+ * false if it completed successfully.
+ */
+int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);
+
+
+/**
+ * @brief Clean up Fenix state. Each active rank must call \c Fenix_Finalize before exiting.
+ *
+ * This function cleans up all Fenix state, if any. If an MPI program using the Fenix library terminates
+ * normally (i.e. not due to a call to \c MPI_Abort, or an unrecoverable error) then each rank must call
+ * \c Fenix_Finalize before it exits. It must be called before \c MPI_Finalize, and after #Fenix_Init.
+ * There shall be no function calls after this function, except #Fenix_Initialized.
+ *
+ * As noted in the description of #Fenix_Init, all spare ranks that have not been used to
+ * recover from failures (and therefore are still reserved by Fenix and kept inside #Fenix_Init) will call
+ * \c MPI_Finalize and exit when all active ranks have called \c Fenix_Finalize.
+ *
+ * **Advice**: Sometimes users may want to remove ranks proactively from the execution, for example because
+ * monitoring data shows that failure of a rank is imminent or that a rank is executing un-manageably slowly.
+ * This can be accomplished by calling \c exit on the targeted ranks, followed by an invocation of MPI_Barrier.
+ * The removed ranks will be reported as failed and error handling will progress appropriately. No calls to finalize
+ * are needed in this case.
+ */
int Fenix_Finalize();
-int Fenix_Data_group_create(int group_id, MPI_Comm, int start_time_stamp,
+/**@}*/
+
+
+/**
+ * @defgroup DataRecovery Data Recovery
+ * @brief Functions for storing and restoring data in Fenix.
+ * @details @include{doc} DataRecovery.md
+ *
+ * @{
+ */
+#define FENIX_DATA_GROUP_WORLD_ID 10
+#define FENIX_GROUP_ID_MAX 11
+#define FENIX_TIME_STAMP_MAX 12
+#define FENIX_DATA_MEMBER_ALL 15
+#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11
+#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12
+#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13
+#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14
+#define FENIX_DATA_SNAPSHOT_LATEST -1
+#define FENIX_DATA_SNAPSHOT_ALL 16
+#define FENIX_DATA_SUBSET_CREATED 2
+
+#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
+
+/**
+ * @unimplemented As MPI_Request, but for Fenix asynchronous data recovery calls
+ */
+typedef struct {
+ MPI_Request mpi_send_req;
+ MPI_Request mpi_recv_req;
+} Fenix_Request;
+
+//!@brief A standin for checkpointing/recovering all available data in a member.
+extern const Fenix_Data_subset FENIX_DATA_SUBSET_FULL;
+
+//!@brief A standin for checkpointing/recovering none of the available data in a member.
+extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY;
+
+
+/**
+ * @brief Create a Data Group
+ * @qualifier collective
+ *
+ * If a group with this group_id was already created in the past and has not been deleted, the
+ * parameters of this call are ignored and this function simply serves to coordinate with any
+ * ranks that have not yet created this group (e.g. due to a failure).
+ *
+ * All calling ranks must pass the same values for the parameters \c group_id, \c comm,
+ * \c start_time_stamp, \c policy_name, and \c policy_value.
+ *
+ * @param group_id A unique identifier to this group.
+ * @param comm A resilient communicator on which the group is formed.
+ * @param start_time_stamp The time_stamp to be used for the first commit in this group.
+ * @param depth
+ * @parblock
+ * The number of successive snapshots of this group that are retained by Fenix, in
+ * addition to the most recent one, and that can be recovered by calling Fenix data member
+ * restore functions.
+ *
+ * For example, a depth of 0 means Fenix will keep only the necessary data to restore the
+ * most recent snapshot, freeing or overwriting older snapshots automatically. A depth
+ * of -1 is currently not supported, but would ordinarily indicate that no snapshots should
+ * be removed automatically.
+ * @endparblock
+ * @param policy_name Currently, may only be FENIX_DATA_POLICY_IN_MEMORY_RAID
+ * @param policy_value Pointer to data passed along to the policy.
+ * See the specific policy for more information.
+ * @param flag pointer to store policy-specific status or errors
+ * @return FENIX_SUCCESS, or an error value.
+ */
+int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp,
int depth, int policy_name, void* policy_value,
int* flag);
+/**
+ * @brief Create a data member for store/restore operations
+ * @qualifier collective
+ * @qualifier local
+ *
+ * All calling ranks in the group's communicator must pass the same values for the parameters
+ * \c member_id, \c datatype, and \c group_id.
+ *
+ * @param group_id Identifier to a data group within which to create the member.
+ * @param member_id An integer unique within the data group that identifies the data in
+ * \c buffer. Must be nonnegative and less than FENIX_MEMBER_ID_MAX, which is
+ * guaranteed to be at least 2^30.
+ * @param buffer Address of the data to be copied to redundant storage maintained by Fenix.
+ * Note that this parameter may also be specified using #Fenix_Data_member_attr_set, which
+ * is critical for non-survivor ranks after a failure which will have an invalid address
+ * which was generated on the failed rank and must update.
+ * @param count The maximum number of contiguous elements of type \c datatype of the data to be
+ * stored. Need not be the same in all calling ranks.
+ * @param datatype The MPI_Datatype of the elements in \c buffer
+ *
+ * @return FENIX_SUCCESS, or an error value.
+ */
int Fenix_Data_member_create(int group_id, int member_id, void *buffer,
int count, MPI_Datatype datatype);
+/**
+ * @brief Get the storage policy of a data group
+ *
+ * @param group_id Identifier of the data group to query
+ * @param policy_name The identifier of the policy name of the data group.
+ * @param policy_value A location within which to store the policy_values this group's
+ * policy was configured with.
+ * @param flag A location set to true if a policy value was extracted, else false.
+ * @return FENIX_SUCCESS, or an error value.
+ */
int Fenix_Data_group_get_redundancy_policy(int group_id, int* policy_name,
void *policy_value, int *flag);
+//!@unimplemented Block on completion of the store operation specified by the request.
int Fenix_Data_wait(Fenix_Request request);
+
+//!@unimplemented Query completion of the store operation specified by the request.
int Fenix_Data_test(Fenix_Request request, int *flag);
+
+/**
+ * @brief Store a particular group member into the group's resilient storage space, in uncommitted storage.
+ * @qualifier collective
+ *
+ * The user can safely modify the member's data buffer after this call, as the current state is copied immediately.
+ * Multiple calls may be used to incrementally store data (using subset_specifiers), or overwrite old data prior to a commit.
+ *
+ * @param group_id All ranks must provide the same group_id
+ * @param member_id All ranks must provide the same member_id
+ * @param subset_specifier Which subset of the data to store. It is always valid for every rank to provide the same
+ * subset_specifier; depending on the group's policy, varying combinations of specifiers may be possible.
+ * @return FENIX_SUCCESS, or an error value.
+ */
int Fenix_Data_member_store(int group_id, int member_id,
Fenix_Data_subset subset_specifier);
-int Fenix_Data_member_storev(int member_id, int group_id,
+
+//!@unimplemented As [store](#Fenix_Data_member_store), but subsets may vary rank-to-rank.
+int Fenix_Data_member_storev(int group_id, int member_id,
Fenix_Data_subset subset_specifier);
-int Fenix_Data_member_istore(int member_id, int group_id,
+//!@unimplemented As [store](#Fenix_Data_member_store), but asynchronous.
+int Fenix_Data_member_istore(int group_id, int member_id,
Fenix_Data_subset subset_specifier,
Fenix_Request *request);
-int Fenix_Data_member_istorev(int member_id, int group_id,
+//!@unimplemented As [istore](#Fenix_Data_member_istore), but subsets may vary rank-to-rank.
+int Fenix_Data_member_istorev(int group_id, int member_id,
Fenix_Data_subset subset_specifier,
Fenix_Request *request);
+/**
+ * @brief Commit stored data members to the group's next snapshot.
+ * @qualifier collective
+ * @qualifier local
+ *
+ * This function is used to freeze the current state of a data group,
+ * together with all its application data that has been stored in Fenix's
+ * redundant storage, and label it with a time stamp, thus creating a
+ * snapshot of the stored application data. Only data that has been
+ * committed is eligible for recovery through #Fenix_Data_member_restore.
+ * An application needs to call #Fenix_Data_wait for all pending asynchronous
+ * [Fenix_Data_member_istore(v)](@ref Fenix_Data_member_istore) operations
+ * in the group before committing.
+ *
+ * @param[in] group_id The group to commit
+ * @param[out] time_stamp The time stamp of the new snapshot
+ * @returnstatus
+ */
int Fenix_Data_commit(int group_id, int *time_stamp);
+/**
+ * @brief As [commit](#Fenix_Data_commit), but ensures a globally consistent commit.
+ * @qualifier collective
+ *
+ * This call does not behave as a traditional barrier.
+ * The commit will proceed if all *non-failed* ranks reach the barrier.
+ * This allows for commits to be made when a rank fails after storing all
+ * of its data into resilient storage.
+ *
+ * @param[in] group_id The group to commit
+ * @param[out] time_stamp The time stamp of the new snapshot
+ * @returnstatus
+ */
int Fenix_Data_commit_barrier(int group_id, int *time_stamp);
+//!@unimplemented Block until all ranks in the group have reached this point.
int Fenix_Data_barrier(int group_id);
+/**
+ * @brief Restore the data of a group member from a snapshot.
+ * @qualifier collective
+ *
+ * All ranks in the group's resilient communicator must pass the
+ * same values for the parameters group_id, member_id, and time_stamp.
+ * This function is used to retrieve data from consistent snapshot
+ * members. This function can only be used if the size of the
+ * communicator used to store the data is the same as that at the time
+ * of data recovery (this implies non-shrinking communicator recovery
+ * in case of a rank loss).
+ *
+ * If the size of the buffer needing to receive the recovery data is
+ * unknown for a particular rank, it can be queried using
+ * #Fenix_Data_member_attr_get.
+ *
+ * @param[in] group_id The group to restore from
+ * @param[in] member_id The member to restore
+ * @param[out] target_buffer The buffer to store the restored data
+ * @param[in] max_count The maximum number of elements to restore
+ * @param[in] time_stamp The time stamp of the snapshot to restore from
+ * @param[out] found_data The subset of the data that was found in the snapshot
+ * @returnstatus
+ */
int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer,
int max_count, int time_stamp, Fenix_Data_subset* found_data);
+/**
+ * @brief Local-only version of #Fenix_Data_member_restore
+ *
+ * This function restores the data of a group member from the local
+ * snapshot.
+ *
+ * @param[in] group_id The group to restore from
+ * @param[in] member_id The member to restore
+ * @param[out] target_buffer The buffer to store the restored data
+ * @param[in] max_count The maximum number of elements to restore
+ * @param[in] time_stamp The time stamp of the snapshot to restore from
+ * @param[out] found_data The subset of the data that was found in the snapshot
+ * @returnstatus
+ */
int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer,
int max_count, int time_stamp, Fenix_Data_subset* found_data);
+//!@unimplemented As #Fenix_Data_member_restore, but restores from a specific rank's data.
int Fenix_Data_member_restore_from_rank(int member_id, void *data, int max_count,
int time_stamp, int group_id,
int source_rank);
+/**
+ * @brief Create a data subset for use in store operations.
+ *
+ * Creates a subset based on num_blocks pairs of
+ * {start_offset,end_offset},
+ * {start_offset+stride,end_offset+stride},
+ * {start_offset+2*stride,end_offset+2*stride},
+ * etc.
+ *
+ * The value of start_offset must be smaller than or equal
+ * to the value of end_offset to indicate non-negative block
+ * size. Otherwise, the function returns an error code.
+ *
+ * Created subsets must be deleted with #Fenix_Data_subset_delete
+ * to free memory.
+ *
+ * @param[in] num_blocks The number of contiguous data blocks.
+ * @param[in] start_offset The index of the first element in the first data block.
+ * @param[in] end_offset The index of the last element in the first data block.
+ * @param[in] stride Regular shift between successive data blocks.
+ * @param[out] subset_specifier The created subset.
+ * @returnstatus
+ */
int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset,
int stride, Fenix_Data_subset *subset_specifier);
+/**
+ * @brief As #Fenix_Data_subset_create, but with varying start and end offsets.
+ *
+ * Creates a subset based on num_blocks pairs of {start_offset,end_offset}.
+ * The value of start_offset must be smaller than or equal to end_offset
+ * to indicate non-negative block size. Otherwise, the function returns an
+ * error code.
+ *
+ * Created subsets must be deleted with #Fenix_Data_subset_delete
+ * to free memory.
+ *
+ * @param[in] num_blocks The number of contiguous data blocks.
+ * @param[in] array_start_offsets The index of the first element in each data block.
+ * @param[in] array_end_offsets The index of the last element in each data block.
+ * @param[out] subset_specifier The created subset.
+ */
int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets,
int *array_end_offsets,
Fenix_Data_subset *subset_specifier);
+/**
+ * @brief Delete a data subset.
+ *
+ * Frees the memory associated with a data subset object.
+ *
+ * @param[in] subset_specifier The subset to delete.
+ * @returnstatus
+ */
int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier);
+//!@unimplemented Get the number of members in a data group.
int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members);
-int Fenix_Data_group_get_member_at_position(int position, int *member_id,
- int group_id);
-
+//!@unimplemented Get member ID based on member index
+int Fenix_Data_group_get_member_at_position(int group_id, int *member_id,
+ int position);
+
+/**
+ * @brief Get the number of locally-available snapshots in a data group.
+ *
+ * May include snapshots that are inconsistent across the group.
+ *
+ * @param[in] group_id The group to query
+ * @param[out] number_of_snapshots The number of snapshots in the group
+ * @returnstatus
+ */
int Fenix_Data_group_get_number_of_snapshots(int group_id,
int *number_of_snapshots);
+/**
+ * @brief Get the time stamp of a snapshot at a given index.
+ *
+ * Snapshots are indexed in the reverse of the order in which the user committed them
+ * (e.g. the most recent available snapshot has position=0).
+ *
+ * @param[in] group_id The group to query
+ * @param[in] position The index of the snapshot, which must be in the range [0, number_of_snapshots)
+ * @param[out] time_stamp The time stamp of the snapshot
+ * @returnstatus
+ */
int Fenix_Data_group_get_snapshot_at_position(int group_id, int position,
int *time_stamp);
+//!@unimplemented Get the value of a member's attribute.
int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename,
void *attributevalue, int *flag, int source_rank);
+/**
+ * @brief Set the value of a member's attribute.
+ *
+ * Valid names are #FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, #FENIX_DATA_MEMBER_ATTRIBUTE_COUNT,
+ * and #FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE.
+ *
+ * The COUNT and DATATYPE attributes may only be set before the first store operation.
+ * Contrary to the Fenix specification, returning to #Fenix_Init after a failure does not
+ * allow the user to set these attributes again.
+ *
+ * @param[in] group_id The group to update
+ * @param[in] member_id The member to update
+ * @param[in] attribute_name The attribute to update
+ * @param[in] attribute_value The new value of the attribute
+ * @param[out] flag Set to true if the attribute was set, else false
+ * @returnstatus
+ */
int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name,
void *attribute_value, int *flag);
+/**
+ * @brief Delete a snapshot from a data group.
+ * @qualifier local
+ *
+ * @param[in] group_id The group to delete from
+ * @param[in] time_stamp The time stamp of the snapshot to delete
+ * @returnstatus
+ */
int Fenix_Data_snapshot_delete(int group_id, int time_stamp);
+/**
+ * @brief Delete a data group.
+ * @qualifier local
+ *
+ * @param[in] group_id The group to delete
+ * @returnstatus
+ */
int Fenix_Data_group_delete(int group_id);
+/**
+ * @brief Delete a data member.
+ * @qualifier local
+ *
+ * @param[in] group_id The group to delete from
+ * @param[in] member_id The member to delete
+ * @returnstatus
+ */
int Fenix_Data_member_delete(int group_id, int member_id);
-
-int Fenix_Process_fail_list(int** fail_list);
-
-int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);
+/**@}*/
#if defined(c_plusplus) || defined(__cplusplus)
}