diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b0b1a8..45dc8fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,27 +15,75 @@ project(Fenix C) set(FENIX_VERSION_MAJOR 1) set(FENIX_VERSION_MINOR 0) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + option(BUILD_EXAMPLES "Builds example programs from the examples directory" OFF) option(BUILD_TESTING "Builds tests and test modes of files" OFF) +option(BUILD_DOCS "Builds documentation if is doxygen found" ON) +option(DOCS_ONLY "Only build documentation" OFF) #Solves an issue with some system environments putting their MPI headers before #the headers CMake includes. option(CRAYPE_INC_FIX "Adds detected MPI headers directly to this project" ON) -find_package(MPI REQUIRED) -add_subdirectory(src) +if(NOT DOCS_ONLY) + find_package(MPI REQUIRED) + add_subdirectory(src) + + include(CTest) + list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi") -include(CTest) -list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi") + if(BUILD_EXAMPLES) + add_subdirectory(examples) + endif() + + if(BUILD_TESTING) + add_subdirectory(test) + endif() -if(BUILD_EXAMPLES) - add_subdirectory(examples) endif() -if(BUILD_TESTING) - add_subdirectory(test) +if(BUILD_DOCS) + find_package(Doxygen) + if(DOXYGEN_FOUND) + list(APPEND DOXYGEN_EXAMPLE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/examples) + list(APPEND DOXYGEN_EXAMPLE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/doc/examples) + list(APPEND DOXYGEN_IMAGE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/doc/img) + set(DOXYGEN_USE_MDFILE_AS_MAINPAGE doc/Introduction.md) + set(DOXYGEN_TOC_INCLUDE_HEADINGS 0) + set(DOXYGEN_DISABLE_INDEX YES) + set(DOXYGEN_GENERATE_TREEVIEW YES) + set(DOXYGEN_FULL_SIDEBAR NO) + set(DOXYGEN_HTML_EXTRA_STYLESHEET ${CMAKE_CURRENT_SOURCE_DIR}/doc/DoxygenStyle.css) + set(DOXYGEN_LAYOUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/doc/DoxygenLayout.xml) + set(DOXYGEN_OUTPUT_DIRECTORY doc) + + set(DOXYGEN_GENERATE_MAN YES) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_GENERATE_LATEX YES) + + set(DOXYGEN_QUIET YES) + 
set(DOXYGEN_WARN_IF_UNDOCUMENTED NO) + set(DOXYGEN_WARN_IF_DOC_ERROR YES) + set(DOXYGEN_WARN_NO_PARAMDOC YES) + set(DOXYGEN_SHOW_INCLUDE_FILES NO) + set(DOXYGEN_WARN_IF_UNDOC_ENUM_VAL NO) + list(APPEND DOXYGEN_ALIASES "returnstatus=@return FENIX_SUCCESS if successful, any [return code](@ref ReturnCodes) otherwise.") + list(APPEND DOXYGEN_ALIASES "unimplemented=@qualifier UNIMPLEMENTED @brief @htmlonly @endhtmlonly UNIMPLEMENTED @htmlonly @endhtmlonly") + + doxygen_add_docs(doc + doc/Introduction.md doc/fake_init.h include src + ALL + COMMENT "Generate Fenix documentation") + message(STATUS "Run `make doc` to build documentation") + + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/doc DESTINATION ${CMAKE_INSTALL_PREFIX}) + + else() + message(STATUS "Doxygen not found, `make docs` disabled") + endif() endif() diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml new file mode 100644 index 0000000..535b044 --- /dev/null +++ b/doc/DoxygenLayout.xml @@ -0,0 +1,269 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/DoxygenStyle.css b/doc/DoxygenStyle.css new file mode 100644 index 0000000..770b1c6 --- /dev/null +++ b/doc/DoxygenStyle.css @@ -0,0 +1,41 @@ +/*Move qualifiers (e.g. 
collective, unimplemented) to being above function name instead of bottom right*/ +/* It's too easy to miss as-is, especially the unimplemented tag.*/ +table.mlabels { + direction: rtl; + writing-mode: vertical-rl; +} +/*Undo the weird writing-mode changes at each mlabels table member*/ +table.mlabels td.mlabels-right { + writing-mode: horizontal-tb; + text-align: left; + width: auto; +} +table.mlabels td.mlabels-left { + writing-mode: horizontal-tb; + text-align: left; + width: auto; +} +/*Undo the table direction change in the subtable of function parameters*/ +table.mlabels table.memname { + float: left; + direction: ltr; +} + +/*Make the qualifier labels slightly larger, and bold.*/ +table.mlabels td.mlabels-right span.mlabel { + font-weight: bold; + font-size: 12px; +} + + +/* + * Hide the "UNIMPLEMENTED" tag within the function's detailed description + * It's visible already. +*/ +div.memdoc span.mlabel { + display: none; +} + +table.params { + word-break: break-all; /*break-all is a word-break value; word-wrap rejects it and the rule would be dropped*/ +} diff --git a/doc/Introduction.md b/doc/Introduction.md new file mode 100644 index 0000000..1e8d68b --- /dev/null +++ b/doc/Introduction.md @@ -0,0 +1,51 @@ +Fenix is a software library compatible with the Message Passing +Interface (MPI) to support fault recovery without application +shutdown. Fenix has two components: process recovery and data +recovery. Process recovery is used to repair communicators whose +ranks suffered failure detected by the MPI runtime. Data recovery +is an optional feature that can be used to implement a +high-performance in-memory checkpoint/restart mechanism. + +Below is a brief overview of these two components, but see the +[Process Recovery](@ref ProcessRecovery) and [Data Recovery](@ref DataRecovery) +topics for more details. + +## Process Recovery + +The core feature of process recovery is creation of a resilient +communicator that will automatically repair itself. This recovery +is achieved by setting aside some number of ranks as *spare ranks*.
+When a failure is detected, the spare ranks are used to replace +the failed ranks. + +The exact process of recovery is subject to some nuances of the OpenMPI +ULFM specification, which Fenix is implemented on top of. For example, +messages may have locally succeeded while failing on other participating +ranks. + +![An example process flow diagram for recovery using Fenix](fenix_process_flow.png){html: width=300px} + +The default recovery pattern is to perform a `longjmp` to the location of +#Fenix_Init following communicator repairs. This emulates the typical offline +checkpoint/restart pattern, but without the need to restart the application. +However, `longjmp` has some nebulous behavior in many applications. Fenix also +supports a non-jumping recovery pattern. This is more predictable across compilers +and optimizations, but requires checking the return value of every MPI call to +detect failed operations (though communicator repair is still automatic). A +good practice for C++ applications is to use the non-jumping pattern, but add +a Fenix error-handler callback to throw an exception on failure. + +## Data Recovery + +Fenix provides its own redundant data storage API to facilitate +data recovery along with process recovery, but the user can choose +other data recovery options to meet a variety of application needs. +For example, data could be recovered by approximately interpolating +values from unaffected, topologically neighboring ranks instead of +by reading stored redundant data. In addition, the user may decide +to use external libraries such as +[VeloC](https://veloc.readthedocs.io/en/latest/). + +> Any Fenix function without a return type, e.g. #Fenix_Init, may be +> implemented via macros, in which case it cannot be used to resolve +> function pointers. 
diff --git a/doc/examples/DataRecovery.md b/doc/examples/DataRecovery.md new file mode 100644 index 0000000..e2f8ddf --- /dev/null +++ b/doc/examples/DataRecovery.md @@ -0,0 +1,114 @@ +Fenix provides options for redundant storage of application data +to facilitate application data recovery in a transparent manner. +Fenix contains functions to control consistency of collections of +such data, as well as their level of persistence. Functions with +the prefix \c Fenix\_Data\_ perform store, versioning, restore, +and other relevant operations and form the Fenix data recovery API. +The user can select a specific set of application data, identified +by its location in memory, label it using [Fenix_Data_member_create](@ref Fenix_Data_member_create), +and copy it into Fenix's redundant storage space through +[Fenix_Data_member(i)store(v)](@ref Fenix_Data_member_store) at a +point in time. Subsequently, #Fenix_Data_commit finalizes all +preceding Fenix store operations involving this data group and +assigns a unique time stamp to the resulting data *snapshot*, +marking the data as potentially recoverable after a loss of ranks. +Individual pieces of data can then be restored whenever they are +needed with #Fenix_Data_member_restore, for example after a failure +occurs. We note that Fenix's data storage and recovery facility +aims primarily to support in-memory recovery. + +Populating redundant data storage using Fenix may involve the +dispersion of data created by one rank to other ranks within the +system, making the store operation semantically a collective +operation. However, Fenix does not require store operations to be +globally synchronizing. For example, execution of + #Fenix_Data_member_store for a particular collection of data +could potentially be finished in some ranks, but not yet in others. 
+And if certain ranks nominally participating in the storage +operations have no actual data movement responsibility, Fenix is +allowed to let them exit the operation immediately. Consequently, +Fenix data storage functions should not be used for synchronization +purposes. + +Multiple distinct pieces (members) of data assigned to Fenix-managed +redundant storage can be associated with a specific instance of +a Fenix *data group* to form a semantic unit. Committing such a +group ensures that the data involved is available for recovery. + +## Data Groups + +----- +A Fenix *data group* provides dual functionality. First, it serves +as a container for a set of data objects (*members*) that are +committed together, and hence provides transaction semantics. +Second, it recognizes that #Fenix_Data_member_store is an operation +carried out collectively by a group of ranks, but not necessarily +by all active ranks in the MPI environment. Hence, it adopts the +convenient MPI vehicle of \c communicators to indicate the subset +of ranks involved. Data groups are composed of members that +describe the actual application data and the redundancy policy +to be used for securely storing the members. + +Data groups can and should be recreated after each failure (i.e. do not +conditionally skip the creation after initialization). + +See #Fenix_Data_group_create +for creating a data group. + +## Data Redundancy Policies + +----- +Fenix internally uses an extensible system for defining data +policies to keep the door open to easily adding new data policies +and configuring them on a per-data-group basis. We currently +support a single, configurable, memory-based policy. + +### In Memory Redundancy Policy (IMR) + +IMR is referenced with the FENIX_DATA_POLICY_IN_MEMORY_RAID definition, +and takes as input an array of integers with the following usage: + +* Mode: (1 or 5) Chooses storage mimicking the given RAID style.
+* Separation: Sets the rank separation for groups used to store redundant data. + Users should choose a separation that attempts to ensure the ranks + chosen for grouping are not colocated on nodes/racks to minimize the + chance of multiple ranks in a group failing at the same time. +* GroupSize: For Mode 5 only, sets the size of the parity groups, minimum 3. + +The policy is designed to localize recovery as much as possible. Communication +amongst group members is required (as failure during recovery operations +can lead to inconsistent beliefs about which ranks have recovered data), +but groups without recovering ranks may then all recover locally rather +than communicating further. Groups need not wait for ranks outside of +their group to enter or exit recovery. + +* **Mode 1**: Groups ranks into dyadically paired partners of Rank N and + Rank (N+Separation). For odd-size communicators, a single + group of size 3 will also be formed from the first, middle, and last + ranks. Each rank stores a copy of its own data and a copy of + its partner's. For groups of three, partner data storage is + chained. Should both partners fail (or any two for groups of + three) before recovery operations have completed, data will be + unrecoverable. + + **Memory Usage**: Each rank stores a copy of its own data and of its + partner's data for each timestamp, where checkpoint depth D + stores D+1 checkpoints. Therefore for data size M, + (D+1)*M*2 bytes are used. + + **Computation**: None. + +* **Mode 5**: Groups ranks into parity groups of size GroupSize. + Groups are formed of Rank N, N+Separation, N+2*Separation, and so on. + If any two ranks in a group fail before recovery operations + have completed, data will be unrecoverable. + + **Memory Usage**: Each rank stores a copy of its own data and + M/(GroupSize-1) parity bytes per timestamp. Therefore, + (D+1)*M*(GroupSize/(GroupSize-1)) bytes are used. + + **Computation**: O(M) parity bit calculations.
+ +These options enable users to trade reliability and computation for memory +space, which may be necessary for applications with large memory usage. + diff --git a/doc/examples/IMR.md b/doc/examples/IMR.md new file mode 100644 index 0000000..76f852a --- /dev/null +++ b/doc/examples/IMR.md @@ -0,0 +1,6 @@ +# In Memory Redundancy (IMR) {#md_IMR} + +Fenix supports one data storage policy, IMR, +which stores data through either a RAID-1-like +buddy rank mechanism or a RAID-5-like parity +mechanism. diff --git a/doc/examples/ProcessRecovery.md b/doc/examples/ProcessRecovery.md new file mode 100644 index 0000000..e2d8e1f --- /dev/null +++ b/doc/examples/ProcessRecovery.md @@ -0,0 +1,19 @@ +Functions and types for process recovery. + +* Only communicators derived from the communicator returned by +Fenix_Init are eligible for reconstruction. +After communicators have been repaired, they contain the same +number of ranks as before the failure occurred, unless the user +did not allocate sufficient redundant resources (*spare ranks*) +and instructed Fenix not to create new ranks. In this case +communicators will still be repaired, but will contain fewer +ranks than before the failure occurred. + +* To ease adoption of MPI fault tolerance, Fenix automatically +captures any errors resulting from MPI library calls that are a +result of a damaged communicator (other errors reported by the +MPI runtime are ignored by Fenix and are returned to the +application, for handling by the application writer). In other +words, programmers do not need to replace calls to the MPI library +with calls to Fenix (for example, *Fenix_Send* instead of +*MPI_Send*). 
diff --git a/doc/fake_init.h b/doc/fake_init.h new file mode 100644 index 0000000..a9afa16 --- /dev/null +++ b/doc/fake_init.h @@ -0,0 +1,4 @@ +//!@weakgroup ProcessRecovery +//!@{ +void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error); +//!@} diff --git a/doc/img/fenix_process_flow.png b/doc/img/fenix_process_flow.png new file mode 100644 index 0000000..e94029a Binary files /dev/null and b/doc/img/fenix_process_flow.png differ diff --git a/include/fenix.h b/include/fenix.h index 1a283bf..b875ed7 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -66,10 +66,21 @@ extern "C" { #include "fenix_data_subset.h" #include "fenix_process_recovery.h" +/** + * @file + * @brief Contains all API function calls and Fenix types. + * This is the only header file a user should include. + */ + +/** + * @defgroup ReturnCodes Return Codes + * @brief All possible return codes from Fenix functions. + * @{ + */ #define FENIX_SUCCESS 0 #define FENIX_ERROR_UNINITIALIZED -9 #define FENIX_ERROR_NOCATEGORY -10 -#define FENIX_ERROR_CALLBACK_NOT_REGISTERD -11 +#define FENIX_ERROR_CALLBACK_NOT_REGISTERED -11 #define FENIX_ERROR_GROUP_CREATE -12 #define FENIX_ERROR_MEMBER_CREATE -13 #define FENIX_ERROR_COMMIT_BARRIER -133 @@ -91,39 +102,110 @@ extern "C" { #define FENIX_ERROR_CANCELLED -50 #define FENIX_WARNING_SPARE_RANKS_DEPLETED 100 #define FENIX_WARNING_PARTIAL_RESTORE 101 +/**@}*/ -#define FENIX_DATA_GROUP_WORLD_ID 10 -#define FENIX_GROUP_ID_MAX 11 -#define FENIX_TIME_STAMP_MAX 12 -#define FENIX_DATA_MEMBER_ALL 15 -#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11 -#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12 -#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13 -#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14 -#define FENIX_DATA_SNAPSHOT_LATEST -1 -#define FENIX_DATA_SNAPSHOT_ALL 16 -#define FENIX_DATA_SUBSET_CREATED 2 - +//!@internal @brief Agreement code for error handler #define FENIX_ERRHANDLER_LOC 1 
+//!@internal @brief Agreement code for data commit barrier #define FENIX_DATA_COMMIT_BARRIER_LOC 2 -#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13 +/** + * @defgroup ProcessRecovery Process Recovery + * @details @include{doc} ProcessRecovery.md + * @{ + */ + +/** + * @brief All possible roles returned by Fenix_Init + * + * Describes the current process's state in reference + * to process recovery. + * + * It is important to note that FENIX_ROLE_RECOVERED_RANK + * is only guaranteed to be the value after a single failure, + * so users ought not use the role to directly ensure a valid + * state if they desire to be resilient to failures during their + * failure recovery process. + */ typedef enum { + //!No failures have occurred yet FENIX_ROLE_INITIAL_RANK = 0, + //!This rank was a spare before the most recent failure, or was just spawned FENIX_ROLE_RECOVERED_RANK = 1, + //!This rank was not a spare before the most recent failure FENIX_ROLE_SURVIVOR_RANK = 2 } Fenix_Rank_role; -typedef struct { - MPI_Request mpi_send_req; - MPI_Request mpi_recv_req; -} Fenix_Request; - -extern const Fenix_Data_subset FENIX_DATA_SUBSET_FULL; -extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY; - +/** + * @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error); + * @brief Build a resilient communicator and set the restart point. + * + * This function must be called by all ranks in \c comm, after MPI initialization. All calling ranks must + * pass the same values for the parameters \c comm, \c spare_ranks, \c spawn, and \c info. \c Fenix_Init + * must be called exactly once by each rank. This function is used (1) to activate the Fenix library, (2) + * to specify extra resources in case of rank failure, and (3) to create a logical resumption point in case + * of rank failure. + * + * For C, the program may rely on the state of any variables defined and set before the call to \c Fenix_Init.
+ * But note that the code executed before \c Fenix_Init is executed by all ranks in the system (including spare + * ranks, see below). For C++, the state of objects declared before \c Fenix_Init but within the same scope as + * \c Fenix_Init is compiler-dependent, and it is recommended to place \c Fenix_Init within a subscope excluding + * any variables expected to not be destructed. + * + * It is recommended to access argc and argv only after executing \c Fenix_Init, since command line arguments + * passed to this function that apply to Fenix may be removed by \c Fenix_Init. + * + * \c Fenix_Init is blocking in the following sense. If it is entered for the first time via a regular, explicit + * function call, it must be entered by all ranks in communicator \c comm. If it is entered after an error + * intercepted by Fenix (it is the default execution resumption point, see \c info below), no ranks are allowed + * to exit from it until all *non-failed* ranks have returned control to it. **Note**: Typically control is + * returned automatically through revocation of the resilient communicator, which means ranks which have long + * delays between MPI function calls or ranks which only use communicators unaffected by failure may lead to + * long delays between a failure and its recovery. + * + * Ranks to be used as spare ranks by Fenix will be available to the application only before \c Fenix_Init, + * or after they are used to replace a failed rank, in which case they turn into active ranks. This document + * refers to the latter as \c RECOVERED ranks (see #Fenix_Rank_role). Note that all spare + * ranks that have not been used to recover from failures (and, therefore, are still reserved by Fenix and kept + * inside \c Fenix_Init) will automatically call \c MPI_Finalize and exit when all active ranks have entered the + * #Fenix_Finalize call. + * + * No Fenix functions may be called before \c Fenix_Init, except #Fenix_Initialized.
+ * + * @param[out] role The current role of this rank (see #Fenix_Rank_role) + * @param[in] comm The base communicator to construct a resilient communicator from, + * which must include any spare ranks (see below) the user deems necessary. + * MPI_COMM_WORLD is a valid value, but MPI_COMM_SELF is not. + * @param[out] newcomm Resilient output communicator, managed by Fenix and derived + * from comm, to be used by the application instead of comm. + * @param[inout] argc Pointer to application main's argc parameter + * @param[inout] argv Pointer to application main's argv parameter + * @param[in] spare_ranks The number of ranks in comm that are exempted by Fenix + * in the construction of the resilient communicator by Fenix_Init. These ranks + * are kept in reserve to substitute for failed ranks. Failed ranks in resilient + * communicators are replaced by spare or spawned ranks. + * @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace + * failed ranks when spares are unavailable. + * @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL. + * Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon + * rank failure for all active (non-spare) ranks in any resilient communicators, not only for + * those ranks in communicators that failed. The following values associated with the + * "FENIX_RESUME_MODE" key are supported: + * - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init. + * - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled + * as normal, but return the error code as well. Applications should typically + * either check for return codes or assign an error callback through Fenix. + * @param[out] error The return status of \c Fenix_Init
+ * Used to signal that a non-fatal error or special condition was encountered in the execution of + * Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by + * Fenix_Init. If spawning is explicitly disabled (_spawn equals false) and spare ranks have been + * depleted, Fenix will repair resilience communicators by shrinking them and will report such + * shrinkage in this return parameter through the value FENIX_WARNING_SPARE_RANKS_DEPLETED. + */ + +//!@internal #define Fenix_Init(_role, _comm, _newcomm, _argc, _argv, _spare_ranks, \ _spawn, _info, _error) \ { \ @@ -137,96 +219,461 @@ extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY; __fenix_postinit( _error ); \ } -int Fenix_Initialized(int *); +/** + * @brief Sets flag to true if Fenix_Init has been called, else false. + * @param[out] flag Pointer to the flag to be set. + * @returnstatus + */ +int Fenix_Initialized(int *flag); + +/** + * @brief Register a callback to be invoked after failure process recovery. + * + * This function registers a callback to be invoked after a failure has been recovered by Fenix, + * and right before resuming application execution (e.g. returning from #Fenix_Init by default). + * If this function is called more than once, the different callbacks will be called in the + * reverse order that they were registered (i.e. as a callback stack). + * + * Callback functions are passed the newly-repaired resilient communicator, the error code returned + * by MPI in the communication action which caused a failure recovery, and the user-provided \c void* + * callback data. + * + * Callbacks will only be invoked by survivor ranks, since spare ranks or respawned ranks had no way + * to register them before a failure. + * + * @param[in] recover the callback function to register. + * @param[in] callback_data The user-provided data which will be passed to the callback. 
+ * + * @returnstatus + */ int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *), void *callback_data); + +//!@unimplemented Returns the number of ranks with a given #Fenix_Rank_role int Fenix_get_number_of_ranks_with_role(int, int *); +//!@unimplemented Returns the #Fenix_Rank_role for a given rank int Fenix_get_role(MPI_Comm comm, int rank, int *role); +/** + * @brief Get the list of ranks that failed in the most recent failure. + * @param[out] fail_list Set to a list of failed ranks. + * @return The number of failed ranks. + */ +int Fenix_Process_fail_list(int** fail_list); + +/** + * @brief Check a pre-recovery request without error + * @param[in] request The request to check + * @param[out] status The status of the request + * @return True if the request was cancelled or has unknown completion status, + * false if it completed successfully. + */ +int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status); + + +/** + * @brief Clean up Fenix state. Each active rank must call \c Fenix_Finalize before exiting. + * + * This function cleans up all Fenix state, if any. If an MPI program using the Fenix library terminates + * normally (i.e. not due to a call to \c MPI_Abort, or an unrecoverable error) then each rank must call + * \c Fenix_Finalize before it exits. It must be called before \c MPI_Finalize, and after #Fenix_Init. + * There shall be no function calls after this function, except #Fenix_Initialized. + * + * As noted in the description of #Fenix_Init, all spare ranks that have not been used to + * recover from failures (and therefore are still reserved by Fenix and kept inside #Fenix_Init) will call + * \c MPI_Finalize and exit when all active ranks have called \c Fenix_Finalize. + * + * **Advice**: Sometimes users may want to remove ranks proactively from the execution, for example because + * monitoring data shows that failure of a rank is imminent or that a rank is executing un-manageably slowly. 
+ * This can be accomplished by calling \c exit on the targeted ranks, followed by an invocation of MPI_Barrier. + * The removed ranks will be reported as failed and error handling will progress appropriately. No calls to finalize + * are needed in this case. + */ int Fenix_Finalize(); -int Fenix_Data_group_create(int group_id, MPI_Comm, int start_time_stamp, +/**@}*/ + + +/** + * @defgroup DataRecovery Data Recovery + * @brief Functions for storing and restoring data in Fenix. + * @details @include{doc} DataRecovery.md + * + * @{ + */ +#define FENIX_DATA_GROUP_WORLD_ID 10 +#define FENIX_GROUP_ID_MAX 11 +#define FENIX_TIME_STAMP_MAX 12 +#define FENIX_DATA_MEMBER_ALL 15 +#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11 +#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12 +#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13 +#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14 +#define FENIX_DATA_SNAPSHOT_LATEST -1 +#define FENIX_DATA_SNAPSHOT_ALL 16 +#define FENIX_DATA_SUBSET_CREATED 2 + +#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13 + +/** + * @unimplemented As MPI_Request, but for Fenix asynchronous data recovery calls + */ +typedef struct { + MPI_Request mpi_send_req; + MPI_Request mpi_recv_req; +} Fenix_Request; + +//!@brief A standin for checkpointing/recovering all available data in a member. +extern const Fenix_Data_subset FENIX_DATA_SUBSET_FULL; + +//!@brief A standin for checkpointing/recovering none of the available data in a member. +extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY; + + +/** + * @brief Create a Data Group + * @qualifier collective + * + * If a group with this group_id was already created in the past and has not been deleted, the + * parameters of this call are ignored and this function simply serves to coordinate with any + * ranks that have not yet created this group (e.g. due to a failure). + * + * All calling ranks must pass the same values for the parameters \c group_id, \c comm, + * \c start_time_stamp, \c policy_name, and \c policy_value. 
+ * + * @param group_id A unique identifier for this group. + * @param comm A resilient communicator on which the group is formed. + * @param start_time_stamp The time_stamp to be used for the first commit in this group. + * @param depth + * @parblock + * The number of successive snapshots of this group that are retained by Fenix, in + * addition to the most recent one, and that can be recovered by calling Fenix data member + * restore functions. + * + * For example, a depth of 0 means Fenix will keep only the necessary data to restore the + * most recent snapshot, freeing or overwriting older snapshots automatically. A depth + * of -1 is currently not supported, but would ordinarily indicate that no snapshots should + * be removed automatically. + * @endparblock + * @param policy_name Currently, may only be FENIX_DATA_POLICY_IN_MEMORY_RAID + * @param policy_value Pointer to data passed along to the policy. + * See the specific policy for more information. + * @param flag Pointer to store policy-specific status or errors + * @return FENIX_SUCCESS, or an error value. + */ +int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp, int depth, int policy_name, void* policy_value, int* flag); +/** + * @brief Create a data member for store/restore operations + * @qualifier collective + * @qualifier local + * + * All calling ranks in the group's communicator must pass the same values for the parameters + * \c member_id, \c datatype, and \c group_id. + * + * @param group_id Identifier of the data group within which to create the member. + * @param member_id An integer unique within the data group that identifies the data in + * \c buffer. Must be nonnegative and less than FENIX_MEMBER_ID_MAX, which is + * guaranteed to be at least 2^30. + * @param buffer Address of the data to be copied to redundant storage maintained by Fenix.
+ * Note that this parameter may also be specified using #Fenix_Data_member_attr_set. This + * is critical after a failure for non-survivor ranks, which hold an invalid address + * that was generated on the failed rank and must update it. + * @param count The maximum number of contiguous elements of type \c datatype of the data to be + * stored. Need not be the same in all calling ranks. + * @param datatype The MPI_Datatype of the elements in \c buffer + * + * @return FENIX_SUCCESS, or an error value. + */ int Fenix_Data_member_create(int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype); +/** + * @brief Get the storage policy of a data group + * + * @param group_id Identifier of the data group to query + * @param policy_name The identifier of the policy name of the data group. + * @param policy_value A location within which to store the policy_values this group's + * policy was configured with. + * @param flag A location set to true if a policy value was extracted, else false. + * @return FENIX_SUCCESS, or an error value. + */ int Fenix_Data_group_get_redundancy_policy(int group_id, int* policy_name, void *policy_value, int *flag); +//!@unimplemented Block on completion of the store operation specified by the request. int Fenix_Data_wait(Fenix_Request request); + +//!@unimplemented Query completion of the store operation specified by the request. int Fenix_Data_test(Fenix_Request request, int *flag); + +/** + * @brief Store a particular group member into the group's resilient storage space, in uncommitted storage. + * @qualifier collective + * + * The user can safely modify the member's data buffer after this call, as the current state is copied immediately. + * Multiple calls may be used to incrementally store data (using subset_specifiers), or overwrite old data prior to a commit.
+ * + * @param group_id All ranks must provide the same group_id + * @param member_id All ranks must provide the same member_id + * @param subset_specifier Which subset of the data to store. It is always valid for every rank to provide the same + * subset_specifier; depending on the group's policy, varying combinations of specifiers may be possible. + * @return FENIX_SUCCESS, or an error value. + */ int Fenix_Data_member_store(int group_id, int member_id, Fenix_Data_subset subset_specifier); -int Fenix_Data_member_storev(int member_id, int group_id, + +//!@unimplemented As [store](#Fenix_Data_member_store), but subsets may vary rank-to-rank. +int Fenix_Data_member_storev(int group_id, int member_id, Fenix_Data_subset subset_specifier); -int Fenix_Data_member_istore(int member_id, int group_id, +//!@unimplemented As [store](#Fenix_Data_member_store), but asynchronous. +int Fenix_Data_member_istore(int group_id, int member_id, Fenix_Data_subset subset_specifier, Fenix_Request *request); -int Fenix_Data_member_istorev(int member_id, int group_id, +//!@unimplemented As [istore](#Fenix_Data_member_istore), but subsets may vary rank-to-rank. +int Fenix_Data_member_istorev(int group_id, int member_id, Fenix_Data_subset subset_specifier, Fenix_Request *request); +/** + * @brief Commit stored data members to the group's next snapshot. + * @qualifier collective + * @qualifier local + * + * This function is used to freeze the current state of a data group, + * together with all its application data that has been stored in Fenix’ + * redundant storage, and label it with a time stamp, thus creating a + * snapshot of the stored application data. Only data that has been + * committed is eligible for recovery through #Fenix_Data_member_restore. + * An application needs to call #Fenix_Data_wait for all pending asynchronous + * [Fenix_Data_member_istore(v)](@ref Fenix_Data_member_istore) operations + * in the group before committing. 
+ * + * @param[in] group_id The group to commit + * @param[out] time_stamp The time stamp of the new snapshot + * @returnstatus + */ int Fenix_Data_commit(int group_id, int *time_stamp); +/** + * @brief As [commit](#Fenix_Data_commit), but ensures a globally consistent commit. + * @qualifier collective + * + * This function does not function as a traditional barrier. + * The commit will proceed if all *non-failed* ranks reach the barrier. + * This allows for commits to be made when a rank fails after storing all + * of its data into resilient storage. + * + * @param[in] group_id The group to commit + * @param[out] time_stamp The time stamp of the new snapshot + * @returnstatus + */ int Fenix_Data_commit_barrier(int group_id, int *time_stamp); +//!@unimplemented Block until all ranks in the group have reached this point. int Fenix_Data_barrier(int group_id); +/** + * @brief Restore the data of a group member from a snapshot. + * @qualifier collective + * + * All ranks in the group’s resilient communicator must pass the + * same values for the parameters group_id, member_id, and time_stamp. + * This function is used to retrieve data from consistent snapshot + * members. This function can only be used if the size of the + * communicator used to store the data is the same as that at the time + * of data recovery (this implies non-shrinking communicator recovery + * in case of a rank loss). + * + * If the size of the buffer needing to receive the recovery data is + * unknown for a particular rank, it can be queried using + * #Fenix_Data_member_attr_get. 
+ * + * @param[in] group_id The group to restore from + * @param[in] member_id The member to restore + * @param[out] target_buffer The buffer to store the restored data + * @param[in] max_count The maximum number of elements to restore + * @param[in] time_stamp The time stamp of the snapshot to restore from + * @param[out] found_data The subset of the data that was found in the snapshot + * @returnstatus + */ int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* found_data); +/** + * @brief Local-only version of Fenix_Data_member_restore + * + * This function restores the data of a group member from the local + * snapshot. + * + * @param[in] group_id The group to restore from + * @param[in] member_id The member to restore + * @param[out] target_buffer The buffer to store the restored data + * @param[in] max_count The maximum number of elements to restore + * @param[in] time_stamp The time stamp of the snapshot to restore from + * @param[out] found_data The subset of the data that was found in the snapshot + * @returnstatus + */ int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* found_data); +//!@unimplemented As #Fenix_Data_member_restore, but restores from a specific rank's data. int Fenix_Data_member_restore_from_rank(int member_id, void *data, int max_count, int time_stamp, int group_id, int source_rank); +/** + * @brief Create a data subset for use in store operations. + * + * Creates a subset based on num_blocks pairs of + * {start_offset,end_offset}, + * {start_offset+stride,end_offset+stride}, + * {start_offset+2*stride,end_offset+2*stride}, + * etc. + * + * The value of start_offset must be smaller than or equal + * to the value of end_offset to indicate non-negative block + * size. Otherwise, the function returns an error code. 
+ * + * Created subsets must be deleted with #Fenix_Data_subset_delete + * to free memory. + * + * @param[in] num_blocks The number of contiguous data blocks. + * @param[in] start_offset The index of the first element in the first data block. + * @param[in] end_offset The index of the last element in the first data block. + * @param[in] stride Regular shift between successive data blocks. + * @param[out] subset_specifier The created subset. + * @returnstatus + */ int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset, int stride, Fenix_Data_subset *subset_specifier); +/** + * @brief As #Fenix_Data_subset_create, but with varying start and end offsets. + * + * Creates a subset based on num_blocks pairs of {start_offset,end_offset}. + * The value of start_offset must be smaller than or equal to end_offset + * to indicate non-negative block size. Otherwise, the function returns an + * error code. + * + * Created subsets must be deleted with #Fenix_Data_subset_delete + * to free memory. + * + * @param[in] num_blocks The number of contiguous data blocks. + * @param[in] array_start_offsets The index of the first element in each data block. + * @param[in] array_end_offsets The index of the last element in each data block. + * @param[out] subset_specifier The created subset. + * @returnstatus + */ int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets, int *array_end_offsets, Fenix_Data_subset *subset_specifier); +/** + * @brief Delete a data subset. + * + * Frees the memory associated with a data subset object. + * + * @param[in] subset_specifier The subset to delete. + * @returnstatus + */ int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier); +//!@unimplemented Get the number of members in a data group. 
 int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members); -int Fenix_Data_group_get_member_at_position(int position, int *member_id, - int group_id); - +//!@unimplemented Get member ID based on member index +int Fenix_Data_group_get_member_at_position(int group_id, int *member_id, + int position); + +/** + * @brief Get the number of locally-available snapshots in a data group. + * + * May include snapshots that are inconsistent across the group. + * + * @param[in] group_id The group to query + * @param[out] number_of_snapshots The number of snapshots in the group + * @returnstatus + */ int Fenix_Data_group_get_number_of_snapshots(int group_id, int *number_of_snapshots); +/** + * @brief Get the time stamp of a snapshot at a given index. + * + * Snapshots are indexed in the reverse of the order in which the user committed them + * (e.g. the most recent available snapshot has position=0). + * + * @param[in] group_id The group to query + * @param[in] position The index of the snapshot, which must be in [0, number_of_snapshots) + * @param[out] time_stamp The time stamp of the snapshot + * @returnstatus + */ int Fenix_Data_group_get_snapshot_at_position(int group_id, int position, int *time_stamp); +//!@unimplemented Get the value of a member's attribute. int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename, void *attributevalue, int *flag, int source_rank); +/** + * @brief Set the value of a member's attribute. + * + * Valid names are #FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, #FENIX_DATA_MEMBER_ATTRIBUTE_COUNT, + * and #FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE. + * + * The COUNT and DATATYPE attributes may only be set before the first store operation. + * Contrary to the Fenix specification, returning to #Fenix_Init after a failure does not + * allow the user to set these attributes again. 
+ * + * @param[in] group_id The group to update + * @param[in] member_id The member to update + * @param[in] attribute_name The attribute to update + * @param[in] attribute_value The new value of the attribute + * @param[out] flag Set to true if the attribute was set, else false + * @returnstatus + */ int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name, void *attribute_value, int *flag); +/** + * @brief Delete a snapshot from a data group. + * @qualifier local + * + * @param[in] group_id The group to delete from + * @param[in] time_stamp The time stamp of the snapshot to delete + * @returnstatus + */ int Fenix_Data_snapshot_delete(int group_id, int time_stamp); +/** + * @brief Delete a data group. + * @qualifier local + * + * @param[in] group_id The group to delete + * @returnstatus + */ int Fenix_Data_group_delete(int group_id); +/** + * @brief Delete a data member. + * @qualifier local + * + * @param[in] group_id The group to delete from + * @param[in] member_id The member to delete + * @returnstatus + */ int Fenix_Data_member_delete(int group_id, int member_id); - -int Fenix_Process_fail_list(int** fail_list); - -int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status); +/**@}*/ #if defined(c_plusplus) || defined(__cplusplus) }