Skip to content

Commit

Permalink
Add CommException class
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Whitlock committed Sep 26, 2024
1 parent 49c3bbb commit f624356
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 0 deletions.
2 changes: 2 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
#include <setjmp.h>

#if defined(c_plusplus) || defined(__cplusplus)
#include "fenix_exception.hpp"

extern "C" {
#endif
#include "fenix_data_subset.h"
Expand Down
80 changes: 80 additions & 0 deletions include/fenix_exception.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
//@HEADER
// ************************************************************************
//
//
// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
// _| _| _|_| _| _| _| _|
// _|_|_| _|_|_| _| _| _| _| _|
// _| _| _| _|_| _| _| _|
// _| _|_|_|_| _| _| _|_|_| _| _|
//
//
//
//
// Copyright (C) 2016 Rutgers University and Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
// Marc Gamell (mgamell@cac.rutgers.edu)
//
// ************************************************************************
//@HEADER
*/

#ifndef FENIX_EXCEPTION_HPP
#define FENIX_EXCEPTION_HPP

#include <mpi.h>
#include <exception>

namespace Fenix {
//Exception type describing a communication failure reported through Fenix.
//It only stores the three values given to its constructor; it does not own
// or release any of them.
struct CommException : public std::exception {
  //Communicator supplied at construction.
  MPI_Comm repaired_comm;
  //Fenix error code supplied at construction; immutable once thrown.
  const int fenix_err;
  //Opaque user pointer supplied at construction (nullptr if omitted).
  void* user_data;

  CommException(MPI_Comm comm, int err, void* data = nullptr) :
    repaired_comm(comm), fenix_err(err), user_data(data) { }

  //Gives handlers that only see a std::exception (and uncaught-exception
  // reports) a recognizable message instead of the implementation default.
  //Returns a string literal, so the pointer stays valid indefinitely.
  const char* what() const noexcept override {
    return "Fenix::CommException: communication failure handled by Fenix";
  }
};

//Registers a callback that throws a CommException after any Fenix-handled
// communication failure.
//Parameter user_data is passed through to the thrown exception
// (defaults to nullptr).
//Returns the callback ID.
int register_exception_callback(void* user_data = nullptr);

}

#endif
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h)

set (Fenix_SOURCES
fenix.cpp
fenix_exception.cpp
fenix_mpi_override.cpp
fenix_opt.cpp
fenix_process_recovery.cpp
Expand Down
13 changes: 13 additions & 0 deletions src/fenix_exception.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "fenix_exception.hpp"
#include "fenix.h"

namespace Fenix {
//Registers, via Fenix_Callback_register, a callback whose only action is to
// throw a CommException built from the arguments it is invoked with.
//user_data is forwarded unchanged as the callback's data argument.
//Returns the value produced by Fenix_Callback_register.
int register_exception_callback(void* user_data){
  //Captures nothing: everything the exception needs arrives as arguments.
  auto raise_comm_exception = [](MPI_Comm comm, int err_code, void* cb_data){
    throw CommException(comm, err_code, cb_data);
  };

  return Fenix_Callback_register(raise_comm_exception, user_data);
}
}
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ add_subdirectory(request_cancelled)
add_subdirectory(no_jump)
add_subdirectory(issend)
add_subdirectory(failed_spares)
add_subdirectory(exception_throw)
15 changes: 15 additions & 0 deletions test/exception_throw/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# This file is part of Fenix
# Copyright (c) 2016 Rutgers University and Sandia Corporation.
# This software is distributed under the BSD License.
# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
# the U.S. Government retains certain rights in this software.
# For more information, see the LICENSE file in the top Fenix
# directory.
#

# Build the exception test executable and link it against fenix and MPI.
add_executable(fenix_exceptions fenix_exceptions.cpp)
target_link_libraries(fenix_exceptions fenix MPI::MPI_CXX)

# Register the test; it is launched through the MPI launcher with 6 processes.
add_test(NAME exception_throw
COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 6 ${MPIEXEC_PREFLAGS} fenix_exceptions ${MPIEXEC_POSTFLAGS})
104 changes: 104 additions & 0 deletions test/exception_throw/fenix_exceptions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
//@HEADER
// ************************************************************************
//
//
// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
// _| _| _|_| _| _| _| _|
// _|_|_| _|_|_| _| _| _| _| _|
// _| _| _| _|_| _| _| _|
// _| _|_|_|_| _| _| _|_|_| _| _|
//
//
//
//
// Copyright (C) 2016 Rutgers University and Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
// Marc Gamell (mgamell@cac.rutgers.edu)
//
// ************************************************************************
//@HEADER
*/

#include <mpi.h>

#include <fenix.h>
#include <stdio.h>
#include <signal.h>
#include <sys/types.h>
#include <unistd.h>
#include <pthread.h>

int main(int argc, char **argv) {
volatile int status = 0;

MPI_Init(&argc, &argv);

int fenix_role, error;
MPI_Comm res_comm;
MPI_Info info;
MPI_Info_create(&info);
MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP");
MPI_Info_set(info, "FENIX_UNHANDLED_MODE", "NO_JUMP");
Fenix_Init(&fenix_role, MPI_COMM_WORLD, &res_comm, &argc, &argv, 0, 0, info, &error);

Fenix::register_exception_callback();

if(fenix_role == FENIX_ROLE_SURVIVOR_RANK){
printf("FAILURE: longjmp instead of exception\n");
status = 1;
}

if (fenix_role == FENIX_ROLE_INITIAL_RANK) {
int rank;
MPI_Comm_rank(res_comm, &rank);
if(rank == 1) raise(SIGKILL);

try {
MPI_Barrier(res_comm);
printf("FAILURE: barrier finished without fault\n");
status = 1;
} catch (Fenix::CommException e){
printf("SUCCESS: caught CommException\n");
}
}

Fenix_Finalize();
MPI_Finalize();

return status;
}

0 comments on commit f624356

Please sign in to comment.