Skip to content

Commit

Permalink
Test Bugfixes + don't try to pass MPI_Datatype between ranks
Browse files Browse the repository at this point in the history
MPI_Datatype is vendor-dependent and we aren't allowed to assume
anything about it. Right now, ompi implements as a pointer and we segfault
on recovery sometimes.

Fix unran test, add timeout parameter

Remove ompi version expected to fail from tests
  • Loading branch information
Matthew-Whitlock committed Feb 20, 2023
1 parent d7472e2 commit 43452ed
Show file tree
Hide file tree
Showing 11 changed files with 27 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .github/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ COPY --from=0 ompi_install/ /ompi_install/
ENV PATH="$PATH:/ompi_install/bin"
RUN mkdir fenix_build fenix_install && cd fenix_build && cmake ../fenix_src -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/ompi_install/bin/mpicc \
-DBUILD_EXAMPLES=ON -DBUILD_TESTING=ON -DCMAKE_INSTALL_PREFIX=../fenix_install -DMPIEXEC_PREFLAGS="--allow-run-as-root;--map-by;:OVERSUBSCRIBE" && make install -j8
CMD ["sh", "-c", "cd fenix_build && ctest --verbose"]
CMD ["sh", "-c", "cd fenix_build && ctest --verbose --timeout 60"]
20 changes: 10 additions & 10 deletions .github/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ x-fenix: &fenix
- type=gha,scope=default,mode=max

services:
fenix_ompi_5rc10:
<<: *fenix
image: "fenix:ompi_5rc10"
build:
<<: *fenix-build
x-bake:
cache-from:
- type=gha,scope=ompi_5rc10
cache-to:
- type=gha,scope=ompi_5rc10,mode=max
#fenix_ompi_5rc10:
# <<: *fenix
# image: "fenix:ompi_5rc10"
# build:
# <<: *fenix-build
# x-bake:
# cache-from:
# - type=gha,scope=ompi_5rc10
# cache-to:
# - type=gha,scope=ompi_5rc10,mode=max

fenix_ompi_5:
<<: *fenix
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/ci_checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@ jobs:
files: |
.github/docker-compose.yml
load: true
- name: Test open-mpi v5.0.0rc10
if: success() || failure()
run: docker run fenix:ompi_5rc10
- name: Test open-mpi v5.0.x
if: success() || failure()
run: docker run fenix:ompi_5
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ examples/05_subset_create/subset_create
examples/06_subset_createv/subset_createv
test/request_tracking/fenix_request_tracking_test
test/request_tracking/fenix_request_tracking_test_nofenix
build/

# Other
*~
Expand Down
4 changes: 1 addition & 3 deletions include/fenix_data_member.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ typedef struct __fenix_member_entry {
int memberid;
enum states state;
void *user_data;
MPI_Datatype current_datatype;
int datatype_size;
int current_count;
} fenix_member_entry_t;
Expand All @@ -80,7 +79,6 @@ typedef struct __fenix_member {

typedef struct __member_entry_packet {
int memberid;
MPI_Datatype current_datatype;
int datatype_size;
int current_count;
} fenix_member_entry_packet_t;
Expand All @@ -92,7 +90,7 @@ void __fenix_ensure_member_capacity( fenix_member_t *m );
void __fenix_ensure_version_capacity_from_member( fenix_member_t *m );

fenix_member_entry_t* __fenix_data_member_add_entry(fenix_member_t* member,
int memberid, void* data, int count, MPI_Datatype datatype);
int memberid, void* data, int count, int datatype_size);

int __fenix_data_member_send_metadata(int groupid, int memberid, int dest_rank);
int __fenix_data_member_recv_metadata(int groupid, int src_rank,
Expand Down
3 changes: 1 addition & 2 deletions include/fenix_data_recovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,14 @@


typedef struct __data_entry_packet {
MPI_Datatype datatype;
int count;
int datatype_size;
} fenix_data_entry_packet_t;


int __fenix_group_create(int, MPI_Comm, int, int, int, void*, int*);
int __fenix_group_get_redundancy_policy(int, int*, int*, int*);
int __fenix_member_create(int, int, void *, int, MPI_Datatype);
int __fenix_member_create(int, int, void *, int, int);
int __fenix_data_wait(Fenix_Request);
int __fenix_data_test(Fenix_Request, int *);
int __fenix_member_store(int, int, Fenix_Data_subset);
Expand Down
2 changes: 1 addition & 1 deletion src/fenix.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ int Fenix_Data_group_create( int group_id, MPI_Comm comm, int start_time_stamp,
}

int Fenix_Data_member_create( int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype ) {
return __fenix_member_create(group_id, member_id, buffer, count, datatype);
return __fenix_member_create(group_id, member_id, buffer, count, __fenix_get_size(datatype));
}

int Fenix_Data_group_get_redundancy_policy( int group_id, int* policy_name, void *policy_value, int *flag ) {
Expand Down
9 changes: 2 additions & 7 deletions src/fenix_data_member.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ int __fenix_find_next_member_position(fenix_member_t *member) {
}

fenix_member_entry_t* __fenix_data_member_add_entry(fenix_member_t* member,
int memberid, void* data, int count, MPI_Datatype datatype){
int memberid, void* data, int count, int datatype_size){

int member_index = __fenix_find_next_member_position(member);
fenix_member_entry_t* mentry = member->member_entry + member_index;
Expand All @@ -150,11 +150,7 @@ fenix_member_entry_t* __fenix_data_member_add_entry(fenix_member_t* member,
mentry->state = OCCUPIED;
mentry->user_data = data;
mentry->current_count = count;
mentry->current_datatype = datatype;

int dsize;
MPI_Type_size(datatype, &dsize);
mentry->datatype_size = dsize;
mentry->datatype_size = datatype_size;

member->count++;

Expand Down Expand Up @@ -222,7 +218,6 @@ int __fenix_data_member_send_metadata(int groupid, int memberid, int dest_rank){

fenix_member_entry_packet_t packet;
packet.memberid = mentry.memberid;
packet.current_datatype = mentry.current_datatype;
packet.datatype_size = mentry.datatype_size;
packet.current_count = mentry.current_count;

Expand Down
12 changes: 7 additions & 5 deletions src/fenix_data_policy_in_memory_raid.c
Original file line number Diff line number Diff line change
Expand Up @@ -703,8 +703,11 @@ int __imr_member_restore(fenix_group_t* g, int member_id,
//find_mentry returns the error status. We found the member (and corresponding data) if there are no errors.
int found_member = !(__imr_find_mentry(group, member_id, &mentry));

int member_data_index = __fenix_search_memberid(group->base.member, member_id);
fenix_member_entry_t member_data = group->base.member->member_entry[member_data_index];
fenix_member_entry_t member_data;
if(found_member){
int member_data_index = __fenix_search_memberid(group->base.member, member_id);
member_data = group->base.member->member_entry[member_data_index];
}

int recovery_locally_possible;

Expand Down Expand Up @@ -783,12 +786,11 @@ int __imr_member_restore(fenix_group_t* g, int member_id,

//We remake the new member just like the user would.
__fenix_member_create(group->base.groupid, packet.memberid, NULL, packet.current_count,
packet.current_datatype);
packet.datatype_size);

__imr_find_mentry(group, member_id, &mentry);
int member_data_index = __fenix_search_memberid(group->base.member, member_id);
member_data = group->base.member->member_entry[member_data_index];


MPI_Recv((void*)&(group->num_snapshots), 1, MPI_INT, group->partners[1],
RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm, NULL);
Expand Down Expand Up @@ -886,7 +888,7 @@ int __imr_member_restore(fenix_group_t* g, int member_id,

//We remake the new member just like the user would.
__fenix_member_create(group->base.groupid, packet.memberid, NULL, packet.current_count,
packet.current_datatype);
packet.datatype_size);

__imr_find_mentry(group, member_id, &mentry);
int member_data_index = __fenix_search_memberid(group->base.member, member_id);
Expand Down
7 changes: 2 additions & 5 deletions src/fenix_data_recovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,7 @@ int __fenix_group_get_redundancy_policy(int groupid, int* policy_name, int* poli
* @param count
* @param data_type
*/
int __fenix_member_create(int groupid, int memberid, void *data, int count, MPI_Datatype datatype ) {

int __fenix_member_create(int groupid, int memberid, void *data, int count, int datatype_size ) {
int retval = -1;
int group_index = __fenix_search_groupid( groupid, fenix.data_recovery );
int member_index = -1;
Expand Down Expand Up @@ -219,9 +218,8 @@ int __fenix_member_create(int groupid, int memberid, void *data, int count, MPI_

//First, we'll make a fenix-core member entry, then pass that info to
//the specific data policy.
int member_index = __fenix_find_next_member_position(member);
fenix_member_entry_t* mentry;
mentry = __fenix_data_member_add_entry(member, memberid, data, count, datatype);
mentry = __fenix_data_member_add_entry(member, memberid, data, count, datatype_size);

//Pass the info along to the policy
retval = group->vtbl.member_create(group, mentry);
Expand Down Expand Up @@ -924,7 +922,6 @@ int __fenix_member_set_attribute(int groupid, int memberid, int attributename,
retval = FENIX_ERROR_INVALID_ATTRIBUTE_NAME;
}

mentry->current_datatype = *((MPI_Datatype *)(attributevalue));
mentry->datatype_size = my_datatype_size;
retval = FENIX_SUCCESS;
break;
Expand Down
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ add_subdirectory(request_tracking)
add_subdirectory(request_cancelled)
add_subdirectory(no_jump)
add_subdirectory(issend)
add_subdirectory(failed_spares)

0 comments on commit 43452ed

Please sign in to comment.