diff --git a/criu/files.c b/criu/files.c index a57fb860fb..31e705bcc5 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1811,11 +1811,6 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); - - if (init_dead_pidfd_hash()) { - pr_err("Could not initialise hash map for dead pidfds\n"); - return -1; - } - + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h index 4d2d71700e..bcc0fb45ab 100644 --- a/criu/include/pidfd.h +++ b/criu/include/pidfd.h @@ -7,7 +7,7 @@ extern const struct fdtype_ops pidfd_dump_ops; extern struct collect_image_info pidfd_cinfo; extern int is_pidfd_link(char *link); -extern int init_dead_pidfd_hash(void); +extern void init_dead_pidfd_hash(void); struct pidfd_dump_info { PidfdEntry pidfe; pid_t pid; diff --git a/criu/pidfd.c b/criu/pidfd.c index 3ea3c93094..53b9bcf71a 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -21,32 +21,26 @@ struct pidfd_info { PidfdEntry *pidfe; struct file_desc d; + + struct dead_pidfd *dead; + struct pidfd_info *next; }; struct dead_pidfd { unsigned int ino; - int pid; - size_t count; - mutex_t pidfd_lock; + int creator_id; + struct hlist_node hash; + struct pidfd_info *list; }; #define DEAD_PIDFD_HASH_SIZE 32 static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; -static mutex_t *dead_pidfd_hash_lock; -int init_dead_pidfd_hash(void) +void init_dead_pidfd_hash(void) { for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) INIT_HLIST_HEAD(&dead_pidfd_hash[i]); - - dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); - if (!dead_pidfd_hash_lock) - return -1; - - mutex_init(dead_pidfd_hash_lock); - - return 0; } static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) @@ -54,15 +48,12 @@ static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) struct dead_pidfd *dead; struct hlist_head *chain; - mutex_lock(dead_pidfd_hash_lock); chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; hlist_for_each_entry(dead, chain, hash) { if (dead->ino == ino) { - mutex_unlock(dead_pidfd_hash_lock); return dead; } } - mutex_unlock(dead_pidfd_hash_lock); return NULL; } @@ -142,7 +133,7 @@ static int create_tmp_process(void) return tmp_process; } -static int free_dead_pidfd(struct dead_pidfd *dead) +static int kill_helper(pid_t pid) { int status; sigset_t blockmask, oldmask; @@ -160,15 +151,13 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - if (kill(dead->pid, SIGKILL) < 0) { - pr_perror("Could not kill temporary process with pid: %d", - dead->pid); + if (kill(pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", pid); goto err; } - if (waitpid(dead->pid, &status, 0) != dead->pid) { - pr_perror("Could not wait on temporary process with pid: %d", - dead->pid); + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Could not wait on temporary process with pid: %d", pid); goto err; } @@ -188,9 +177,6 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - mutex_lock(dead_pidfd_hash_lock); - hlist_del(&dead->hash); - mutex_unlock(dead_pidfd_hash_lock); return 0; err: return -1; @@ -198,8 +184,9 @@ static int free_dead_pidfd(struct dead_pidfd *dead) static int open_one_pidfd(struct file_desc *d, int *new_fd) { - struct pidfd_info *info; + struct pidfd_info *info, *child; struct dead_pidfd *dead = NULL; + pid_t pid; int pidfd; info = container_of(d, struct pidfd_info, d); @@ -215,34 +202,44 @@ static int open_one_pidfd(struct file_desc *d, int *new_fd) dead = lookup_dead_pidfd(info->pidfe->ino); BUG_ON(!dead); - mutex_lock(&dead->pidfd_lock); - BUG_ON(dead->count == 0); - dead->count--; - if (dead->pid == -1) { - dead->pid = create_tmp_process(); - if (dead->pid < 0) { - mutex_unlock(&dead->pidfd_lock); - goto err_close; + if (info->dead && info->dead->creator_id != info->pidfe->id) { + int ret = recv_desc_from_peer(&info->d, &pidfd); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd\n"); + return ret; } + goto out; } - pidfd = pidfd_open(dead->pid, info->pidfe->flags); - if (pidfd < 0) { - pr_perror("Could not open pidfd for %d", info->pidfe->nspid); - mutex_unlock(&dead->pidfd_lock); + pid = create_tmp_process(); + if (pid < 0) goto err_close; - } - if (dead->count == 0) { - if (free_dead_pidfd(dead)) { - pr_err("Failed to delete dead_pidfd struct\n"); - mutex_unlock(&dead->pidfd_lock); - close(pidfd); + for (child = dead->list; child; child = child->next) { + if (child == info) + continue; + pidfd = pidfd_open(pid, child->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", child->pidfe->nspid); goto err_close; } + + if (send_desc_to_peer(pidfd, &child->d)) { + pr_perror("Can't send file descriptor"); + close(pidfd); + return -1; + } + close(pidfd); } - mutex_unlock(&dead->pidfd_lock); + pidfd = pidfd_open(pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + if (kill_helper(pid)) + goto err_close; out: if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { goto err_close; @@ -269,32 +266,31 @@ static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) info->pidfe = pb_msg(msg, PidfdEntry); pr_info_pidfd("Collected ", info->pidfe); + info->dead = NULL; if (info->pidfe->nspid != -1) goto out; dead = lookup_dead_pidfd(info->pidfe->ino); - if (dead) { - mutex_lock(&dead->pidfd_lock); - dead->count++; - mutex_unlock(&dead->pidfd_lock); - goto out; - } - - dead = shmalloc(sizeof(*dead)); if (!dead) { - pr_err("Could not allocate shared memory..\n"); - return -1; + dead = xmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->list = NULL; + dead->ino = info->pidfe->ino; + dead->creator_id = info->pidfe->id; + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); } - INIT_HLIST_NODE(&dead->hash); - dead->ino = info->pidfe->ino; - dead->count = 1; - dead->pid = -1; - mutex_init(&dead->pidfd_lock); + info->dead = dead; + info->next = dead->list; + dead->list = info; + if (dead->creator_id > info->pidfe->id) + dead->creator_id = info->pidfe->id; - mutex_lock(dead_pidfd_hash_lock); - hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); - mutex_unlock(dead_pidfd_hash_lock); out: return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); } diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 44ac64fe57..71a1b6a535 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -56,6 +56,7 @@ TST_NOFILE := \ pidfd_self \ pidfd_of_thread \ pidfd_dead \ + pidfd_diffdead \ pidfd_child \ pidfd_kill \ fd_from_pidfd \ diff --git a/test/zdtm/static/pidfd_diffdead.c b/test/zdtm/static/pidfd_diffdead.c new file mode 100644 index 0000000000..5bc1911a51 --- /dev/null +++ b/test/zdtm/static/pidfd_diffdead.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of processes that point to a common dead pidfd\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main and child open a pidfd for grandchild. + * Before C/R we kill grandchild. + * We end up with two pidfds in two diff processes that point to the same dead process. + */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char *argv[]) +{ +#define READ 0 +#define WRITE 1 + + int child, ret, gchild, status; + struct statx stat; + task_waiter_t t; + unsigned long long ino; + + /* + * We use the inop pipe to send the inode number of the + * pidfd opened in the child to the main process for + * comparison. + */ + int p[2]; + int pidfd; + + test_init(argc, argv); + task_waiter_init(&t); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild; + gchild = test_fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[READ]); + close(p[WRITE]); + while (1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + task_waiter_complete(&t, 1); + + test_waitsig(); + + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &ino, sizeof(ino)) != sizeof(ino)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + close(pidfd); + + /* ino number should be same because both pidfds were for the same process */ + if (ino != stat.stx_ino) { + exit(1); + } + exit(0); + } + } + + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + /* + * We kill grandchild process only after opening pidfd. + */ + if (pidfd_send_signal(pidfd, SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + return 1; + } + + /* Wait for child to waitpid on gchild */ + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + close(p[READ]); + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + goto err; + } + + /* Send inode number of pidfd to child for comparison */ + if (write(p[WRITE], &stat.stx_ino, sizeof(stat.stx_ino)) != sizeof(stat.stx_ino)) { + pr_perror("write"); + goto err; + } + close(p[WRITE]); + + if (kill(child, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto err; + } + + if (!WIFEXITED(status)) { + fail("Expected child to terminate normally"); + goto err; + } + + if (WEXITSTATUS(status) != 0) { + fail("Child failed"); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +}