epoll 是 Linux 中最常见的 I/O 复用机制,它的高效简洁是另外两个(select 和 poll)不能比拟的。它解决了以前能监视的 fd 数量受限的问题,同时针对 poll 的效率问题做了改进:它让内核保存已经传入的 fd,而不是每次等待时才重新传入并保存;另外它也不是将 current 轮流挂入每个 fd 的等待队列中,而是在设备的等待队列被唤醒时调用一个回调函数。
我们来看看源码:
/*
 * sys_epoll_create - create a new eventpoll file and return its descriptor.
 *
 * @size: must be greater than zero; in this function it is only validated
 *        and printed in the debug trace.
 *
 * Returns the new file descriptor on success. On failure returns -EINVAL
 * (size <= 0) or whatever error ep_getfd() / ep_file_init() reported.
 */
asmlinkage long sys_epoll_create(int size)
{
    int fd;
    int rc = -EINVAL;
    struct inode *ino;
    struct file *filp;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
             current, size));

    /* Sanity check on the size parameter; rc is already -EINVAL. */
    if (size > 0) {
        /*
         * Create everything an eventpoll file needs: a file structure,
         * an inode and a free file descriptor.
         */
        rc = ep_getfd(&fd, &ino, &filp);
        if (!rc) {
            /* Attach the internal "struct eventpoll" to the file. */
            rc = ep_file_init(filp);
            if (!rc) {
                DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
                         current, size, fd));
                return fd;
            }

            /* Initialisation failed: give the descriptor back. */
            sys_close(fd);
        }
    }

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
             current, size, rc));
    return rc;
}
我们再来看看 sys_epoll_ctl():
/*
 * sys_epoll_ctl - add, remove or modify one target fd in an epoll set.
 *
 * @epfd:  descriptor of the epoll file.
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * @fd:    target file descriptor to operate on.
 * @event: user-space event description; copied in only when
 *         EP_OP_HASH_EVENT(op) is true.
 *
 * Returns the result of ep_insert() / ep_remove() / ep_modify(), or:
 *   -EFAULT  the event could not be copied from user space
 *   -EBADF   epfd or fd does not refer to an open file
 *   -EPERM   the target file has no poll operation
 *   -EINVAL  epfd is not an epoll file, epfd and fd refer to the same
 *            file, or op is not one of the three operations
 *   -EEXIST  ADD of an fd that is already in the set
 *   -ENOENT  DEL or MOD of an fd that is not in the set
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
                              struct epoll_event __user *event)
{
    int error;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
             current, epfd, op, fd, event));

    /* Copy the event description from user space. */
    error = -EFAULT;
    if (EP_OP_HASH_EVENT(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto eexit_1;

    /* Get the "struct file *" for the eventpoll file ... */
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto eexit_1;

    /* ... and for the target file. */
    tfile = fget(fd);
    if (!tfile)
        goto eexit_2;

    /* The target file descriptor must support poll. */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto eexit_3;

    /*
     * epfd must really be an epoll file, and an epoll file may not be
     * added to itself.
     */
    error = -EINVAL;
    if (file == tfile || !IS_FILE_EPOLL(file))
        goto eexit_3;

    /* Private data attached to the epoll file. */
    ep = file->private_data;

    down_write(&ep->sem);

    /* Try to look up the target file inside our set. */
    epi = ep_find(ep, tfile, fd);

    /* An op that matches none of the cases below yields -EINVAL. */
    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            /* POLLERR and POLLHUP are always added to the mask. */
            epds.events |= POLLERR | POLLHUP;

            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }

    /*
     * ep_find() calls ep_use_epitem() on the item it returns; release
     * that item here when the lookup succeeded.
     */
    if (epi)
        ep_release_epitem(epi);

    up_write(&ep->sem);

eexit_3:
    fput(tfile);
eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
             current, epfd, op, fd, event, error));

    return error;
}
我们可以再去看看 ep_find 函数:
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; unsigned long flags; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; EP_SET_FFD(&ffd, file, fd); read_lock_irqsave(&ep->lock, flags); for (rbp = ep->rbr.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = EP_CMP_FFD(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { ep_use_epitem(epi); epir = epi; break; } } read_unlock_irqrestore(&ep->lock, flags); DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n", current, file, epir)); return epir; }
基础的看完了,来看看核心的 sys_epoll_wait:
/*
 * sys_epoll_wait - wait for events on an epoll file.
 *
 * @epfd:      descriptor of the epoll file.
 * @events:    user-space array that receives the ready events.
 * @maxevents: capacity of @events, in entries; must be positive.
 * @timeout:   passed through to ep_poll() unchanged.
 *
 * Returns whatever ep_poll() returns, or:
 *   -EINVAL  maxevents is out of range, or epfd is not an epoll file
 *   -EBADF   epfd does not refer to an open file
 *   the verify_area() error when @events is not writable
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                               int maxevents, int timeout)
{
    int error;
    struct file *file;
    struct eventpoll *ep;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
             current, epfd, events, maxevents, timeout));

    /*
     * The maximum number of events must be greater than zero, and small
     * enough that the byte length computed for verify_area() below cannot
     * wrap around. Without the upper bound a huge maxevents could make the
     * product wrap to a small value, so a shorter range would be verified
     * than the one later written to. (~0U >> 1) is the largest positive
     * int value.
     */
    if (maxevents <= 0 ||
        maxevents > (int)((~0U >> 1) / sizeof(struct epoll_event)))
        return -EINVAL;

    /* Verify that the area passed by the user is writeable. */
    if ((error = verify_area(VERIFY_WRITE, events,
                             maxevents * sizeof(struct epoll_event))))
        goto eexit_1;

    /* Get the "struct file *" for the eventpoll file. */
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto eexit_1;

    /* The descriptor must actually refer to an epoll file. */
    error = -EINVAL;
    if (!IS_FILE_EPOLL(file))
        goto eexit_2;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = file->private_data;

    /* Time to fish for events ... */
    error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
             current, epfd, events, maxevents, timeout, error));

    return error;
}
进入 ep_poll:
/*
 * ep_poll - sleep until the ready list is non-empty, then hand events to
 * user space.
 *
 * @ep:        the eventpoll instance.
 * @events:    user-space array that receives the ready events.
 * @maxevents: capacity of @events, in entries.
 * @timeout:   timeout in milliseconds; -1 means wait without limit.
 *
 * Returns the value of ep_events_transfer() when events were available,
 * -EINTR when a signal was pending while waiting, and 0 when the timeout
 * ran out with nothing to report.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;

    /*
     * Calculate the timeout by checking for the "infinite" value ( -1 )
     * and the overflow condition. The passed timeout is in milliseconds,
     * that why (t * HZ) / 1000.
     */
    jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
        MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
    /* Take the write side of ep->lock while inspecting the ready list. */
    write_lock_irqsave(&ep->lock, flags);

    res = 0;
    if (list_empty(&ep->rdllist)) {
        /*
         * Nothing is ready yet. Queue ourselves on ep->wq and sleep
         * until we are woken up (the wakeup source is not part of this
         * function).
         */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&ep->wq, &wait);

        for (;;) {
            /*
             * The task state is set before the checks below, so a
             * wakeup that arrives between the checks and
             * schedule_timeout() is not lost.
             *
             * The loop ends when the ready list is non-empty, the
             * timeout has run out, or a signal is pending.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            /* Drop the lock while sleeping; retake it afterwards. */
            write_unlock_irqrestore(&ep->lock, flags);
            jtimeout = schedule_timeout(jtimeout);
            write_lock_irqsave(&ep->lock, flags);
        }
        remove_wait_queue(&ep->wq, &wait);

        set_current_state(TASK_RUNNING);
    }

    /* Is there anything on the ready list worth transferring? */
    eavail = !list_empty(&ep->rdllist);

    write_unlock_irqrestore(&ep->lock, flags);

    /*
     * Try to transfer events to user space. If that yields 0 events and
     * there is still timeout left, go back and wait again.
     */
    if (!res && eavail &&
        !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
        goto retry;

    return res;
}
而在尝试再次循环的时候有一个函数 ep_events_transfer,这就是向用户返回就绪 fd 的函数:
/*
 * ep_events_transfer - move ready events from the eventpoll instance to
 * the user-space buffer.
 *
 * @ep:        the eventpoll instance.
 * @events:    user-space array that receives the events.
 * @maxevents: passed to ep_collect_ready_items() as the collection limit.
 *
 * Runs with ep->sem read-held. Returns the value of ep_send_events(), or
 * 0 when ep_collect_ready_items() collected nothing.
 */
static int ep_events_transfer(struct eventpoll *ep,
                              struct epoll_event __user *events, int maxevents)
{
    int eventcnt = 0;
    struct list_head txlist;

    INIT_LIST_HEAD(&txlist);

    down_read(&ep->sem);

    /* Collect ready items into the private transfer list "txlist". */
    if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
        /*
         * Send the events on txlist back to the user.
         * NOTE(review): according to the article, ep_send_events()
         * calls the file's poll with a NULL second argument so that
         * only the event mask is returned; that helper is not shown
         * here, so confirm against its source.
         */
        eventcnt = ep_send_events(ep, &txlist, events);

        /*
         * Give some of the txlist items back to the ready list.
         * NOTE(review): according to the article, this is where
         * edge-triggered (EPOLLET) and level-triggered items differ:
         * an item is put back when it is not marked EPOLLET and still
         * reports events the caller asked for, i.e.
         * !(epi->event.events & EPOLLET) &&
         * (epi->revents & epi->event.events).
         * ep_reinject_items() is not shown here, so confirm against
         * its source.
         */
        ep_reinject_items(ep, &txlist);
    }

    up_read(&ep->sem);

    return eventcnt;
}
最后来总结一下 epoll 流程:
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev);
- 第一个参数 epfd 是 epoll 实例的描述符,也就是 epoll_create() 返回的那个;
- 第二个参数 op 是控制字段,由三个宏指定:
  - EPOLL_CTL_ADD:将描述符 fd 添加到 epoll 实例的兴趣列表中去。对于 fd 上我们感兴趣的事件,都指定在 ev 所指向的结构体中。如果我们试图向兴趣列表中添加一个已存在的文件描述符,epoll_ctl() 将出现 EEXIST 错误。
  - EPOLL_CTL_MOD:修改描述符上设定的事件,需要用到由 ev 所指向的结构体中的信息。如果我们试图修改不在兴趣列表中的文件描述符,epoll_ctl() 将出现 ENOENT 错误。
  - EPOLL_CTL_DEL:将文件描述符 fd 从 epfd 的兴趣列表中移除,该操作忽略参数 ev。如果我们试图移除一个不在 epfd 的兴趣列表中的文件描述符,epoll_ctl() 将出现 ENOENT 错误。关闭一个文件描述符会自动将其从所有 epoll 实例的兴趣列表中移除。
- 第三个参数 fd 是要操作的目标文件描述符;
- 第四个参数 ev 是事件结构体指针:

```c
struct epoll_event {
    uint32_t events;   /* 一个位掩码,它指定了我们为待检查的描述符fd上所感兴趣的事件集合 */
    epoll_data_t data; /* User data */
};

typedef union epoll_data {
    void *ptr;    /* Pointer to user-defined data */
    int fd;       /* File descriptor */
    uint32_t u32; /* 32-bit integer */
    uint64_t u64; /* 64-bit integer */
} epoll_data_t;
```

- 而这个 event.events 可设置为:
  + EPOLLIN:可读取非高优先级的数据
  + EPOLLPRI:可读取高优先级的数据
  + EPOLLRDHUP:套接字对端关闭
  + EPOLLOUT:可写事件
  + EPOLLET:采用边缘触发(ET)模式
  + EPOLLONESHOT:事件通知完成后停用,即一次性监视
  + EPOLLERR:错误事件
  + EPOLLHUP:出现挂断
int epoll_wait(int epfd, struct epoll_event *evlist, int maxevents, int timeout);
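参数 timeout 用来确定 epoll_wait() 的阻塞行为,有如下几种:
- timeout 等于 -1:一直阻塞,直到兴趣列表中有描述符就绪,或者被信号中断;
- timeout 等于 0:不阻塞,只检查当前是否有就绪的描述符,然后立即返回;
- timeout 大于 0:最多阻塞 timeout 毫秒,直到有描述符就绪,或者被信号中断。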
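返回值:调用成功时返回就绪事件的个数(即写入 evlist 的元素个数,不超过 maxevents);在超时时间内没有任何描述符就绪时返回 0;出错时返回 -1 并设置 errno,例如等待期间被信号中断时 errno 为 EINTR。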
参考博客:
https://blog.csdn.net/huangjh...
https://blog.csdn.net/shansha...