epoll函数源码剖析

时间 2019-11-06

标签 epoll 函数源码剖析繁體版

原文原文链接

I/O复用函数

epoll是Linux中最常见的I/O复用函数，它的高效简洁是其他两个（select、poll）不能比拟的。它解决了之前的大量fd问题，同时针对poll的效率问题做了改进：它利用内核去保存传入的fd，而不是像poll那样在每次等待时才保存传入的fd；另外它也不是将current轮流挂入每个fd的等待队列中，而是在设备的等待队列被唤醒时调用一个回调函数。

我们来看看源码：

/*
 * sys_epoll_create - create a new eventpoll file and hand back its descriptor.
 *
 * @size: caller-supplied size value; only checked for being positive here.
 *
 * Returns the new file descriptor on success. On failure returns -EINVAL
 * when @size is not positive, or whatever error ep_getfd()/ep_file_init()
 * reported. If initialisation of the file fails after the descriptor has
 * been obtained, the descriptor is closed again before returning.
 */
asmlinkage long sys_epoll_create(int size)
{
    struct file *epfile;
    struct inode *epinode;
    int newfd;
    int rc = -EINVAL;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
             current, size));

    /* Only a strictly positive size is accepted. */
    if (size > 0) {
        /* Obtain the file structure, inode and free descriptor in one go. */
        rc = ep_getfd(&newfd, &epinode, &epfile);
        if (!rc) {
            /* Attach the "struct eventpoll" private data to the file. */
            rc = ep_file_init(epfile);
            if (rc)
                sys_close(newfd);   /* undo ep_getfd() on failure */
            else
                rc = newfd;         /* success: report the descriptor */
        }
    }

    /* Same trace line for both outcomes: rc is the fd or the error code. */
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
             current, size, rc));

    return rc;
}

epoll_create在调用时创建了新的inode、file、fd。Linux遵循一切皆文件的原则，一切都是文件操作，返回的也是一个fd。这样做还有一个好处：仅凭指针的指向并不好判断资源的有效性，但是fd就可以通过current->files->fd_array[]找到。

我们再来看看sys_epoll_ctl()：

/*
 * sys_epoll_ctl - add, remove or modify a watched file on an eventpoll file.
 *
 * @epfd:  descriptor of the eventpoll file.
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * @fd:    descriptor of the target file.
 * @event: user-space event description; copied in only when
 *         EP_OP_HASH_EVENT(op) is true.
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or:
 *   -EFAULT  the event could not be copied from user space,
 *   -EBADF   @epfd or @fd does not resolve to an open file,
 *   -EPERM   the target file has no poll operation,
 *   -EINVAL  @epfd and @fd are the same file, @epfd is not an eventpoll
 *            file, or @op is not one of the three known operations,
 *   -EEXIST  ADD of a file that is already registered,
 *   -ENOENT  DEL/MOD of a file that is not registered.
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
    struct epoll_event evcopy;
    struct eventpoll *ep;
    struct epitem *item;
    struct file *epfile, *target;
    int rc;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
             current, epfd, op, fd, event));

    /* Bring the event description in from user space when the op needs it. */
    rc = -EFAULT;
    if (EP_OP_HASH_EVENT(op) &&
        copy_from_user(&evcopy, event, sizeof(struct epoll_event)))
        goto out;

    /* Take references on both the eventpoll file and the target file. */
    rc = -EBADF;
    epfile = fget(epfd);
    if (!epfile)
        goto out;
    target = fget(fd);
    if (!target)
        goto out_put_epfile;

    /* The target must provide a poll operation. */
    rc = -EPERM;
    if (!(target->f_op && target->f_op->poll))
        goto out_put_target;

    /* An eventpoll file may not watch itself, and epfd must be one. */
    rc = -EINVAL;
    if (epfile == target || !IS_FILE_EPOLL(epfile))
        goto out_put_target;

    /* Private data installed when the eventpoll file was created. */
    ep = epfile->private_data;

    down_write(&ep->sem);

    /* Look the (file, fd) pair up among the items already registered. */
    item = ep_find(ep, target, fd);

    switch (op) {
    case EPOLL_CTL_ADD:
        if (item) {
            rc = -EEXIST;
            break;
        }
        /* Error and hang-up conditions are always reported. */
        evcopy.events |= POLLERR | POLLHUP;
        rc = ep_insert(ep, &evcopy, target, fd);
        break;
    case EPOLL_CTL_DEL:
        rc = item ? ep_remove(ep, item) : -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (!item) {
            rc = -ENOENT;
            break;
        }
        evcopy.events |= POLLERR | POLLHUP;
        rc = ep_modify(ep, item, &evcopy);
        break;
    default:
        /* Unknown operation. */
        rc = -EINVAL;
        break;
    }

    /* Release the item handed back by ep_find(), if there was one. */
    if (item)
        ep_release_epitem(item);

    up_write(&ep->sem);

out_put_target:
    fput(target);
out_put_epfile:
    fput(epfile);
out:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
             current, epfd, op, fd, event, rc));

    return rc;
}

如果操作是ADD，那么先在eventpoll中进行ep_find()，如果没有找到epitem就进行添加，找到的话就返回EEXIST；
如果是DEL，那么也是先进行查找，找到的话执行ep_remove()，否则返回ENOENT；
如果操作是MOD，那么查找之后不存在就返回ENOENT，存在的话就对事件进行修改；

我们可以再去看看ep_find：

static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
   int kcmp;
   unsigned long flags;
   struct rb_node *rbp;
   struct epitem *epi, *epir = NULL;
   struct epoll_filefd ffd;

   EP_SET_FFD(&ffd, file, fd);
   read_lock_irqsave(&ep->lock, flags);
   for (rbp = ep->rbr.rb_node; rbp; ) {
       epi = rb_entry(rbp, struct epitem, rbn);
       kcmp = EP_CMP_FFD(&ffd, &epi->ffd);
       if (kcmp > 0)
           rbp = rbp->rb_right;
       else if (kcmp < 0)
           rbp = rbp->rb_left;
       else {
           ep_use_epitem(epi);
           epir = epi;
           break;
       }
   }
   read_unlock_irqrestore(&ep->lock, flags);

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
            current, file, epir));

   return epir;
}

可以看出ep_find主要就是在eventpoll中储存文件描述符信息的红黑树里查找指定fd对应的epitem。

基础的看完了，来看看核心的sys_epoll_wait：

/*
 * sys_epoll_wait - wait for events on an eventpoll file.
 *
 * @epfd:      descriptor of the eventpoll file.
 * @events:    user-space array that receives the ready events.
 * @maxevents: capacity of @events, in entries.
 * @timeout:   timeout passed through to ep_poll().
 *
 * Returns the result of ep_poll(), or:
 *   -EINVAL  @maxevents is not positive, is so large that the byte size of
 *            the @events array would overflow, or @epfd is not an
 *            eventpoll file,
 *   -EBADF   @epfd does not resolve to an open file,
 *   or the error reported by verify_area() when @events is not writable.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                   int maxevents, int timeout)
{
    int error;
    struct file *file;
    struct eventpoll *ep;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
             current, epfd, events, maxevents, timeout));

    /*
     * The event count must be positive. It must also be bounded so that
     * "maxevents * sizeof(struct epoll_event)" below cannot wrap around:
     * otherwise the writability check would cover a truncated range while
     * later code still trusts maxevents. (~0U >> 1) is the largest int.
     */
    if (maxevents <= 0 ||
        maxevents > (int)((~0U >> 1) / sizeof(struct epoll_event)))
        return -EINVAL;

    /* Make sure the caller may write the whole result array. */
    error = verify_area(VERIFY_WRITE, events,
                maxevents * sizeof(struct epoll_event));
    if (error)
        goto eexit_1;

    /* Resolve epfd to its struct file (takes a reference). */
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto eexit_1;

    /* The file behind epfd must be an eventpoll file. */
    error = -EINVAL;
    if (!IS_FILE_EPOLL(file))
        goto eexit_2;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = file->private_data;

    /* Do the actual wait / event collection. */
    error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
             current, epfd, events, maxevents, timeout, error));

    return error;
}

进入ep_poll：

/*
 * ep_poll - wait until the ready list has entries, then transfer them.
 *
 * @ep:        the eventpoll instance to wait on.
 * @events:    user-space array handed on to ep_events_transfer().
 * @maxevents: capacity of @events, handed on to ep_events_transfer().
 * @timeout:   timeout in milliseconds; -1 means wait without limit.
 *
 * Returns the value produced by ep_events_transfer(), 0 when the wait ended
 * with nothing on the ready list, or -EINTR when a signal was pending
 * while waiting.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;

    /*
     * Calculate the timeout by checking for the "infinite" value ( -1 )
     * and the overflow condition. The passed timeout is in milliseconds,
     * that why (t * HZ) / 1000.
     */
    jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
        MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

    /* Retry point: we come back here when a transfer delivered nothing. */
retry:
    write_lock_irqsave(&ep->lock, flags);// acquire ep->lock as a writer

    res = 0;
    if (list_empty(&ep->rdllist)) {
        /*
         * The ready list is empty: put the current task on ep->wq and
         * sleep until something wakes it up.
         * NOTE(review): presumably the wake-up comes from the poll
         * callback installed at insert time - not visible in this chunk.
         */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&ep->wq, &wait);

        for (;;) {
            /*
             * The task state is set before the conditions are checked.
             * The loop ends when the ready list is non-empty, when the
             * timeout has run out, or when a signal is pending.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            /* Drop the lock across the sleep; keep the remaining time. */
            write_unlock_irqrestore(&ep->lock, flags);
            jtimeout = schedule_timeout(jtimeout);
            write_lock_irqsave(&ep->lock, flags);
        }
        remove_wait_queue(&ep->wq, &wait);

        set_current_state(TASK_RUNNING);
    }

    /* Sample, still under the lock, whether the ready list has entries. */
    eavail = !list_empty(&ep->rdllist);

    write_unlock_irqrestore(&ep->lock, flags);

    /*
     * With no error and entries available, transfer them to the caller.
     * If the transfer delivers nothing and there is time left, wait again.
     */
    if (!res && eavail &&
        !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
        goto retry;

    return res;
}

而负责让就绪链表不再为空的，是ep_insert中通过init_poll_funcptr(&epq.pt, ep_ptable_queue_proc)设置的回调机制。ep_insert主要就是申请了一个eppoll_entry并且设置回调函数。相对于poll每次调用都要挂入等待队列，epoll只是在epoll_ctl的时候挂载一次，然后通过回调函数收集就绪的fd。

而在尝试再次循环的时候调用了函数ep_events_transfer，它就是向用户返回就绪fd的函数：

/*
 * ep_events_transfer - move ready events to the caller's buffer.
 *
 * @ep:        the eventpoll instance.
 * @events:    user-space array that receives the events.
 * @maxevents: upper bound handed to ep_collect_ready_items().
 *
 * Holds ep->sem for reading for the whole operation. Returns the value of
 * ep_send_events(), or 0 when ep_collect_ready_items() gathered nothing.
 */
static int ep_events_transfer(struct eventpoll *ep, struct epoll_event __user *events,
                              int maxevents)
{
    struct list_head pending;
    int sent = 0;

    INIT_LIST_HEAD(&pending);

    down_read(&ep->sem);

    /*
     * Gather ready items onto the private list.
     * NOTE(review): per the article, the helper moves items from the
     * ready list onto this list; the helper itself is not shown here.
     */
    if (ep_collect_ready_items(ep, &pending, maxevents) <= 0)
        goto unlock;

    /* Deliver the gathered items to the user-space array. */
    sent = ep_send_events(ep, &pending, events);

    /*
     * Hand items back from the private list.
     * NOTE(review): per the article, items that are not edge-triggered
     * (EPOLLET clear) and still have requested events pending go back on
     * the ready list - confirm against ep_reinject_items().
     */
    ep_reinject_items(ep, &pending);

unlock:
    up_read(&ep->sem);

    return sent;
}

最后来总结一下epoll流程：

int epoll_create(int size); //这个size在高版本中已经弃用，返回的是注册的文件系统fd

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev);

- 第一个参数是epoll的fd，也就是epoll_create返回的那个；
- 第二个则是控制字段，由三个宏实现：
  - EPOLL_CTL_ADD：将描述符fd添加到epoll实例的兴趣列表中去。对于fd上我们感兴趣的事件，都指定在ev所指向的结构体中。如果我们试图向兴趣列表中添加一个已存在的文件描述符，epoll_ctl()将出现EEXIST错误。
  - EPOLL_CTL_MOD：修改描述符上设定的事件，需要用到由ev所指向的结构体中的信息。如果我们试图修改不在兴趣列表中的文件描述符，epoll_ctl()将出现ENOENT错误。
  - EPOLL_CTL_DEL：将文件描述符fd从epfd的兴趣列表中移除，该操作忽略参数ev。如果我们试图移除一个不在epfd的兴趣列表中的文件描述符，epoll_ctl()将出现ENOENT错误。关闭一个文件描述符会自动将其从所有的epoll实例的兴趣列表中移除。
- 第三个则是事件fd
- 第四个是事件结构体指针
  -   
      /* Per-descriptor user data: the caller picks ONE member to use. */  
      typedef union epoll_data  
      {  
            void        *ptr;   /* Pointer to user-defined data */  
            int     fd; /* File descriptor */  
            uint32_t    u32;    /* 32-bit integer */  
            uint64_t    u64;    /* 64-bit integer */  
      }epoll_data_t;  
      /* Event description passed to epoll_ctl() and filled in by epoll_wait(). */  
      /* epoll_data_t is declared above so that it is known before its use here. */  
      struct epoll_event  
      {  
            uint32_t    events; /* Bit mask of the events of interest on the monitored descriptor fd */  
            epoll_data_t    data;   /* User data */  
      };  
  - 而这个event.events可设置为
    + EPOLLIN         可读次优先级事件
    + EPOLLPRI        可读高优先级事件
    + EPOLLRDHUP     套接字对端关闭
    + EPOLLOUT 可写事件
    + EPOLLET ET模式
    + EPOLLONESHOT 完成事件后停用，一次性
    + EPOLLERR 错误事件
    + EPOLLHUP 出现挂断

int epoll_wait(int epfd, struct epoll_event *evlist, int maxevents, int timeout);
- 参数timeout用来确定epoll_wait()的阻塞行为，有如下几种。
  - 如果timeout等于-1，调用将一直阻塞，直到兴趣列表中的文件描述符上有事件产生或者直到捕获到一个信号为止。
  - 如果timeout等于0，执行一次非阻塞式的检查，看兴趣列表中的描述符上产生了哪些事件。
  - 如果timeout大于0，调用将阻塞至多timeout毫秒，直到文件描述符上有事件发生，或者直到捕获到一个信号为止。
- 返回值：
  - 成功返回就绪态文件描述符数目
  - 超时返回0
  - 若出错返回-1

参考博客：
https://blog.csdn.net/huangjh...
https://blog.csdn.net/shansha...