Linux kernel Vhost-net 和 Virtio-net代码详解

场景

Host上运行qemu kvm虚拟机,其中虚拟机的网卡类型为virtio-net,而Host上virtio-net的backend使用vhost-net。

 

数据包进入虚拟机代码分析

首先看vhost-net模块注册,主要使用linux内核提供的misc设备注册机制(misc_register),这部分开发过linux kernel的人都应该很了解啦

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Misc device descriptor for vhost-net: registered under the name
 * "vhost-net" with minor number VHOST_NET_MINOR, using vhost_net_fops as its
 * file operations.
 */
static struct miscdevice vhost_net_misc = {
     .name = "vhost-net",
     .minor = VHOST_NET_MINOR,
     .fops = &vhost_net_fops,
};
 
/*
 * Module entry point.
 *
 * If experimental_zcopytx is set, zero-copy is first enabled for the TX
 * virtqueue. The misc device is then registered.
 *
 * Returns whatever misc_register() returns.
 */
static int vhost_net_init(void)
{
     int rc;

     if (experimental_zcopytx) {
          vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
     }

     rc = misc_register(&vhost_net_misc);
     return rc;
}
module_init(vhost_net_init);
 
/* Module exit point: deregisters the vhost-net misc device. */
static void vhost_net_exit(void)
{
     misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

/* Module metadata and the aliases under which the module can be found. */
MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");

其中vhost_net_fops表明字符设备支持的用户态接口。字符设备为/dev/vhost-net

1
2
3
4
5
6
7
8
9
10
static  const  struct  file_operations vhost_net_fops = {
     .owner          = THIS_MODULE,
     .release        = vhost_net_release,
     .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
     .compat_ioctl   = vhost_net_compat_ioctl,
#endif
     .open           = vhost_net_open,
     .llseek     = noop_llseek,
};

当用户态使用open系统调用打开该字符设备时,会执行vhost_net_open函数,该函数主要对该字符设备进行初始化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
static  int  vhost_net_open( struct  inode *inode,  struct  file *f)
{
     struct  vhost_net *n = kmalloc( sizeof  *n, GFP_KERNEL);
     struct  vhost_dev *dev;
     struct  vhost_virtqueue **vqs;
     int  r, i;
 
     if  (!n)
         return  -ENOMEM;
     vqs = kmalloc(VHOST_NET_VQ_MAX *  sizeof (*vqs), GFP_KERNEL);
     if  (!vqs) {
         kfree(n);
         return  -ENOMEM;
     }
 
     dev = &n->dev;
     vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
     vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
     n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
     n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
     for  (i = 0; i < VHOST_NET_VQ_MAX; i++) {
         n->vqs[i].ubufs = NULL;
         n->vqs[i].ubuf_info = NULL;
         n->vqs[i].upend_idx = 0;
         n->vqs[i].done_idx = 0;
         n->vqs[i].vhost_hlen = 0;
         n->vqs[i].sock_hlen = 0;
     }
     r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
     if  (r < 0) {
         kfree(n);
         kfree(vqs);
         return  r;
     }
 
     vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
     vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 
     f->private_data = n;
 
     return  0;
}

从上述代码中能够看出vhost-net模块的核心数据结构关系图如下

 

为了获取tap设备的数据包,vhost-net模块注册了该设备的tun socket

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Abridged excerpt of vhost_net_set_backend(): only the statements that
 * attach the backend socket to a virtqueue are shown. The declarations of
 * sock, r and vq, the err_vq label and the return path are not part of this
 * excerpt.
 */
static  long  vhost_net_set_backend( struct  vhost_net *n, unsigned index,  int  fd)
{
 
     /* Look up the socket that belongs to the file descriptor fd. */
     sock = get_socket(fd);
     if  (IS_ERR(sock)) {
         /* get_socket() returned an error pointer: keep its error code. */
         r = PTR_ERR(sock);
         goto  err_vq;
     }
 
         /* handle_rx()/handle_tx() read the socket back from vq->private_data. */
         vq->private_data = sock;
 
}

tun socket的收发包函数为

1
2
3
4
5
/*
 * Operations of the tun socket. vhost-net reaches tun_sendmsg() and
 * tun_recvmsg() through sock->ops->sendmsg() / sock->ops->recvmsg().
 */
static const struct proto_ops tun_socket_ops = {
     .release = tun_release,
     .recvmsg = tun_recvmsg,
     .sendmsg = tun_sendmsg,
};

  

当tap获取到数据包的时候,vhost-net会调用handle_rx

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/*
 * Receive path of vhost-net: moves packets that are pending on the backend
 * socket into buffers of the RX virtqueue.
 *
 * The whole function runs with vq->mutex held and returns immediately when
 * no backend socket is attached (vq->private_data is NULL). Otherwise it
 * loops while peek_head_len() reports a pending packet: it fetches buffers
 * with get_rx_bufs(), reads the packet with sock->ops->recvmsg(), writes
 * the virtio-net header fields where required and returns the buffers with
 * vhost_add_used_and_signal_n(). Once VHOST_NET_WEIGHT bytes have been
 * handled, vhost_poll_queue() is called and the loop stops.
 */
static  void  handle_rx( struct  vhost_net *net)
{
     struct  vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
     struct  vhost_virtqueue *vq = &nvq->vq;
     unsigned uninitialized_var(in),  log ;
     struct  vhost_log *vq_log;
     struct  msghdr msg = {
         .msg_name = NULL,
         .msg_namelen = 0,
         .msg_control = NULL,  /* FIXME: get and handle RX aux data. */
         .msg_controllen = 0,
         .msg_iov = vq->iov,
         .msg_flags = MSG_DONTWAIT,
     };
     /* Header template written in front of the packet when vhost_hlen != 0. */
     struct  virtio_net_hdr_mrg_rxbuf hdr = {
         .hdr.flags = 0,
         .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
     };
     size_t  total_len = 0;
     int  err, mergeable;
     s16 headcount;
     size_t  vhost_hlen, sock_hlen;
     size_t  vhost_len, sock_len;
     struct  socket *sock;
 
     mutex_lock(&vq->mutex);
     /* No backend socket attached: nothing to receive from. */
     sock = vq->private_data;
     if  (!sock)
         goto  out;
     vhost_disable_notify(&net->dev, vq);
 
     vhost_hlen = nvq->vhost_hlen;
     sock_hlen = nvq->sock_hlen;
 
     /* vq_log stays NULL unless the VHOST_F_LOG_ALL feature is set. */
     vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
         vq-> log  : NULL;
     mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
     /* One iteration per packet; peek_head_len() returning 0 ends the loop. */
     while  ((sock_len = peek_head_len(sock->sk))) {
         /* sock_len: bytes to read from the socket; vhost_len: bytes
          * accounted against the guest buffers. */
         sock_len += sock_hlen;
         vhost_len = sock_len + vhost_hlen;
         /* Last argument: UIO_MAXIOV with mergeable RX buffers, otherwise 1
          * (presumably the maximum number of heads - confirm against
          * get_rx_bufs()). */
         headcount = get_rx_bufs(vq, vq->heads, vhost_len,
                     &in, vq_log, & log ,
                     likely(mergeable) ? UIO_MAXIOV : 1);
         /* On error, stop handling until the next kick. */
         if  (unlikely(headcount < 0))
             break ;
         /* On overrun, truncate and discard */
         if  (unlikely(headcount > UIO_MAXIOV)) {
             msg.msg_iovlen = 1;
             err = sock->ops->recvmsg(NULL, sock, &msg,
                          1, MSG_DONTWAIT | MSG_TRUNC);
             pr_debug( "Discarded rx packet: len %zd\n" , sock_len);
             continue ;
         }
         /* OK, now we need to know about added descriptors. */
         if  (!headcount) {
             if  (unlikely(vhost_enable_notify(&net->dev, vq))) {
                 /* They have slipped one in as we were
                  * doing that: check again. */
                 vhost_disable_notify(&net->dev, vq);
                 continue ;
             }
             /* Nothing new?  Wait for eventfd to tell us
              * they refilled. */
             break ;
         }
         /* We don't need to be notified again. */
         if  (unlikely((vhost_hlen)))
             /* Skip header. TODO: support TSO. */
             move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in);
         else
             /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
              * needed because recvmsg can modify msg_iov. */
             copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
         msg.msg_iovlen = in;
         err = sock->ops->recvmsg(NULL, sock, &msg,
                      sock_len, MSG_DONTWAIT | MSG_TRUNC);
         /* Userspace might have consumed the packet meanwhile:
          * it's not supposed to do this usually, but might be hard
          * to prevent. Discard data we got (if any) and keep going. */
         if  (unlikely(err != sock_len)) {
             pr_debug( "Discarded rx packet: "
                  " len %d, expected %zd\n" , err, sock_len);
             vhost_discard_vq_desc(vq, headcount);
             continue ;
         }
         /* When vhost supplies the header itself, write the template. */
         if  (unlikely(vhost_hlen) &&
             memcpy_toiovecend(nvq->hdr, (unsigned  char  *)&hdr, 0,
                       vhost_hlen)) {
             vq_err(vq,  "Unable to write vnet_hdr at addr %p\n" ,
                    vq->iov->iov_base);
             break ;
         }
         /* TODO: Should check and handle checksum. */
 
         /* With mergeable buffers, record the number of heads used in the
          * num_buffers field of the header. */
         hdr.num_buffers = cpu_to_vhost16(vq, headcount);
         if  (likely(mergeable) &&
             memcpy_toiovecend(nvq->hdr, ( void  *)&hdr.num_buffers,
                       offsetof(typeof(hdr), num_buffers),
                       sizeof  hdr.num_buffers)) {
             vq_err(vq,  "Failed num_buffers write" );
             vhost_discard_vq_desc(vq, headcount);
             break ;
         }
         vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                         headcount);
         if  (unlikely(vq_log))
             vhost_log_write(vq, vq_log,  log , vhost_len);
         total_len += vhost_len;
         /* Byte budget reached: call vhost_poll_queue() and leave the loop. */
         if  (unlikely(total_len >= VHOST_NET_WEIGHT)) {
             vhost_poll_queue(&vq->poll);
             break ;
         }
     }
out:
     mutex_unlock(&vq->mutex);
}

从上述代码中能够看出,sock->ops->recvmsg会执行tun socket ops的tun_recvmsg函数,把tap收到的skb放到virtqueue中,而后经过qemu kvm,以中断的形式唤醒virtio-net驱动的收包函数。注意vhost-net的收发包队列与virtio-net的收发包队列是共享的

1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/*
 * NAPI poll callback of a virtio-net receive queue.
 *
 * Takes up to @budget buffers from rq->vq with virtqueue_get_buf() and
 * passes each one to receive_buf(). When rq->num has dropped below half of
 * rq->max, try_fill_recv() is called, and the delayed refill work is
 * scheduled if that call returns false. If the budget was not used up, NAPI
 * polling is completed and the queue is checked once more for buffers that
 * arrived in the meantime.
 *
 * Returns the number of buffers processed.
 */
static  int  virtnet_poll( struct  napi_struct *napi,  int  budget)
{
     struct  receive_queue *rq =
         container_of(napi,  struct  receive_queue, napi);
     struct  virtnet_info *vi = rq->vq->vdev->priv;
     void  *buf;
     unsigned  int  r, len, received = 0;
 
again:
     /* Drain used buffers until the budget is spent or the queue is empty. */
     while  (received < budget &&
            (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
         receive_buf(vi, rq, buf, len);
         --rq->num;
         received++;
     }
 
     /* Refill when fewer than half of rq->max buffers are left; fall back to
      * the delayed refill work if try_fill_recv() returns false. */
     if  (rq->num < rq->max / 2) {
         if  (!try_fill_recv(vi, rq, GFP_ATOMIC))
             schedule_delayed_work(&vi->refill, 0);
     }
 
     /* Out of packets? */
     if  (received < budget) {
         r = virtqueue_enable_cb_prepare(rq->vq);
         napi_complete(napi);
         /* Buffers showed up after napi_complete(): take NAPI ownership
          * again, disable callbacks and resume polling. */
         if  (unlikely(virtqueue_poll(rq->vq, r)) &&
             napi_schedule_prep(napi)) {
             virtqueue_disable_cb(rq->vq);
             __napi_schedule(napi);
             goto  again;
         }
     }
 
     return  received;
}

  

该函数中的receive_buf会调用linux kernel标准的协议栈收包函数netif_receive_skb,至此数据包就经过tap到vhost-net,最终送到了虚拟机中

 

虚拟机向外发包

虚拟机向外发送数据包,首先会走linux 协议栈,协议栈发包最终都会调用网卡的xmit函数,对于

virtio-net网卡其xmit函数为

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/*
 * Transmit entry point of the virtio-net device.
 *
 * Picks the send queue from the skb's queue mapping, frees skbs from
 * earlier transmissions, and queues @skb with xmit_skb(). If xmit_skb()
 * fails, the tx_fifo_errors and tx_dropped counters are incremented and the
 * skb is freed. When fewer than 2+MAX_SKB_FRAGS descriptors remain free,
 * the subqueue is stopped. The virtqueue is kicked unless skb->xmit_more is
 * set and the TX queue is not stopped.
 *
 * Always returns NETDEV_TX_OK.
 */
static  netdev_tx_t start_xmit( struct  sk_buff *skb,  struct  net_device *dev)
{
     struct  virtnet_info *vi = netdev_priv(dev);
     int  qnum = skb_get_queue_mapping(skb);
     struct  send_queue *sq = &vi->sq[qnum];
     int  err;
     struct  netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
     /* Kick right away unless xmit_more is set on the skb. */
     bool  kick = !skb->xmit_more;
 
     /* Free up any pending old buffers before queueing new ones. */
     free_old_xmit_skbs(sq);
 
     /* Try to transmit */
     err = xmit_skb(sq, skb);
 
     /* This should not happen! */
     if  (unlikely(err)) {
         dev->stats.tx_fifo_errors++;
         if  (net_ratelimit())
             dev_warn(&dev->dev,
                  "Unexpected TXQ (%d) queue failure: %d\n" , qnum, err);
         dev->stats.tx_dropped++;
         kfree_skb(skb);
         /* The skb is dropped, yet NETDEV_TX_OK is still returned. */
         return  NETDEV_TX_OK;
     }
 
     /* Don't wait up for transmitted skbs to be freed. */
     skb_orphan(skb);
     nf_reset(skb);
 
     /* Apparently nice girls don't return TX_BUSY; stop the queue
      * before it gets out of hand.  Naturally, this wastes entries. */
     if  (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
         netif_stop_subqueue(dev, qnum);
         if  (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
             /* More just got used, free them then recheck. */
             free_old_xmit_skbs(sq);
             if  (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
                 netif_start_subqueue(dev, qnum);
                 virtqueue_disable_cb(sq->vq);
             }
         }
     }
 
     /* A stopped queue is kicked even when xmit_more was set. */
     if  (kick || netif_xmit_stopped(txq))
         virtqueue_kick(sq->vq);
 
     return  NETDEV_TX_OK;
}

从代码中看就是把skb发到virtqueue中,而后调用virtqueue_kick通知qemu kvm,kvm会把该数据包送给vhost-net,vhost-net会调用handle_tx

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
static  void  handle_tx( struct  vhost_net *net)
{
     struct  vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
     struct  vhost_virtqueue *vq = &nvq->vq;
     unsigned out, in, s;
     int  head;
     struct  msghdr msg = {
         .msg_name = NULL,
         .msg_namelen = 0,
         .msg_control = NULL,
         .msg_controllen = 0,
         .msg_iov = vq->iov,
         .msg_flags = MSG_DONTWAIT,
     };
     size_t  len, total_len = 0;
     int  err;
     size_t  hdr_size;
     struct  socket *sock;
     struct  vhost_net_ubuf_ref *uninitialized_var(ubufs);
     bool  zcopy, zcopy_used;
 
     mutex_lock(&vq->mutex);
     sock = vq->private_data;
     if  (!sock)
         goto  out;
 
     vhost_disable_notify(&net->dev, vq);
 
     hdr_size = nvq->vhost_hlen;
     zcopy = nvq->ubufs;
 
     for  (;;) {
         /* Release DMAs done buffers first */
         if  (zcopy)
             vhost_zerocopy_signal_used(net, vq);
 
         /* If more outstanding DMAs, queue the work.
          * Handle upend_idx wrap around
          */
         if  (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
                   % UIO_MAXIOV == nvq->done_idx))
             break ;
 
         head = vhost_get_vq_desc(vq, vq->iov,
                      ARRAY_SIZE(vq->iov),
                      &out, &in,
                      NULL, NULL);
         /* On error, stop handling until the next kick. */
         if  (unlikely(head < 0))
             break ;
         /* Nothing new?  Wait for eventfd to tell us they refilled. */
         if  (head == vq->num) {
             if  (unlikely(vhost_enable_notify(&net->dev, vq))) {
                 vhost_disable_notify(&net->dev, vq);
                 continue ;
             }
             break ;
         }
         if  (in) {
             vq_err(vq,  "Unexpected descriptor format for TX: "
                    "out %d, int %d\n" , out, in);
             break ;
         }
         /* Skip header. TODO: support TSO. */
         s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
         msg.msg_iovlen = out;
         len = iov_length(vq->iov, out);
         /* Sanity check */
         if  (!len) {
             vq_err(vq,  "Unexpected header len for TX: "
                    "%zd expected %zd\n" ,
                    iov_length(nvq->hdr, s), hdr_size);
             break ;
         }
         zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
                        nvq->upend_idx != nvq->done_idx);
 
         /* use msg_control to pass vhost zerocopy ubuf info to skb */
         if  (zcopy_used) {
             vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
             if  (!vhost_net_tx_select_zcopy(net) ||
                 len < VHOST_GOODCOPY_LEN) {
                 /* copy don't need to wait for DMA done */
                 vq->heads[nvq->upend_idx].len =
                             VHOST_DMA_DONE_LEN;
                 msg.msg_control = NULL;
                 msg.msg_controllen = 0;
                 ubufs = NULL;
             else  {
                 struct  ubuf_info *ubuf;
                 ubuf = nvq->ubuf_info + nvq->upend_idx;
 
                 vq->heads[nvq->upend_idx].len =
                     VHOST_DMA_IN_PROGRESS;
                 ubuf->callback = vhost_zerocopy_callback;
                 ubuf->ctx = nvq->ubufs;
                 ubuf->desc = nvq->upend_idx;
                 msg.msg_control = ubuf;
                 msg.msg_controllen =  sizeof (ubuf);
                 ubufs = nvq->ubufs;
                 atomic_inc(&ubufs->refcount);
             }
             nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
         else
             msg.msg_control = NULL;
         /* TODO: Check specific error and bomb out unless ENOBUFS? */
         err = sock->ops->sendmsg(NULL, sock, &msg, len);
         if  (unlikely(err < 0)) {
             if  (zcopy_used) {
                 if  (ubufs)
                     vhost_net_ubuf_put(ubufs);
                 nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
                     % UIO_MAXIOV;
             }
             vhost_discard_vq_desc(vq, 1);
             break ;
         }
         if  (err != len)
             pr_debug( "Truncated TX packet: "
                  " len %d != %zd\n" , err, len);
         if  (!zcopy_used)
             vhost_add_used_and_signal(&net->dev, vq, head, 0);
         else
             vhost_zerocopy_signal_used(net, vq);
         total_len += len;
         vhost_net_tx_packet(net);
         if  (unlikely(total_len >= VHOST_NET_WEIGHT)) {
             vhost_poll_queue(&vq->poll);
             break ;
         }
     }
out:
     mutex_unlock(&vq->mutex);
}

在该函数中会调用sock->ops->sendmsg,也就是tun_sendmsg,在该函数中最终会调用netif_rx,该函数就是协议栈网卡的收包函数,表明tap网卡已经收到数据包了,而后就能够经过linux协议栈的bridge把数据包发送出去啦

相关文章
相关标签/搜索