A QEMU/KVM virtual machine runs on the host; the guest NIC is of type virtio-net, and the virtio-net backend on the host is vhost-net.
First, look at how the vhost-net module registers itself. It uses the misc character-device registration mechanism provided by the Linux kernel, which anyone who has worked on the Linux kernel should be familiar with:
static struct miscdevice vhost_net_misc = {
    .minor = VHOST_NET_MINOR,
    .name = "vhost-net",
    .fops = &vhost_net_fops,
};

static int vhost_net_init(void)
{
    if (experimental_zcopytx)
        vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
    return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
    misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");
Here vhost_net_fops describes the userspace interface supported by the character device; the device node is /dev/vhost-net:
static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
    .llseek         = noop_llseek,
};
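For reference, here is a minimal userspace sketch of how a VMM such as QEMU would typically start using this device. This is an assumed flow written for illustration, not QEMU's actual code; error paths and the later memory-table and vring setup ioctls are omitted.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int open_vhost_net(void)
{
    /* open() lands in vhost_net_open() shown below */
    int vhost_fd = open("/dev/vhost-net", O_RDWR);
    if (vhost_fd < 0)
        return -1;
    /* Tie the device to the current process/mm before any further setup */
    if (ioctl(vhost_fd, VHOST_SET_OWNER) < 0) {
        close(vhost_fd);
        return -1;
    }
    return vhost_fd;
}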
When userspace opens the device with the open() system call, vhost_net_open is executed; it mainly initializes the per-open state of the character device:
static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    struct vhost_virtqueue **vqs;
    int r, i;

    if (!n)
        return -ENOMEM;
    vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
    if (!vqs) {
        kfree(n);
        return -ENOMEM;
    }

    dev = &n->dev;
    vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
    vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
    n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
    n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
    for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
        n->vqs[i].ubufs = NULL;
        n->vqs[i].ubuf_info = NULL;
        n->vqs[i].upend_idx = 0;
        n->vqs[i].done_idx = 0;
        n->vqs[i].vhost_hlen = 0;
        n->vqs[i].sock_hlen = 0;
    }
    r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
    if (r < 0) {
        kfree(n);
        kfree(vqs);
        return r;
    }

    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);

    f->private_data = n;

    return 0;
}
The code above shows how the core data structures of the vhost-net module relate to each other: a struct vhost_net embeds a struct vhost_dev plus one struct vhost_net_virtqueue per queue, and each vhost_net_virtqueue wraps a struct vhost_virtqueue.
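A trimmed sketch of these structures follows; the field lists are abridged to the members touched by the code above, so see drivers/vhost/net.c and drivers/vhost/vhost.h for the real definitions.

struct vhost_net_virtqueue {
    struct vhost_virtqueue vq;        /* the generic vhost virtqueue */
    size_t vhost_hlen;                /* virtio-net header length seen by the guest */
    size_t sock_hlen;                 /* header length expected by the backend socket */
    /* zero-copy TX bookkeeping */
    int upend_idx;
    int done_idx;
    struct ubuf_info *ubuf_info;
    struct vhost_net_ubuf_ref *ubufs;
};

struct vhost_net {
    struct vhost_dev dev;                              /* generic vhost device */
    struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];  /* RX and TX queues */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];          /* polls the backend socket */
};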
To pull packets off the tap device, vhost-net attaches that device's tun socket as the backend of each virtqueue; this is done in vhost_net_set_backend (abridged):
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
    /* ... */
    sock = get_socket(fd);
    if (IS_ERR(sock)) {
        r = PTR_ERR(sock);
        goto err_vq;
    }
    /* ... */
    vq->private_data = sock;
    /* ... */
}
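From userspace the backend is set with the VHOST_NET_SET_BACKEND ioctl. A hedged sketch of that step (assumed flow, not QEMU's actual code; tap_fd is taken to be an already-configured /dev/net/tun descriptor, and the queue indices follow vhost-net's convention of 0 = RX, 1 = TX):

#include <sys/ioctl.h>
#include <linux/vhost.h>

int set_tap_backend(int vhost_fd, int tap_fd)
{
    struct vhost_vring_file backend;
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {   /* RX queue and TX queue */
        backend.index = idx;
        backend.fd = tap_fd;
        /* lands in vhost_net_set_backend() shown above */
        if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0)
            return -1;
    }
    return 0;
}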
The send and receive functions of the tun socket are:
static const struct proto_ops tun_socket_ops = {
    .sendmsg = tun_sendmsg,
    .recvmsg = tun_recvmsg,
    .release = tun_release,
};
When the tap device receives a packet, vhost-net calls handle_rx:
static void handle_rx(struct vhost_net *net)
{
    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
    struct vhost_virtqueue *vq = &nvq->vq;
    unsigned uninitialized_var(in), log;
    struct vhost_log *vq_log;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL, /* FIXME: get and handle RX aux data. */
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    struct virtio_net_hdr_mrg_rxbuf hdr = {
        .hdr.flags = 0,
        .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
    };
    size_t total_len = 0;
    int err, mergeable;
    s16 headcount;
    size_t vhost_hlen, sock_hlen;
    size_t vhost_len, sock_len;
    struct socket *sock;

    mutex_lock(&vq->mutex);
    sock = vq->private_data;
    if (!sock)
        goto out;
    vhost_disable_notify(&net->dev, vq);

    vhost_hlen = nvq->vhost_hlen;
    sock_hlen = nvq->sock_hlen;

    vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
        vq->log : NULL;
    mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

    while ((sock_len = peek_head_len(sock->sk))) {
        sock_len += sock_hlen;
        vhost_len = sock_len + vhost_hlen;
        headcount = get_rx_bufs(vq, vq->heads, vhost_len,
                                &in, vq_log, &log,
                                likely(mergeable) ? UIO_MAXIOV : 1);
        /* On error, stop handling until the next kick. */
        if (unlikely(headcount < 0))
            break;
        /* On overrun, truncate and discard */
        if (unlikely(headcount > UIO_MAXIOV)) {
            msg.msg_iovlen = 1;
            err = sock->ops->recvmsg(NULL, sock, &msg,
                                     1, MSG_DONTWAIT | MSG_TRUNC);
            pr_debug("Discarded rx packet: len %zd\n", sock_len);
            continue;
        }
        /* OK, now we need to know about added descriptors. */
        if (!headcount) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                /* They have slipped one in as we were
                 * doing that: check again. */
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            /* Nothing new?  Wait for eventfd to tell us
             * they refilled. */
            break;
        }
        /* We don't need to be notified again. */
        if (unlikely((vhost_hlen)))
            /* Skip header. TODO: support TSO. */
            move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in);
        else
            /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
             * needed because recvmsg can modify msg_iov. */
            copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
        msg.msg_iovlen = in;
        err = sock->ops->recvmsg(NULL, sock, &msg,
                                 sock_len, MSG_DONTWAIT | MSG_TRUNC);
        /* Userspace might have consumed the packet meanwhile:
         * it's not supposed to do this usually, but might be hard
         * to prevent. Discard data we got (if any) and keep going. */
        if (unlikely(err != sock_len)) {
            pr_debug("Discarded rx packet: "
                     " len %d, expected %zd\n", err, sock_len);
            vhost_discard_vq_desc(vq, headcount);
            continue;
        }
        if (unlikely(vhost_hlen) &&
            memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0,
                              vhost_hlen)) {
            vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
                   vq->iov->iov_base);
            break;
        }
        /* TODO: Should check and handle checksum. */

        hdr.num_buffers = cpu_to_vhost16(vq, headcount);
        if (likely(mergeable) &&
            memcpy_toiovecend(nvq->hdr, (void *)&hdr.num_buffers,
                              offsetof(typeof(hdr), num_buffers),
                              sizeof hdr.num_buffers)) {
            vq_err(vq, "Failed num_buffers write");
            vhost_discard_vq_desc(vq, headcount);
            break;
        }
        vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                                    headcount);
        if (unlikely(vq_log))
            vhost_log_write(vq, vq_log, log, vhost_len);
        total_len += vhost_len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll);
            break;
        }
    }
out:
    mutex_unlock(&vq->mutex);
}
As the code shows, sock->ops->recvmsg resolves to tun_recvmsg in tun_socket_ops: it copies the skb received on the tap device into the buffers described by the virtqueue. The guest's virtio-net driver is then woken up, via QEMU/KVM, in the form of a virtio interrupt so that its receive path can run. Note that the RX/TX queues of vhost-net and of the guest's virtio-net driver are shared: both sides operate on the same virtqueues.
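The "interrupt" is delivered through the virtqueue's call eventfd, which KVM has wired up as an irqfd. A simplified sketch of the signalling step that vhost_add_used_and_signal_n() ends up performing, condensed from vhost_signal() in drivers/vhost/vhost.c (the name vhost_signal_sketch just marks it as a sketch):

/* Kick the guest unless it asked us not to; event-index / interrupt
 * suppression is decided inside vhost_notify(). */
static void vhost_signal_sketch(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
    if (vq->call_ctx && vhost_notify(dev, vq))
        eventfd_signal(vq->call_ctx, 1);   /* KVM irqfd -> guest virtio interrupt */
}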
On the guest side, the interrupt eventually makes NAPI run the virtio-net driver's poll routine, virtnet_poll, which pulls the filled buffers back out of the virtqueue:
static int virtnet_poll(struct napi_struct *napi, int budget)
{
    struct receive_queue *rq =
        container_of(napi, struct receive_queue, napi);
    struct virtnet_info *vi = rq->vq->vdev->priv;
    void *buf;
    unsigned int r, len, received = 0;

again:
    while (received < budget &&
           (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
        receive_buf(vi, rq, buf, len);
        --rq->num;
        received++;
    }

    if (rq->num < rq->max / 2) {
        if (!try_fill_recv(vi, rq, GFP_ATOMIC))
            schedule_delayed_work(&vi->refill, 0);
    }

    /* Out of packets? */
    if (received < budget) {
        r = virtqueue_enable_cb_prepare(rq->vq);
        napi_complete(napi);
        if (unlikely(virtqueue_poll(rq->vq, r)) &&
            napi_schedule_prep(napi)) {
            virtqueue_disable_cb(rq->vq);
            __napi_schedule(napi);
            goto again;
        }
    }

    return received;
}
In this function, receive_buf calls netif_receive_skb, the standard receive entry point of the Linux network stack. At this point the packet has traveled from the tap device through vhost-net and finally into the virtual machine.
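The essential delivery step inside receive_buf can be pictured with a small illustrative helper (deliver_to_stack is a hypothetical name used only for this sketch; reassembly of mergeable buffers, statistics, and checksum/GSO header handling are omitted):

static void deliver_to_stack(struct net_device *dev, struct sk_buff *skb)
{
    skb->protocol = eth_type_trans(skb, dev);  /* classify frame, set skb->dev */
    netif_receive_skb(skb);                    /* hand off to the normal RX path */
}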
When the virtual machine sends a packet out, the packet first traverses the guest's Linux network stack; the stack ultimately calls the NIC's xmit function, which for a virtio-net NIC is start_xmit:
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct virtnet_info *vi = netdev_priv(dev);
    int qnum = skb_get_queue_mapping(skb);
    struct send_queue *sq = &vi->sq[qnum];
    int err;
    struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
    bool kick = !skb->xmit_more;

    /* Free up any pending old buffers before queueing new ones. */
    free_old_xmit_skbs(sq);

    /* Try to transmit */
    err = xmit_skb(sq, skb);

    /* This should not happen! */
    if (unlikely(err)) {
        dev->stats.tx_fifo_errors++;
        if (net_ratelimit())
            dev_warn(&dev->dev,
                     "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
        dev->stats.tx_dropped++;
        kfree_skb(skb);
        return NETDEV_TX_OK;
    }

    /* Don't wait up for transmitted skbs to be freed. */
    skb_orphan(skb);
    nf_reset(skb);

    /* Apparently nice girls don't return TX_BUSY; stop the queue
     * before it gets out of hand.  Naturally, this wastes entries. */
    if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
        netif_stop_subqueue(dev, qnum);
        if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
            /* More just got used, free them then recheck. */
            free_old_xmit_skbs(sq);
            if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
                netif_start_subqueue(dev, qnum);
                virtqueue_disable_cb(sq->vq);
            }
        }
    }

    if (kick || netif_xmit_stopped(txq))
        virtqueue_kick(sq->vq);

    return NETDEV_TX_OK;
}
As the code shows, the skb is put into the virtqueue and virtqueue_kick is called to notify QEMU/KVM; KVM hands the notification to vhost-net, which then calls handle_tx (listed after the short kick-handler sketch below).
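The kick first lands in the TX queue's kick handler, which in drivers/vhost/net.c of this kernel era is roughly the following thin wrapper around handle_tx:

static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);
}

handle_tx itself does the actual work of moving the packet to the backend socket: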
static void handle_tx(struct vhost_net *net)
{
    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
    struct vhost_virtqueue *vq = &nvq->vq;
    unsigned out, in, s;
    int head;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL,
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    size_t len, total_len = 0;
    int err;
    size_t hdr_size;
    struct socket *sock;
    struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
    bool zcopy, zcopy_used;

    mutex_lock(&vq->mutex);
    sock = vq->private_data;
    if (!sock)
        goto out;

    vhost_disable_notify(&net->dev, vq);

    hdr_size = nvq->vhost_hlen;
    zcopy = nvq->ubufs;

    for (;;) {
        /* Release DMAs done buffers first */
        if (zcopy)
            vhost_zerocopy_signal_used(net, vq);

        /* If more outstanding DMAs, queue the work.
         * Handle upend_idx wrap around
         */
        if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
                      % UIO_MAXIOV == nvq->done_idx))
            break;

        head = vhost_get_vq_desc(vq, vq->iov,
                                 ARRAY_SIZE(vq->iov),
                                 &out, &in,
                                 NULL, NULL);
        /* On error, stop handling until the next kick. */
        if (unlikely(head < 0))
            break;
        /* Nothing new?  Wait for eventfd to tell us they refilled. */
        if (head == vq->num) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            break;
        }
        if (in) {
            vq_err(vq, "Unexpected descriptor format for TX: "
                   "out %d, int %d\n", out, in);
            break;
        }
        /* Skip header. TODO: support TSO. */
        s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
        msg.msg_iovlen = out;
        len = iov_length(vq->iov, out);
        /* Sanity check */
        if (!len) {
            vq_err(vq, "Unexpected header len for TX: "
                   "%zd expected %zd\n",
                   iov_length(nvq->hdr, s), hdr_size);
            break;
        }

        zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
                               nvq->upend_idx != nvq->done_idx);

        /* use msg_control to pass vhost zerocopy ubuf info to skb */
        if (zcopy_used) {
            vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
            if (!vhost_net_tx_select_zcopy(net) ||
                len < VHOST_GOODCOPY_LEN) {
                /* copy don't need to wait for DMA done */
                vq->heads[nvq->upend_idx].len =
                            VHOST_DMA_DONE_LEN;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                ubufs = NULL;
            } else {
                struct ubuf_info *ubuf;
                ubuf = nvq->ubuf_info + nvq->upend_idx;

                vq->heads[nvq->upend_idx].len =
                    VHOST_DMA_IN_PROGRESS;
                ubuf->callback = vhost_zerocopy_callback;
                ubuf->ctx = nvq->ubufs;
                ubuf->desc = nvq->upend_idx;
                msg.msg_control = ubuf;
                msg.msg_controllen = sizeof(ubuf);
                ubufs = nvq->ubufs;
                atomic_inc(&ubufs->refcount);
            }
            nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
        } else
            msg.msg_control = NULL;
        /* TODO: Check specific error and bomb out unless ENOBUFS? */
        err = sock->ops->sendmsg(NULL, sock, &msg, len);
        if (unlikely(err < 0)) {
            if (zcopy_used) {
                if (ubufs)
                    vhost_net_ubuf_put(ubufs);
                nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
                    % UIO_MAXIOV;
            }
            vhost_discard_vq_desc(vq, 1);
            break;
        }
        if (err != len)
            pr_debug("Truncated TX packet: "
                     " len %d != %zd\n", err, len);
        if (!zcopy_used)
            vhost_add_used_and_signal(&net->dev, vq, head, 0);
        else
            vhost_zerocopy_signal_used(net, vq);
        total_len += len;
        vhost_net_tx_packet(net);
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll);
            break;
        }
    }
out:
    mutex_unlock(&vq->mutex);
}
In this function sock->ops->sendmsg is called, which is tun_sendmsg. It ultimately calls netif_rx, the network stack's receive function for a NIC, meaning the tap device has now "received" the packet on the host side; from there the packet can be forwarded out through a Linux bridge.
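The host-side effect can be pictured with one more illustrative helper (tap_inject_to_host is a hypothetical name used only for this sketch; the real work happens inside tun_get_user() in drivers/net/tun.c):

static void tap_inject_to_host(struct net_device *tap_dev, struct sk_buff *skb)
{
    /* The guest's frame re-enters the host stack as if the tap NIC had
     * received it on the wire; a Linux bridge can then forward it. */
    skb->protocol = eth_type_trans(skb, tap_dev);
    netif_rx(skb);
}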