說一說 linux 虛擬機的 ringbuffer 大小由來
linux 下的網卡 ringbuffer 大小可以通過 ethtool -g eth0 獲取,eth0 是網卡設備名,如果是其他網卡則更換爲對應網卡名即可。
[root@dev-1] /data0/src/kvm/qemu-5.2.0$ ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX: 1024
RX Mini: 0
RX Jumbo: 0
TX: 1024
Current hardware settings:
RX: 1024
RX Mini: 0
RX Jumbo: 0
TX: 1024
從上面獲取的信息可以看到,eth0 網卡發送和接收 ringbuffer 的最大值(Pre-set maximums)都是 1024,而當前已經設置(使用)的發送和接收 ringbuffer 大小(Current hardware settings)也都是 1024。
那這個 1024 大小是如何獲取到的呢?是存放在哪裏的呢?
接下來通過源碼一步步的帶大家瞭解虛擬機的 ringbuffer 的由來。
通過下載 ethtool 的源碼入手,從 ethtool.c 的 main 函數查看:
static const struct option {
const char *opts;
int want_device;
int (*func)(struct cmd_context *);
char *help;
char *opthelp;
} args[] = {
{ "-g|--show-ring", 1, do_gring, "Query RX/TX ring parameters" },
......
};
/*
 * Handler for "ethtool -g|--show-ring": obtains the queue size (ring buffer
 * parameters) of the device named by ctx->devname.
 *
 * Calls exit_bad_args() when extra arguments are left (ctx->argc != 0).
 * Returns 0 on success, the non-zero result of dump_ring() if that fails,
 * or 76 when the ETHTOOL_GRINGPARAM request itself fails.
 */
static int do_gring(struct cmd_context *ctx)
{
struct ethtool_ringparam ering;
int err;
/* -g accepts no further arguments. */
if (ctx->argc != 0)
exit_bad_args();
fprintf(stdout, "Ring parameters for %s:\n", ctx->devname);
/* Select the "get ring parameters" sub-command and send it via send_ioctl(). */
ering.cmd = ETHTOOL_GRINGPARAM;
err = send_ioctl(ctx, &ering);
if (err == 0) {
/* Request succeeded: hand the filled-in structure to dump_ring(). */
err = dump_ring(&ering);
if (err)
return err;
} else {
perror("Cannot get device ring settings");
return 76;
}
return 0;
}
/*
 * Issue one ethtool request: point ctx->ifr.ifr_data at the caller's command
 * structure and submit it with ioctl(SIOCETHTOOL) on ctx->fd.
 * Returns the value returned by ioctl().
 */
int send_ioctl(struct cmd_context *ctx, void *cmd)
{
ctx->ifr.ifr_data = cmd;
return ioctl(ctx->fd, SIOCETHTOOL, &ctx->ifr);
}
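調用 ioctl 系統調用。下面以 32 位用戶態程序運行在 64 位內核上時使用的 compat 入口爲例;原生 64 位程序走的是 sock_ioctl() → sock_do_ioctl() → dev_ioctl(),最終同樣進入 dev_ethtool():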
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg32)
{
unsigned long arg = arg32;
struct fd f = fdget(fd);
int error = -EBADF;
if (!f.file)
goto out;
/* RED-PEN how should LSM module know it's handling 32bit? */
error = security_file_ioctl(f.file, cmd, arg);
if (error)
goto out_fput;
/*
* To allow the compat_ioctl handlers to be self contained
* we need to check the common ioctls here first.
* Just handle them with the standard handlers below.
*/
switch (cmd) {
case FIOCLEX:
case FIONCLEX:
case FIONBIO:
case FIOASYNC:
case FIOQSIZE:
break;
......
default: //從上面的send_ioctl函數可以看到cmd=SIOCETHTOOL,所以默認走到default邏輯
if (f.file->f_op->compat_ioctl) {
error = f.file->f_op->compat_ioctl(f.file, cmd, arg);//默認走compat_ioctl
if (error != -ENOIOCTLCMD)
goto out_fput;
}
if (!f.file->f_op->unlocked_ioctl)
goto do_ioctl;
break;
}
......
}
compat_ioctl() 是個函數指針,註冊的對應實際函數是 compat_sock_ioctl() 如下所示:
/*
 * File operations installed on socket files.
 * ioctl requests are routed to sock_ioctl(); when the kernel is built with
 * CONFIG_COMPAT, compat ioctl requests are routed to compat_sock_ioctl().
 */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.read_iter =	sock_read_iter,
	.write_iter =	sock_write_iter,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,	/* only present with CONFIG_COMPAT */
#endif
	.mmap =		sock_mmap,
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};
/*
 * Compat ioctl handler for sockets (the .compat_ioctl member of
 * socket_file_ops).
 *
 * Handlers are tried in order, and the next one is consulted only while the
 * result is still -ENOIOCTLCMD:
 *   1. sock->ops->compat_ioctl, when the socket's ops provide one;
 *   2. compat_wext_handle_ioctl(), for cmd in [SIOCIWFIRST, SIOCIWLAST];
 *   3. compat_sock_ioctl_trans() as the final fallback.
 * Returns the result of the last handler that ran.
 */
static long compat_sock_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct socket *sock = file->private_data;
int ret = -ENOIOCTLCMD;
struct sock *sk;
struct net *net;
sk = sock->sk;
net = sock_net(sk);
if (sock->ops->compat_ioctl)
ret = sock->ops->compat_ioctl(sock, cmd, arg);
if (ret == -ENOIOCTLCMD &&
(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST))
ret = compat_wext_handle_ioctl(net, cmd, arg);
/* Still unhandled: fall back to compat_sock_ioctl_trans(). */
if (ret == -ENOIOCTLCMD)
ret = compat_sock_ioctl_trans(file, sock, cmd, arg);
return ret;
}
static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
unsigned int cmd, unsigned long arg)
{
void __user *argp = compat_ptr(arg);
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
return compat_ifr_data_ioctl(net, cmd, argp);
switch (cmd) {
case SIOCSIFBR:
case SIOCGIFBR:
return old_bridge_ioctl(argp);
case SIOCGIFCONF:
return compat_dev_ifconf(net, argp);
case SIOCETHTOOL:
return ethtool_ioctl(net, argp);
......
}
cmd=SIOCETHTOOL,所以會調用 ethtool_ioctl() 函數;ethtool_ioctl() 再通過 dev_ioctl(net, SIOCETHTOOL, ...) 進入 dev_ethtool()。
static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
{
struct compat_ethtool_rxnfc __user *compat_rxnfc;
bool convert_in = false, convert_out = false;
size_t buf_size = 0;
struct ethtool_rxnfc __user *rxnfc = NULL;
struct ifreq ifr;
u32 rule_cnt = 0, actual_rule_cnt;
u32 ethcmd;
u32 data;
int ret;
if (get_user(data, &ifr32->ifr_ifru.ifru_data))
return -EFAULT;
compat_rxnfc = compat_ptr(data);
if (get_user(ethcmd, &compat_rxnfc->cmd))
return -EFAULT;
/* Most ethtool structures are defined without padding.
* Unfortunately struct ethtool_rxnfc is an exception.
*/
switch (ethcmd) {
default:
break;
case ETHTOOL_GRXRINGS:
case ETHTOOL_GRXCLSRLCNT:
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_SRXCLSRLINS:
convert_out = true;
/* fall through */
case ETHTOOL_SRXCLSRLDEL:
buf_size += sizeof(struct ethtool_rxnfc);
convert_in = true;
rxnfc = compat_alloc_user_space(buf_size);
break;
}
......
ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
if (ret)
return ret;
......
}
int dev_ethtool(struct net *net, struct ifreq *ifr)
{
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
void __user *useraddr = ifr->ifr_data;
u32 ethcmd, sub_cmd;
int rc;
netdev_features_t old_features;
if (!dev || !netif_device_present(dev))
return -ENODEV;
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
if (ethcmd == ETHTOOL_PERQUEUE) {
if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
return -EFAULT;
} else {
sub_cmd = ethcmd;
}
/* Allow some commands to be done by anyone */
switch (sub_cmd) {
case ETHTOOL_GSET:
case ETHTOOL_GDRVINFO:
case ETHTOOL_GMSGLVL:
case ETHTOOL_GLINK:
case ETHTOOL_GCOALESCE:
case ETHTOOL_GRINGPARAM:
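...... // 第一個 switch 只做權限檢查,其餘 case 省略
break;
......
}
......
switch (ethcmd) { // 第二個 switch 才真正分發命令
......
case ETHTOOL_GRINGPARAM: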
rc = ethtool_get_ringparam(dev, useraddr);// 通過ethtool_get_ringparam()函數獲取ringbuffer大小
break;
case ETHTOOL_SRINGPARAM:
rc = ethtool_set_ringparam(dev, useraddr);
break;
......
}
/*
 * Handle ETHTOOL_GRINGPARAM: let the driver fill in a struct
 * ethtool_ringparam and copy it back to user space at useraddr.
 *
 * Returns -EOPNOTSUPP when the driver's ethtool_ops has no get_ringparam
 * callback, -EFAULT when the copy to user space fails, and 0 otherwise.
 */
static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
if (!dev->ethtool_ops->get_ringparam)
return -EOPNOTSUPP;
/* Driver-specific callback supplies the ring sizes. */
dev->ethtool_ops->get_ringparam(dev, &ringparam);
if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
return -EFAULT;
return 0;
}
/*
 * ethtool callback table of the virtio-net driver.
 * The function registered for .get_ringparam is virtnet_get_ringparam().
 */
static const struct ethtool_ops virtnet_ethtool_ops = {
.get_drvinfo = virtnet_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_ringparam = virtnet_get_ringparam,
.get_strings = virtnet_get_strings,
.get_sset_count = virtnet_get_sset_count,
.get_ethtool_stats = virtnet_get_ethtool_stats,
.set_channels = virtnet_set_channels,
.get_channels = virtnet_get_channels,
.get_ts_info = ethtool_op_get_ts_info,
.get_link_ksettings = virtnet_get_link_ksettings,
.set_link_ksettings = virtnet_set_link_ksettings,
};
/*
 * get_ringparam callback for virtio-net.
 *
 * The maximum RX/TX values are taken from the vring size of the first
 * entries of vi->rq[] and vi->sq[] respectively; the current (pending)
 * values are then set equal to those maximums.
 */
static void virtnet_get_ringparam(struct net_device *dev,
struct ethtool_ringparam *ring)
{
struct virtnet_info *vi = netdev_priv(dev);
ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
/* Current size is reported as identical to the maximum. */
ring->rx_pending = ring->rx_max_pending;
ring->tx_pending = ring->tx_max_pending;
}
/* Return the vring.num field of the vring_virtqueue that wraps _vq. */
unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
return vq->vring.num; /* vring.num is the queue size (ring buffer length) */
}
OK,以上就是通過 ethtool 工具獲取網卡 ringbuffer 的過程,從最後可以看到,針對虛擬機獲取網卡 ringbuffer,最終走到了 virtio-net 驅動層,獲取的是 vring.num 變量的值作爲網卡的 ringbuffer 大小返回。那麼 virtio-net 層的 vring.num 是在什麼時候傳入的呢?
接下來從虛擬化層面闡述下 vring.num 的由來。
虛擬機的網卡隊列長度(ringbuffer)大小可以在定義 XML 文件時通過 rx_queue_size / tx_queue_size 傳入。如果不設置,默認值是 256,這同時也是允許的最小值;最大值是 1024,並且必須是 2 的冪,所以 virtio-net 網卡隊列長度只能取 256、512 或 1024。
<interface type='direct'>
<mac address='52:54:00:3b:a7:a7'/>
<source dev='br0' mode='bridge'/>
<model type='virtio'/>
<driver name='vhost' queues='2' rx_queue_size='1024' tx_queue_size='1024'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
</interface>
以下是 qemu 的部分代碼,代碼主要在啓動 VM 的過程中初始化 virtio-net 網卡的 ringbuffer 信息。
/* Default RX and TX queue sizes. */
#define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
#define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256
/* for now, only allow larger queues; with virtio-1, guest can downsize */
/* The minimum accepted size is therefore the same as the default. */
#define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
#define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE
/* Upper bound against which configured queue sizes are checked. */
#define VIRTQUEUE_MAX_SIZE 1024
static void virtio_net_device_realize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIONet *n = VIRTIO_NET(dev);
NetClientState *nc;
int i;
if (n->net_conf.mtu) {
n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
}
.......
if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
!is_power_of_2(n->net_conf.rx_queue_size)) { //對配置的虛擬機隊列長度進行判斷,必須是 2 的冪,且在VIRTIO_NET_RX_QUEUE_MIN_SIZE和VIRTQUEUE_MAX_SIZE 之間
error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
"must be a power of 2 between %d and %d.",
n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
VIRTQUEUE_MAX_SIZE);
virtio_cleanup(vdev);
return;
}
if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
n->net_conf.tx_queue_size > VIRTQUEUE_MAX_SIZE ||
!is_power_of_2(n->net_conf.tx_queue_size)) {
error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
"must be a power of 2 between %d and %d",
n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
VIRTQUEUE_MAX_SIZE);
virtio_cleanup(vdev);
return;
}
.......
for (i = 0; i < n->max_queues; i++) {
virtio_net_add_queue(n, i);
}
/*
 * Create the RX and TX virtqueues for queue pair `index` of device n.
 *
 * The RX queue is sized by n->net_conf.rx_queue_size and the TX queue by
 * n->net_conf.tx_queue_size. When n->net_conf.tx is the string "timer" the
 * TX side gets a timer (timer_new_ns); otherwise it gets a BH created with
 * qemu_bh_new().
 */
static void virtio_net_add_queue(VirtIONet *n, int index)
{
VirtIODevice *vdev = VIRTIO_DEVICE(n);
n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
virtio_net_handle_rx); /* each queue's RX ring length is initialised from rx_queue_size */
if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
/* Timer-driven TX. */
n->vqs[index].tx_vq =
virtio_add_queue(vdev, n->net_conf.tx_queue_size,
virtio_net_handle_tx_timer);
n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
virtio_net_tx_timer,
&n->vqs[index]);
} else {
/* BH-driven TX. */
n->vqs[index].tx_vq =
virtio_add_queue(vdev, n->net_conf.tx_queue_size,
virtio_net_handle_tx_bh);
n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
}
n->vqs[index].tx_waiting = 0;
/* Back-pointer from the queue pair to its owning device. */
n->vqs[index].n = n;
}
/*
 * Claim the first unused slot of vdev->vq[] (a slot whose vring.num is 0)
 * and initialise it as a queue with queue_size entries and the given
 * output handler.
 *
 * Calls abort() when all VIRTIO_QUEUE_MAX slots are in use or when
 * queue_size exceeds VIRTQUEUE_MAX_SIZE.
 * Returns a pointer to the initialised slot.
 */
VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
VirtIOHandleOutput handle_output)
{
int i;
/* Find the first free slot. */
for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
if (vdev->vq[i].vring.num == 0)
break;
}
if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE)
abort();
vdev->vq[i].vring.num = queue_size; /* vring.num takes the requested queue_size */
vdev->vq[i].vring.num_default = queue_size;
vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
vdev->vq[i].handle_output = handle_output;
vdev->vq[i].handle_aio_output = NULL;
/* Zero-filled array with one VirtQueueElement per ring entry. */
vdev->vq[i].used_elems = g_malloc0(sizeof(VirtQueueElement) *
queue_size);
return &vdev->vq[i];
}
總結:從上面的分析可以看到,ethtool -g eth0 獲取到的網卡 ringbuffer 大小,實際上是 virtio-net 驅動層返回的 vring.num;而 vring.num 的值來自定義虛擬機 XML 文件時設置的 rx_queue_size 和 tx_queue_size(未設置時爲默認值 256),由 qemu 在初始化 virtio-net 設備時通過 virtio_add_queue() 寫入。
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/FIZmg0r72joytjWo5-PmPQ