說一說 linux 虛擬機的 ringbuffer 大小由來

linux 下的網卡 ringbuffer 大小可以通過 ethtool -g eth0 獲取,eth0 是網卡設備名,如果是其他網卡則更換爲對應網卡名即可。

[root@dev-1] /data0/src/kvm/qemu-5.2.0$ ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX:  1024
RX Mini: 0
RX Jumbo: 0
TX:  1024
Current hardware settings:
RX:  1024
RX Mini: 0
RX Jumbo: 0
TX:  1024

從上面獲取的信息可以看到,eth0 網卡的發送和接收 ringbuffer 最大值(Pre-set maximums)都是 1024,而當前已經設置(正在使用)的發送和接收 ringbuffer 大小(Current hardware settings)也都是 1024。

那這個 1024 大小是如何獲取到的呢?是存放在哪裏的呢?

接下來通過源碼一步步地帶大家瞭解虛擬機的 ringbuffer 的由來。

通過下載 ethtool 的源碼入手,從 ethtool.c 的 main 函數查看:

/*
 * Command-line option table (excerpt): each entry ties an option spelling
 * to the function that handles it. Only the "-g" entry is shown here.
 */
static const struct option {
    const char *opts;                  /* option spellings, separated by '|' */
    int want_device;                   /* 1 in the -g entry below */
    int (*func)(struct cmd_context *); /* handler for the option */
    char *help;                        /* one-line description */
    char *opthelp;                     /* not set by the -g entry below */
} args[] = {
  { "-g|--show-ring", 1, do_gring, "Query RX/TX ring parameters" },
};
// do_gring() fetches the queue size (ring buffer) of the given NIC.
 /*
  * Handler for "ethtool -g <dev>".
  *
  * Prints the header line, asks the kernel for the ring parameters through
  * an ETHTOOL_GRINGPARAM ioctl and hands the result to dump_ring().
  *
  * Returns 0 on success, 76 when the ioctl fails, or whatever non-zero
  * value dump_ring() reports.
  */
 static int do_gring(struct cmd_context *ctx)
 {
     struct ethtool_ringparam ering;
     int rc;

     /* This sub-command accepts no extra arguments. */
     if (ctx->argc != 0)
         exit_bad_args();

     fprintf(stdout, "Ring parameters for %s:\n", ctx->devname);

     ering.cmd = ETHTOOL_GRINGPARAM;
     rc = send_ioctl(ctx, &ering);
     if (rc != 0) {
         perror("Cannot get device ring settings");
         return 76;
     }

     /* dump_ring() yields 0 on success, so its result is the final status. */
     return dump_ring(&ering);
 }
/*
 * Issue an ethtool request for the device described by ctx.
 *
 * cmd points at the ethtool command structure; it is stored in the
 * ifr_data field of ctx->ifr and the whole request is sent with the
 * SIOCETHTOOL ioctl on ctx->fd.
 *
 * Returns the value returned by ioctl().
 */
int send_ioctl(struct cmd_context *ctx, void *cmd)
{
    int rc;

    ctx->ifr.ifr_data = cmd;
    rc = ioctl(ctx->fd, SIOCETHTOOL, &ctx->ifr);

    return rc;
}

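send_ioctl() 最終調用 ioctl 系統調用。下面以 compat 入口(32 位用戶態程序運行在 64 位內核上)爲例;原生 64 位程序走的是 SYSCALL_DEFINE3(ioctl) → sock_ioctl() → sock_do_ioctl(),最終同樣會到達 dev_ioctl()。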

 /*
  * Compat entry point of the ioctl system call (excerpt; "......" marks
  * code elided from this listing).
  *
  * Resolves fd to a file, runs the security hook, then switches on cmd.
  * Commands not matched by an explicit case reach the default branch,
  * which calls the file's compat_ioctl handler when one is registered.
  */
 COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
                compat_ulong_t, arg32)
 {
     unsigned long arg = arg32;
     struct fd f = fdget(fd);
     int error = -EBADF;
     if (!f.file)
         goto out;
     /* RED-PEN how should LSM module know it's handling 32bit? */
     error = security_file_ioctl(f.file, cmd, arg);
     if (error)
         goto out_fput;
     /*
      * To allow the compat_ioctl handlers to be self contained
      * we need to check the common ioctls here first.
      * Just handle them with the standard handlers below.
      */
     switch (cmd) {
     case FIOCLEX:
     case FIONCLEX:
     case FIONBIO:
     case FIOASYNC:
     case FIOQSIZE:
         break;
     ......
     default: // send_ioctl() above passes cmd=SIOCETHTOOL, so control ends up in this default branch
         if (f.file->f_op->compat_ioctl) {
             error = f.file->f_op->compat_ioctl(f.file, cmd, arg);// dispatch to the file's compat_ioctl handler
             if (error != -ENOIOCTLCMD)
                 goto out_fput;
         }
         if (!f.file->f_op->unlocked_ioctl)
             goto do_ioctl;
         break;
     }     
     ...... 
}

compat_ioctl() 是個函數指針,註冊的對應實際函數是 compat_sock_ioctl() 如下所示:

/*
 * File operations installed on socket files. The entry relevant to this
 * walkthrough is .compat_ioctl, which points at compat_sock_ioctl() when
 * CONFIG_COMPAT is enabled.
 */
static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    .llseek =   no_llseek,
    .read_iter =    sock_read_iter,
    .write_iter =   sock_write_iter,
    .poll =     sock_poll,
    .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = compat_sock_ioctl,
#endif
    .mmap =     sock_mmap,
    .release =  sock_close,
    .fasync =   sock_fasync,
    .sendpage = sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =  sock_splice_read,
};
 /*
  * compat_ioctl handler for socket files.
  *
  * Tries up to three handlers in order and returns the first result that
  * is not -ENOIOCTLCMD:
  *   1. the protocol's own compat_ioctl hook, if it has one;
  *   2. compat_wext_handle_ioctl(), only for cmd in
  *      [SIOCIWFIRST, SIOCIWLAST];
  *   3. compat_sock_ioctl_trans(), whose result is returned as-is.
  */
 static long compat_sock_ioctl(struct file *file, unsigned int cmd,
                   unsigned long arg)
 {
     struct socket *sock = file->private_data;
     struct net *net = sock_net(sock->sk);
     int ret = -ENOIOCTLCMD;

     if (sock->ops->compat_ioctl) {
         ret = sock->ops->compat_ioctl(sock, cmd, arg);
         if (ret != -ENOIOCTLCMD)
             return ret;
     }

     if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
         ret = compat_wext_handle_ioctl(net, cmd, arg);
         if (ret != -ENOIOCTLCMD)
             return ret;
     }

     ret = compat_sock_ioctl_trans(file, sock, cmd, arg);
     return ret;
 }
/*
 * Route a socket ioctl command to its compat handler (excerpt; "......"
 * marks elided cases and the end of the switch is not shown).
 *
 * Commands in [SIOCDEVPRIVATE, SIOCDEVPRIVATE + 15] go to
 * compat_ifr_data_ioctl(); the rest are dispatched by the switch.
 */
static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
             unsigned int cmd, unsigned long arg)
{
    void __user *argp = compat_ptr(arg);
    struct sock *sk = sock->sk;
    struct net *net = sock_net(sk);
    if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
        return compat_ifr_data_ioctl(net, cmd, argp);
    switch (cmd) {
    case SIOCSIFBR:
    case SIOCGIFBR:
        return old_bridge_ioctl(argp);
    case SIOCGIFCONF:
        return compat_dev_ifconf(net, argp);
    /* The ethtool -g request arrives here with cmd == SIOCETHTOOL. */
    case SIOCETHTOOL:
        return ethtool_ioctl(net, argp);
    ......
}

cmd=SIOCETHTOOL,所以會調用 ethtool_ioctl() 函數;ethtool_ioctl() 再通過 dev_ioctl() 進入下面的 dev_ethtool()。

 /*
  * Compat handler for SIOCETHTOOL (excerpt; "......" marks elided code).
  *
  * Reads the 32-bit user pointer stored in the compat ifreq, then reads
  * the ethtool command number from the structure it points to. Only the
  * rxnfc-related commands listed in the switch set up a conversion
  * buffer; every other command takes the default branch. The request is
  * then passed on through dev_ioctl().
  */
 static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
{
     struct compat_ethtool_rxnfc __user *compat_rxnfc;
     bool convert_in = false, convert_out = false;
     size_t buf_size = 0;
     struct ethtool_rxnfc __user *rxnfc = NULL;
     struct ifreq ifr;
     u32 rule_cnt = 0, actual_rule_cnt;
     u32 ethcmd;
     u32 data;
     int ret;
     /* data: the user-space pointer carried in the ifreq's data field */
     if (get_user(data, &ifr32->ifr_ifru.ifru_data))
         return -EFAULT;
     compat_rxnfc = compat_ptr(data);
     /* ethcmd: the cmd field at the start of the user structure */
     if (get_user(ethcmd, &compat_rxnfc->cmd))
         return -EFAULT;
     /* Most ethtool structures are defined without padding.
      * Unfortunately struct ethtool_rxnfc is an exception.
      */
     switch (ethcmd) {
     default:
         break;
     case ETHTOOL_GRXRINGS:
     case ETHTOOL_GRXCLSRLCNT:
     case ETHTOOL_GRXCLSRULE:
     case ETHTOOL_SRXCLSRLINS:
         convert_out = true;
         /* fall through */
     case ETHTOOL_SRXCLSRLDEL:
         buf_size += sizeof(struct ethtool_rxnfc);
         convert_in = true;
         rxnfc = compat_alloc_user_space(buf_size);
         break;
     }
     ......
     ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
     if (ret)
         return ret;
     ......
}                   
/*
 * Kernel-side handler for ethtool requests (excerpt; "......" marks code
 * elided from this listing).
 *
 * Looks up the device named in ifr, reads the ethtool command number from
 * the start of the user buffer, and then goes through two switches: the
 * first lists the commands anyone may issue, the second dispatches the
 * command to its handler.
 *
 * Returns -ENODEV when the device does not exist or is not present and
 * -EFAULT when the user buffer cannot be read.
 */
int dev_ethtool(struct net *net, struct ifreq *ifr)
{
    struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
    void __user *useraddr = ifr->ifr_data;
    u32 ethcmd, sub_cmd;
    int rc;
    netdev_features_t old_features;
    if (!dev || !netif_device_present(dev))
        return -ENODEV;
    /* The command number is the first u32 of the user structure. */
    if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
        return -EFAULT;
    if (ethcmd == ETHTOOL_PERQUEUE) {
        /* For per-queue requests the real command follows the wrapper's cmd. */
        if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
            return -EFAULT;
    } else {
        sub_cmd = ethcmd;
    }
    /* Allow some commands to be done by anyone */
    switch (sub_cmd) {
    case ETHTOOL_GSET:
    case ETHTOOL_GDRVINFO:
    case ETHTOOL_GMSGLVL:
    case ETHTOOL_GLINK:
    case ETHTOOL_GCOALESCE:
    case ETHTOOL_GRINGPARAM:
    ......
        break;
    ......
    }
    ......
    /* Dispatch the command to its handler. */
    switch (ethcmd) {
    ......
    case ETHTOOL_GRINGPARAM:
        rc = ethtool_get_ringparam(dev, useraddr);// ethtool_get_ringparam() fetches the ring buffer size
        break;
    case ETHTOOL_SRINGPARAM:
        rc = ethtool_set_ringparam(dev, useraddr);
        break;
    ......
}
/*
 * Serve ETHTOOL_GRINGPARAM for dev.
 *
 * Asks the driver's get_ringparam hook to fill in a ring-parameter
 * structure and copies that structure to user space at useraddr.
 *
 * Returns 0 on success, -EOPNOTSUPP when the driver provides no
 * get_ringparam hook, or -EFAULT when the copy to user space fails.
 */
static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
{
    struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };

    if (dev->ethtool_ops->get_ringparam == NULL)
        return -EOPNOTSUPP;

    dev->ethtool_ops->get_ringparam(dev, &ringparam);

    return copy_to_user(useraddr, &ringparam, sizeof(ringparam)) ? -EFAULT : 0;
}
// The function registered for get_ringparam is virtnet_get_ringparam.
static const struct ethtool_ops virtnet_ethtool_ops = {
    /* ring size query, the hook this walkthrough follows */
    .get_ringparam      = virtnet_get_ringparam,

    /* driver and link information */
    .get_drvinfo        = virtnet_get_drvinfo,
    .get_link           = ethtool_op_get_link,
    .get_link_ksettings = virtnet_get_link_ksettings,
    .set_link_ksettings = virtnet_set_link_ksettings,
    .get_ts_info        = ethtool_op_get_ts_info,

    /* statistics */
    .get_strings        = virtnet_get_strings,
    .get_sset_count     = virtnet_get_sset_count,
    .get_ethtool_stats  = virtnet_get_ethtool_stats,

    /* channel configuration */
    .get_channels       = virtnet_get_channels,
    .set_channels       = virtnet_set_channels,
};
/*
 * get_ringparam hook of the virtio-net driver.
 *
 * Reports the vring size of the first receive queue and of the first
 * send queue. The current values are set equal to the maximum values.
 */
static void virtnet_get_ringparam(struct net_device *dev,
                struct ethtool_ringparam *ring)
{
    struct virtnet_info *vi = netdev_priv(dev);
    unsigned int rx_size = virtqueue_get_vring_size(vi->rq[0].vq);
    unsigned int tx_size = virtqueue_get_vring_size(vi->sq[0].vq);

    ring->rx_max_pending = rx_size;
    ring->tx_max_pending = tx_size;

    /* current == maximum */
    ring->rx_pending = ring->rx_max_pending;
    ring->tx_pending = ring->tx_max_pending;
}
unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
{
    struct vring_virtqueue *vq = to_vvq(_vq);
    return vq->vring.num;// vring.num就是隊列大小(ringbuffer)
}

OK,以上就是通過 ethtool 工具獲取網卡 ringbuffer 的過程,從最後可以看到,針對虛擬機獲取網卡 ringbuffer,最終走到了 virtio-net 驅動層,獲取的是 vring.num 變量的值作爲網卡的 ringbuffer 大小返回。那麼 virtio-net 層的 vring.num 是在什麼時候傳入的呢?

接下來從虛擬化層面闡述下 vring.num 的由來。

虛擬機的網卡隊列長度(ringbuffer)可以在定義 XML 文件的時候傳入。如果不設置,默認值是 256,這同時也是允許的最小值;最大值是 1024,並且必須是 2 的冪。所以 virtio-net 網卡隊列長度的取值範圍是 [256,1024]。

      <interface type='direct'>
        <mac address='52:54:00:3b:a7:a7'/>
        <source dev='br0' mode='bridge'/>
        <model type='virtio'/>
        <!-- rx_queue_size / tx_queue_size set the RX / TX queue length of this NIC -->
       <driver name='vhost' queues='2' rx_queue_size='1024' tx_queue_size='1024'/>
        <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
     </interface>

以下是 qemu 的部分代碼,代碼主要在啓動 VM 的過程中初始化 virtio-net 網卡的 ringbuffer 信息。

  /* Default RX/TX queue lengths; reused below as the minimum accepted size. */
  #define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
 #define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256
 /* for now, only allow larger queues; with virtio-1, guest can downsize */
 #define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
 #define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE
 /* Largest queue length accepted by the size checks in the code below. */
 #define VIRTQUEUE_MAX_SIZE 1024
 /*
  * Realize function of the virtio-net device (excerpt; "......." marks
  * elided code and the end of the function is not shown).
  *
  * The visible part validates the configured rx/tx queue sizes and then
  * calls virtio_net_add_queue() for every index below n->max_queues.
  * On an invalid size it sets an error in errp, calls virtio_cleanup()
  * and returns.
  */
 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIONet *n = VIRTIO_NET(dev);
     NetClientState *nc;
     int i;
     if (n->net_conf.mtu) {
         n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
     }
    .......
    if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
        n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
        !is_power_of_2(n->net_conf.rx_queue_size)) { // reject rx_queue_size unless it is a power of 2 within [VIRTIO_NET_RX_QUEUE_MIN_SIZE, VIRTQUEUE_MAX_SIZE]
        error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
                   "must be a power of 2 between %d and %d.",
                   n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
                   VIRTQUEUE_MAX_SIZE);
        virtio_cleanup(vdev);
        return;
    }
    /* Same validation for tx_queue_size. */
    if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
        n->net_conf.tx_queue_size > VIRTQUEUE_MAX_SIZE ||
        !is_power_of_2(n->net_conf.tx_queue_size)) {
        error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
                   "must be a power of 2 between %d and %d",
                   n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
                   VIRTQUEUE_MAX_SIZE);
        virtio_cleanup(vdev);
        return;
    }
     ....... 
     /* Create the queues for each index. */
     for (i = 0; i < n->max_queues; i++) {
        virtio_net_add_queue(n, i);
    }
/*
 * Create the RX and TX virtqueues for queue slot `index` of device n.
 *
 * The RX queue is created with rx_queue_size entries and the TX queue
 * with tx_queue_size entries. When the "tx" setting equals "timer" the TX
 * queue uses the timer-based handler and gets a timer; otherwise it uses
 * the bottom-half handler and gets a BH.
 */
static void virtio_net_add_queue(VirtIONet *n, int index)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    bool use_tx_timer;

    /* Each NIC queue gets its length from rx_queue_size. */
    n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
                                           virtio_net_handle_rx);

    use_tx_timer = n->net_conf.tx && !strcmp(n->net_conf.tx, "timer");

    n->vqs[index].tx_vq =
        virtio_add_queue(vdev, n->net_conf.tx_queue_size,
                         use_tx_timer ? virtio_net_handle_tx_timer
                                      : virtio_net_handle_tx_bh);

    if (use_tx_timer) {
        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                              virtio_net_tx_timer,
                                              &n->vqs[index]);
    } else {
        n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
    }

    n->vqs[index].tx_waiting = 0;
    n->vqs[index].n = n;
}
/*
 * Claim the first unused virtqueue slot of vdev and initialise it.
 *
 * A slot counts as unused while its vring.num is 0. The slot's vring.num
 * and vring.num_default are set to queue_size, and an element array of
 * queue_size entries is allocated with g_malloc0().
 *
 * Aborts when every slot is taken or queue_size exceeds
 * VIRTQUEUE_MAX_SIZE. Returns a pointer to the initialised slot.
 */
VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
                            VirtIOHandleOutput handle_output)
{
    VirtQueue *vq;
    int slot = 0;

    /* Find the first free slot. */
    while (slot < VIRTIO_QUEUE_MAX && vdev->vq[slot].vring.num != 0) {
        slot++;
    }

    if (slot == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) {
        abort();
    }

    vq = &vdev->vq[slot];
    vq->vring.num = queue_size; /* vring.num = queue_size */
    vq->vring.num_default = queue_size;
    vq->vring.align = VIRTIO_PCI_VRING_ALIGN;
    vq->handle_output = handle_output;
    vq->handle_aio_output = NULL;
    vq->used_elems = g_malloc0(sizeof(VirtQueueElement) *
                               queue_size);

    return vq;
}

總結:從上面的分析可以看到,ethtool -g eth0 獲取到的網卡 ringbuffer 大小,實際上是 virtio-net 驅動層的 vring.num;而 vring.num 的值來自定義虛擬機 XML 文件時設置的 rx_queue_size 和 tx_queue_size,qemu 在 virtio_add_queue() 中把它們寫入 vring.num。

本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源https://mp.weixin.qq.com/s/FIZmg0r72joytjWo5-PmPQ