eBPF Talk: 在設備層統計網絡包延遲

在 Linux 內核協議棧設備層,有 2 個 tracepoint 可以用來統計網絡包的延遲:

  1. net:net_dev_xmit:設備層發送 skb 的 tracepoint。

  2. net:netif_receive_skb:設備層接收 skb 的 tracepoint。

因此,不管是向外發起請求、還是接收請求,都可以通過這 2 個 tracepoint 來統計網絡包的延遲。

利用這 2 個 tracepoint,寫了個 skbdist 工具,可以統計網絡包的延遲分佈。

如果懷疑服務器響應慢,可以用 skbdist 來統計網絡包的延遲分佈,看看是不是真的慢。

skbdist 效果

向外發起請求,統計網絡包的延遲分佈:

$ ping -c1 8.8.8.8
PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data.
64 bytes from 8.8.8.8: icmp_seq=1 ttl=128 time=160 ms

--- 8.8.8.8 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 159.706/159.706/159.706/0.000 ms

$ sudo ./skbdist -i ens33 icmp
2024/07/14 12:45:36 Attached tracepoint/net/netif_receive_skb
2024/07/14 12:45:36 Attached tracepoint/net/net_dev_xmit
Ctrl+C to show results..
^C
192.168.241.133:0 -> 8.8.8.8:0 (ICMP) (total 1 records) :
        µs               : count         distribution
         0 -> 1          : 0             |                                        |
         ...
    131072 -> 262143     : 1             |****************************************|

接收外部請求,統計網絡包的延遲分佈:

$ ping -c1 192.168.241.133
PING 192.168.241.133 (192.168.241.133): 56 data bytes
64 bytes from 192.168.241.133: icmp_seq=0 ttl=64 time=0.360 ms

--- 192.168.241.133 ping statistics ---
1 packets transmitted, 1 packets received, 0.0% packet loss
round-trip min/avg/max/stddev = 0.360/0.360/0.360/0.000 ms

$ sudo ./skbdist -i ens33 icmp
2024/07/14 12:45:52 Attached tracepoint/net/netif_receive_skb
2024/07/14 12:45:52 Attached tracepoint/net/net_dev_xmit
Ctrl+C to show results..
^C
192.168.241.133:0 -> 192.168.241.1:0 (ICMP) (total 1 records) :
        µs               : count         distribution
         0 -> 1          : 0             |                                        |
         2 -> 3          : 0             |                                        |
         4 -> 7          : 0             |                                        |
         8 -> 15         : 0             |                                        |
        16 -> 31         : 0             |                                        |
        32 -> 63         : 1             |****************************************|

skbdist 實現

先看看 skbdist 使用的 2 個 tracepoint:

$ cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/format
name: netif_receive_skb
ID: 1622
format:
    field:unsigned short common_type;   offset:0;   size:2; signed:0;
    field:unsigned char common_flags;   offset:2;   size:1; signed:0;
    field:unsigned char common_preempt_count;   offset:3;   size:1; signed:0;
    field:int common_pid;   offset:4;   size:4; signed:1;

    field:void * skbaddr;   offset:8;   size:8; signed:0;
    field:unsigned int len; offset:16;  size:4; signed:0;
    field:__data_loc char[] name;   offset:20;  size:4; signed:0;

print fmt: "dev=%s skbaddr=%p len=%u", __get_str(name), REC->skbaddr, REC->len

$ cat /sys/kernel/debug/tracing/events/net/net_dev_xmit/format
name: net_dev_xmit
ID: 1625
format:
    field:unsigned short common_type;   offset:0;   size:2; signed:0;
    field:unsigned char common_flags;   offset:2;   size:1; signed:0;
    field:unsigned char common_preempt_count;   offset:3;   size:1; signed:0;
    field:int common_pid;   offset:4;   size:4; signed:1;

    field:void * skbaddr;   offset:8;   size:8; signed:0;
    field:unsigned int len; offset:16;  size:4; signed:0;
    field:int rc;   offset:20;  size:4; signed:1;
    field:__data_loc char[] name;   offset:24;  size:4; signed:0;

print fmt: "dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), REC->skbaddr, REC->len, REC->rc

因爲不需要知道其中的 name,所以 bpf 裏只需要:

/*
 * Argument layout for the net:netif_receive_skb tracepoint, mirroring the
 * leading fields of the tracepoint format listing. The trailing __data_loc
 * `name` field (offset 20) is left out because the program does not need it.
 */
struct tp_netif_receive_skb_args {
    __u64 unused; /* covers the 8 bytes of common_* header fields (offsets 0-7) */

    void * skbaddr; /* offset 8, size 8: the skbaddr field of the record */
    unsigned int len; /* offset 16, size 4: the len field of the record */
};

/*
 * Handler attached to the net:netif_receive_skb tracepoint.
 * Forwards the skb address from the tracepoint record to handle_skb(),
 * passing `true` as the second argument (the net_dev_xmit handler passes
 * `false`), and returns whatever handle_skb() returns.
 */
SEC("tracepoint/net/netif_receive_skb")
int tracepoint__netif_receive_skb(struct tp_netif_receive_skb_args *args)
{
    /* void * converts implicitly to struct sk_buff * in C. */
    struct sk_buff *skb = args->skbaddr;

    return handle_skb(skb, true);
}

/*
 * Argument layout for the net:net_dev_xmit tracepoint, mirroring the
 * leading fields of the tracepoint format listing. The trailing __data_loc
 * `name` field (offset 24) is left out because the program does not need it.
 */
struct tp_net_dev_xmit_args {
    __u64 unused; /* covers the 8 bytes of common_* header fields (offsets 0-7) */

    void * skbaddr; /* offset 8, size 8: the skbaddr field of the record */
    unsigned int len; /* offset 16, size 4: the len field of the record */
    int rc; /* offset 20, size 4: the rc field of the record */
};

/*
 * Handler attached to the net:net_dev_xmit tracepoint.
 * Forwards the skb address from the tracepoint record to handle_skb(),
 * passing `false` as the second argument (the netif_receive_skb handler
 * passes `true`), and returns whatever handle_skb() returns.
 */
SEC("tracepoint/net/net_dev_xmit")
int tracepoint__net_dev_xmit(struct tp_net_dev_xmit_args *args)
{
    /* void * converts implicitly to struct sk_buff * in C. */
    struct sk_buff *skb = args->skbaddr;

    return handle_skb(skb, false);
}

handle_skb 函數會計算 skb 的延遲,並統計到 bpf_map 裏。

NOTE: 對於收包,需要將 IP 層的地址和 4 層的端口信息互換一下位置,保持跟發包方向的信息一致。

skbdist 支持 pcap-filter(7)

skbdist 不使用 --addr/--port 等參數,而是像 tcpdump 一樣支持 pcap-filter(7)[1],可以更靈活地過濾網絡包。

因此,在過濾網絡包的時候,可以使用 icmp、tcp port 80、udp port 53 等等,而不使用 src 1.1.1.1、udp src port 43 這樣帶有方向的 pcap-filter(7) 表達式。

skbdist 統計 skb 長度

與此同時,skbdist 也統計了 skb 的長度分佈:

$ sudo ./skbdist -i ens33 tcp port 8080
2024/07/14 13:06:06 Attached tracepoint/net/netif_receive_skb
2024/07/14 13:06:06 Attached tracepoint/net/net_dev_xmit
Ctrl+C to show results..
^C
Send SKB lengths (total 5 pkts) :
      byte               : count         distribution
         0 -> 1          : 0             |                                        |
         2 -> 3          : 0             |                                        |
         4 -> 7          : 0             |                                        |
         8 -> 15         : 0             |                                        |
        16 -> 31         : 0             |                                        |
        32 -> 63         : 0             |                                        |
        64 -> 127        : 3             |****************************************|
       128 -> 255        : 1             |*************                           |
       256 -> 511        : 0             |                                        |
       512 -> 1023       : 1             |*************                           |

Receive SKB lengths (total 6 pkts) :
      byte               : count         distribution
         0 -> 1          : 0             |                                        |
         2 -> 3          : 0             |                                        |
         4 -> 7          : 0             |                                        |
         8 -> 15         : 0             |                                        |
        16 -> 31         : 0             |                                        |
        32 -> 63         : 4             |****************************************|
        64 -> 127        : 1             |**********                              |
       128 -> 255        : 1             |**********                              |


192.168.241.133:8080 -> 192.168.241.1:53402 (TCP) (total 5 records) :
        µs               : count         distribution
         0 -> 1          : 1             |********************                    |
         2 -> 3          : 0             |                                        |
         4 -> 7          : 0             |                                        |
         8 -> 15         : 0             |                                        |
        16 -> 31         : 0             |                                        |
        32 -> 63         : 1             |********************                    |
        64 -> 127        : 2             |****************************************|
       128 -> 255        : 0             |                                        |
       256 -> 511        : 0             |                                        |
       512 -> 1023       : 0             |                                        |
      1024 -> 2047       : 0             |                                        |
      2048 -> 4095       : 0             |                                        |
      4096 -> 8191       : 0             |                                        |
      8192 -> 16383      : 1             |********************                    |

skbdist 源代碼:skbdist[2].

總結

利用設備層的 2 個 tracepoint net:net_dev_xmit 和 net:netif_receive_skb,可以統計網絡包的延遲分佈,以及 skb 的長度分佈。

參考資料

[1]

pcap-filter(7): https://www.tcpdump.org/manpages/pcap-filter.7.html

[2]

skbdist: https://github.com/Asphaltt/skbdist

本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源: https://mp.weixin.qq.com/s/dYK8ulggmY9-fQD9V7qJzw