eBPF Talk: XDP on Mellanox
爲了更高的性能,需要將 XDP 程序下沉到網卡驅動裏去運行。
因爲服務器使用的物理網卡是 Mellanox,所以就研究一下 Mellanox 驅動裏是怎麼運行 XDP 程序的。
XDP on Mellanox
直接在內核源代碼裏的 /drivers/net/ethernet/mellanox
目錄下搜索 XDP_REDIRECT
,就能找到如下代碼片段:
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page,
struct bpf_prog *prog, struct xdp_buff *xdp)
{
u32 act;
int err;
/* Run the attached XDP program; act is its verdict. */
act = bpf_prog_run_xdp(prog, xdp);
switch (act) {
case XDP_PASS:
/* Not consumed: caller builds an skb and passes it up the stack. */
return false;
case XDP_TX:
/* Bounce the packet back out of this device via the XDP send queue. */
if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, page, xdp)))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
return true;
case XDP_REDIRECT:
/* When XDP enabled then page-refcnt==1 here */
err = xdp_do_redirect(rq->netdev, xdp, prog);
if (unlikely(err))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
/* Page now belongs to the redirect target; drop the RX DMA mapping,
 * except for XSK buffer-pool memory which manages its own mapping. */
if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
mlx5e_page_dma_unmap(rq, page);
rq->stats->xdp_redirect++;
return true;
default:
/* Unknown verdict: warn, then fall through to the ABORTED path. */
bpf_warn_invalid_xdp_action(rq->netdev, prog, act);
fallthrough;
case XDP_ABORTED:
xdp_abort:
trace_xdp_exception(rq->netdev, prog, act);
fallthrough;
case XDP_DROP:
rq->stats->xdp_drop++;
return true;
}
}
bpf_prog_run_xdp()
就是真實運行 XDP 程序的函數。
XDP_PASS on Mellanox
如果 XDP 程序裏使用 XDP_PASS
放行該網絡包到內核,Mellanox 網卡驅動還做了哪些處理呢?
有 3 個地方調了 mlx5e_xdp_handle()
函數。
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
// 第 1 個
static struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
/* NOTE(review): excerpt is elided ("// ..."); locals va, data, au,
 * rx_headroom and metasize are initialized in the omitted part. */
// ...
net_prefetch(data);
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
struct xdp_buff xdp;
net_prefetchw(va); /* xdp_frame data area */
mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
if (mlx5e_xdp_handle(rq, au->page, prog, &xdp))
return NULL; /* page/packet was consumed by XDP */
/* XDP may have moved data/data_meta/data_end (adjust_head/meta/tail):
 * recompute headroom, metadata size and byte count for skb building. */
rx_headroom = xdp.data - xdp.data_hard_start;
metasize = xdp.data - xdp.data_meta;
cqe_bcnt = xdp.data_end - xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
// ...
return skb;
}
// 第 2 個
static struct sk_buff *
mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
/* NOTE(review): excerpt is elided ("// ..."); xdp, au and head_wi are
 * set up in the omitted part. */
// ...
prog = rcu_dereference(rq->xdp_prog);
if (prog && mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
/* Consumed by XDP.  If it went out via XDP_TX, release the
 * remaining fragments of this multi-fragment WQE. */
if (test_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
int i;
for (i = wi - head_wi; i < rq->wqe.info.num_frags; i++)
mlx5e_put_rx_frag(rq, &head_wi[i], true); /* NOTE(review): 3rd arg presumably a recycle hint — confirm */
}
return NULL; /* page/packet was consumed by XDP */
}
/* XDP_PASS (or no program): build the skb from the possibly-adjusted
 * xdp_buff pointers. */
skb = mlx5e_build_linear_skb(rq, xdp.data_hard_start, rq->buff.frame0_sz,
xdp.data - xdp.data_hard_start,
xdp.data_end - xdp.data,
xdp.data - xdp.data_meta);
// ...
return skb;
}
// 第 3 個
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
u16 cqe_bcnt, u32 head_offset, u32 page_idx)
{
/* NOTE(review): excerpt is elided ("// ..."); va, data, au, rx_headroom
 * and metasize come from the omitted part. */
// ...
net_prefetch(data);
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
struct xdp_buff xdp;
net_prefetchw(va); /* xdp_frame data area */
mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
if (mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
/* Multi-packet WQE: record per-page that this page was handed
 * to the XDP transmit path. */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
__set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
return NULL; /* page/packet was consumed by XDP */
}
/* XDP may have moved the data pointers; recompute before building skb. */
rx_headroom = xdp.data - xdp.data_hard_start;
metasize = xdp.data - xdp.data_meta;
cqe_bcnt = xdp.data_end - xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
// ...
return skb;
}
/* Wrap an existing receive buffer into an skb without copying the payload.
 * va points at the buffer start, frag_size is the backing size handed to
 * build_skb(); headroom/cqe_bcnt/metasize position the data and metadata.
 * Returns NULL (and bumps buff_alloc_err) when skb allocation fails.
 */
static inline
struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
				       u32 frag_size, u16 headroom,
				       u32 cqe_bcnt, u32 metasize)
{
	struct sk_buff *skb;

	skb = build_skb(va, frag_size);
	if (unlikely(skb == NULL)) {
		rq->stats->buff_alloc_err++;
		return NULL;
	}

	/* Move skb->data past the headroom and cover the payload bytes. */
	skb_reserve(skb, headroom);
	skb_put(skb, cqe_bcnt);

	if (metasize != 0)
		skb_metadata_set(skb, metasize);

	return skb;
}
從上面代碼片段可以看出,在執行 XDP 程序之後,都調用 mlx5e_build_linear_skb()
函數來構建 skb
。
XDP_TX on Mellanox
如果 XDP 程序裏使用 XDP_TX
將該網絡包發送出去,Mellanox 網卡驅動還做了哪些處理呢?
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
mlx5e_xmit_xdp_buff()
|-->mlx5e_xdpi_fifo_push()
直接在驅動內部調 mlx5e_xdpi_fifo_push()
函數發送出去了。
XDP_REDIRECT on Mellanox
如果 XDP 程序裏使用 XDP_REDIRECT
轉發該網絡包,Mellanox 網卡驅動還做了哪些處理呢?
驅動裏調 xdp_do_redirect()
進行了轉發處理。
// ${KERNEL}/net/core/filter.c
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
/* NOTE(review): excerpt is elided ("// ..."); ri (bpf_redirect_info) is
 * obtained in the omitted part. */
// ...
/* Detach the packet from the driver by converting the xdp_buff into a
 * standalone xdp_frame, then hand it to the generic redirect path. */
return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
xdp_prog);
}
static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
struct net_device *dev,
struct xdp_frame *xdpf,
struct bpf_prog *xdp_prog)
{
/* NOTE(review): only one arm of the elided switch is shown here. */
// ...
case BPF_MAP_TYPE_UNSPEC:
/* NOTE(review): map_id == INT_MAX appears to mark a redirect by
 * ifindex (no map involved) — confirm against kernel source. */
if (map_id == INT_MAX) {
fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
if (unlikely(!fwd)) {
err = -EINVAL;
break;
}
/* Queue the frame for transmission on the target device. */
err = dev_xdp_enqueue(fwd, xdpf, dev);
break;
}
fallthrough;
// ...
}
// ${KERNEL}/kernel/bpf/devmap.c
/* Enqueue an xdp_frame for transmission on dev; thin wrapper that passes
 * xdp_prog == NULL (no devmap-attached program to run). */
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
/* Validate the target device and queue xdpf on the per-CPU bulk queue.
 * Returns 0 on success, -EOPNOTSUPP when the target lacks an XDP transmit
 * hook, or the error from the forwarding check.
 */
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
				struct net_device *dev_rx,
				struct bpf_prog *xdp_prog)
{
	int ret;

	/* The target must implement ndo_xdp_xmit to accept XDP frames. */
	if (!dev->netdev_ops->ndo_xdp_xmit)
		return -EOPNOTSUPP;

	ret = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
	if (unlikely(ret))
		return ret;

	bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
	return 0;
}
// ${KERNEL}/include/net/xdp.h
static inline
int xdp_update_frame_from_buff(struct xdp_buff *xdp,
struct xdp_frame *xdp_frame)
{
int metasize, headroom;
/* Assure headroom is available for storing info */
headroom = xdp->data - xdp->data_hard_start;
metasize = xdp->data - xdp->data_meta;
/* data_meta may sit above data when unsupported; clamp to zero. */
metasize = metasize > 0 ? metasize : 0;
/* The xdp_frame header is stored inside the headroom itself, below any
 * metadata, so both must fit. */
if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
return -ENOSPC;
/* Catch if driver didn't reserve tailroom for skb_shared_info */
if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
XDP_WARN("Driver BUG: missing reserved tailroom");
return -ENOSPC;
}
/* Snapshot the buffer geometry into the frame descriptor. */
xdp_frame->data = xdp->data;
xdp_frame->len = xdp->data_end - xdp->data;
xdp_frame->headroom = headroom - sizeof(*xdp_frame);
xdp_frame->metasize = metasize;
xdp_frame->frame_sz = xdp->frame_sz;
xdp_frame->flags = xdp->flags;
return 0;
}
/* Convert an xdp_buff into a standalone xdp_frame.  The frame descriptor is
 * written into the top of the packet's own headroom, so no allocation is
 * needed.  Returns NULL when the headroom/tailroom checks fail.
 */
static inline
struct sk_buff *xdp_convert_buff_to_frame_doc_anchor(void);
static inline
struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
{
	struct xdp_frame *frame;

	/* XSK (AF_XDP) buffer-pool memory takes a dedicated conversion path. */
	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
		return xdp_convert_zc_to_xdp_frame(xdp);

	/* Store info in top of packet */
	frame = xdp->data_hard_start;
	if (unlikely(xdp_update_frame_from_buff(xdp, frame) < 0))
		return NULL;

	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
	frame->mem = xdp->rxq->mem;

	return frame;
}
以上代碼片段的主要處理邏輯:
- 將 xdp_buff 轉爲 xdp_frame。
- 最後調目標設備的 ndo_xdp_xmit() 函數將 xdp_frame 發送出去。
關於 XDP_REDIRECT 的更多講解,請看:
XDP_ABORTED and XDP_DROP on Mellanox
Mellanox 驅動對它們沒有複雜的處理邏輯:
rq->stats->xdp_drop++;
只是遞增了 xdp_drop
統計。
bpf_xdp_adjust_head()
// ${KERNEL}/net/core/filter.c
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
/* The xdp_frame header occupies the very start of the headroom; data may
 * never move down into it. */
void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp);
/* Metadata sits directly in front of data, so data must also stay above
 * the metadata region. */
void *data_start = xdp_frame_end + metalen;
void *data = xdp->data + offset;
/* Reject moves that underrun the headroom or leave less than an
 * Ethernet header of payload. */
if (unlikely(data < data_start ||
data > xdp->data_end - ETH_HLEN))
return -EINVAL;
/* Existing metadata is moved along with data, not overwritten. */
if (metalen)
memmove(xdp->data_meta + offset,
xdp->data_meta, metalen);
xdp->data_meta += offset;
xdp->data = data;
return 0;
}
注意其中一個細節:如果有 metadata,metadata 會被 memmove 而不會被覆蓋。
bpf_xdp_adjust_meta()
// ${KERNEL}/net/core/filter.c
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
void *meta = xdp->data_meta + offset;
unsigned long metalen = xdp->data - meta;
/* Not every driver supports a metadata area. */
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
/* meta must stay between the xdp_frame header and data. */
if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
/* Length/alignment restrictions on the metadata region. */
if (unlikely(xdp_metalen_invalid(metalen)))
return -EACCES;
xdp->data_meta = meta;
return 0;
}
更詳細的講解請看:
bpf_xdp_adjust_tail()
// ${KERNEL}/net/core/filter.c
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
/* Multi-buffer (fragmented) packets grow/shrink via the frag helpers. */
if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
if (offset < 0)
return bpf_xdp_frags_shrink_tail(xdp, -offset);
return bpf_xdp_frags_increase_tail(xdp, offset);
}
// ... (bounds checks against data_hard_end elided in this excerpt)
/* Clear memory area on grow, can contain uninit kernel memory */
if (offset > 0)
memset(xdp->data_end, 0, offset);
xdp->data_end = data_end;
return 0;
}
Q&A
Q:經過 XDP adjust 後的網絡包,能否 PASS 到內核?
A:可以。回頭看 XDP_PASS on Mellanox
的處理邏輯,在調 mlx5e_build_linear_skb()
構建 skb
時便處理好了 head
、meta
和 tail
。
Q:經過 XDP adjust 後的網絡包,在 REDIRECT 時會失去 meta
嗎?
A:不會。以 veth 虛擬設備爲例,veth 網卡驅動在將 xdp_frame
轉爲 skb
時,調 skb_metadata_set()
設置 meta
信息(意即,meta
信息可以跨設備傳遞):
// ${KERNEL}/net/core/xdp.c
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
struct net_device *dev)
{
/* NOTE(review): excerpt is elided ("// ..."). */
// ...
/* Part of headroom was reserved to xdpf */
headroom = sizeof(*xdpf) + xdpf->headroom;
/* Memory size backing xdp_frame data already have reserved
 * room for build_skb to place skb_shared_info in tailroom.
 */
frame_size = xdpf->frame_sz;
hard_start = xdpf->data - headroom;
skb = build_skb_around(skb, hard_start, frame_size);
if (unlikely(!skb))
return NULL;
skb_reserve(skb, headroom);
__skb_put(skb, xdpf->len);
/* Metadata survives the redirect: re-attach it to the new skb. */
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
// ...
return skb;
}
總結
將 XDP on Mellanox 研究透徹後,就不再害怕將 XDP 程序下發到 Mellanox 驅動去運行的各種 corner case 了。
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/ZVZuos2gU_IJHygFZkbOoA