linux 內核那些事之 struct page（長文）

struct page

page（頁）是 linux 內核管理物理內存的最小單位，內核將整個物理內存按照頁對齊方式劃分成千上萬個頁進行管理，內核爲了管理這些頁將每個頁抽象成 struct page 結構管理每個頁狀態及其他屬性，針對一個 4GB 內存，那麼將會存在上百萬個 struct page 結構。而 struct page 結構本身就佔有一定內存，如果 struct page 結構設計過大，那麼本身就會佔用較多內存，而給系統或者用戶可用的內存就較少，所以內核對 struct page 結構大小非常敏感，即使增加一個字節對系統影響也會非常大，故社區對 struct page 的結構做了嚴格設計，不會輕易增加字段：

One of these structures exists for every physical page in the system; on a 4GB system, there will be one million page structures. Given that every byte added to struct page is amplified a million times, it is not surprising that there is a strong motivation to avoid growing this structure at any cost. So struct page contains no less than three unions and is surrounded by complicated rules describing which fields are valid at which times. Changes to how this structure is accessed must be made with great care.

爲了減少 struct page 佔用空間大小，設計之初使用了很多技巧，其中之一就是使用 union 結構，在 5.8.10 版本中整個 struct page 使用了兩個較大 union 結構以節省內存，page 結構劃分如下幾塊：

可以看到在一個 64 位系統中，struct page 主要包含兩個 union 結構，大小分別爲 40 個字節和 4 個字節，這樣設計的目的主要是減少佔用空間。

除了使用 union 技術減少佔用空間之外，還使用了其他兩個技術其中一個就是對 flags 標誌的使用：

Unions are not the only technique used to shoehorn as much information as possible into this small structure. Non-uniform memory access (NUMA) systems need to track information on which node each page belongs to, and which zone within the node as well. Rather than add fields to struct page

在 NUMA 系統中爲了節省佔用空間，將 flags 頁標誌位中劃分出一部分給 node id 和 zone 使用，如下：

還有另外一個比較重要的技術就是複用，最典型的一個應用就是 list_head lru 鏈表，在 page 不同的時期及不同的用途，會指向不同的鏈表，以節省空間。

struct page 結構定義位於 include\linux\mm_types.h 文件中，5.8.10 版本定義如下：

struct page {
    unsigned long flags;        /* Atomic flags, some possibly
                     * updated asynchronously */
    /*
     * Five words (20/40 bytes) are available in this union.
     * WARNING: bit 0 of the first word is used for PageTail(). That
     * means the other users of this union MUST NOT use the bit to
     * avoid collision and false-positive PageTail().
     */
    union {
        struct {    /* Page cache and anonymous pages */
            /**
             * @lru: Pageout list, eg. active_list protected by
             * pgdat->lru_lock.  Sometimes used as a generic list
             * by the page owner.
             */
            struct list_head lru;
            /* See page-flags.h for PAGE_MAPPING_FLAGS */
            struct address_space *mapping;
            pgoff_t index;      /* Our offset within mapping. */
            /**
             * @private: Mapping-private opaque data.
             * Usually used for buffer_heads if PagePrivate.
             * Used for swp_entry_t if PageSwapCache.
             * Indicates order in the buddy system if PageBuddy.
             */
            unsigned long private;
        };
        struct {    /* page_pool used by netstack */
            /**
             * @dma_addr: might require a 64-bit value even on
             * 32-bit architectures.
             */
            dma_addr_t dma_addr;
        };
        struct {    /* slab, slob and slub */
            union {
                struct list_head slab_list;
                struct {    /* Partial pages */
                    struct page *next;
#ifdef CONFIG_64BIT
                    int pages;  /* Nr of pages left */
                    int pobjects;   /* Approximate count */
#else
                    short int pages;
                    short int pobjects;
#endif
                };
            };
            struct kmem_cache *slab_cache; /* not slob */
            /* Double-word boundary */
            void *freelist;     /* first free object */
            union {
                void *s_mem;    /* slab: first object */
                unsigned long counters;     /* SLUB */
                struct {            /* SLUB */
                    unsigned inuse:16;
                    unsigned objects:15;
                    unsigned frozen:1;
                };
            };
        };
        struct {    /* Tail pages of compound page */
            unsigned long compound_head;    /* Bit zero is set */

            /* First tail page only */
            unsigned char compound_dtor;
            unsigned char compound_order;
            atomic_t compound_mapcount;
        };
        struct {    /* Second tail page of compound page */
            unsigned long _compound_pad_1;  /* compound_head */
            atomic_t hpage_pinned_refcount;
            /* For both global and memcg */
            struct list_head deferred_list;
        };
        struct {    /* Page table pages */
            unsigned long _pt_pad_1;    /* compound_head */
            pgtable_t pmd_huge_pte; /* protected by page->ptl */
            unsigned long _pt_pad_2;    /* mapping */
            union {
                struct mm_struct *pt_mm; /* x86 pgds only */
                atomic_t pt_frag_refcount; /* powerpc */
            };
#if ALLOC_SPLIT_PTLOCKS
            spinlock_t *ptl;
#else
            spinlock_t ptl;
#endif
        };
        struct {    /* ZONE_DEVICE pages */
            /** @pgmap: Points to the hosting device page map. */
            struct dev_pagemap *pgmap;
            void *zone_device_data;
            /*
             * ZONE_DEVICE private pages are counted as being
             * mapped so the next 3 words hold the mapping, index,
             * and private fields from the source anonymous or
             * page cache page while the page is migrated to device
             * private memory.
             * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
             * use the mapping, index, and private fields when
             * pmem backed DAX files are mapped.
             */
        };

        /** @rcu_head: You can use this to free a page by RCU. */
        struct rcu_head rcu_head;
    };

    union {     /* This union is 4 bytes in size. */
        /*
         * If the page can be mapped to userspace, encodes the number
         * of times this page is referenced by a page table.
         */
        atomic_t _mapcount;

        /*
         * If the page is neither PageSlab nor mappable to userspace,
         * the value stored here may help determine what this page
         * is used for.  See page-flags.h for a list of page types
         * which are currently stored here.
         */
        unsigned int page_type;

        unsigned int active;        /* SLAB */
        int units;          /* SLOB */
    };

    /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
    atomic_t _refcount;

#ifdef CONFIG_MEMCG
    struct mem_cgroup *mem_cgroup;
#endif

    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;          /* Kernel virtual address (NULL if
                       not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
    int _last_cpupid;
#endif
} _struct_page_alignment;

該結構較大，需要按照上述分四塊進行分析。

page flag

page flags 標誌位主要採用 bit 位方式，來描述一個物理頁的狀態信息：

"Page flags" are simple bit flags describing the state of a page of physical memory. They are defined in <linux/page-flags.h>. Flags exist to mark "reserved" pages (kernel memory, I/O memory, or simply nonexistent), locked pages, those under writeback I/O, those which are part of a compound page, pages managed by the slab allocator, and more. Depending on the target architecture and kernel configuration options selected, there can be as many as 24 individual flags defined.

page flags 不僅僅劃分成標誌位使用，還劃分給了 section、node id、zone 使用，其劃分的形式和內存模型以及內核配置有關，在 include\linux\page-flags-layout.h 文件中描述了其主要 5 種劃分形式：

第一種形式爲非 sparse 內存模式或者 sparse vmemmap 內存模式如下：

上述形式是常見的 page flags 形式，其中從第 0 位到第 63 位（最高位）依次爲 FLAGS 位（真正的頁狀態標誌位）、中間剩餘的保留位，以及 ZONE 和 NODE 部分，其中 ZONE 代表着該 page 歸屬於的 zone 區域，而 NODE 在 NUMA 系統中代表着該 page 所屬於的 node 節點 id，如果是非 NUMA 系統則爲 0。中間剩餘的部分爲保留位。

而在上述第一種形式中如果開啓了 last_cpupid，則會開啓 LAST_CPUPID 字段，形式如下：

如果是開啓了非 vmemmap 的 sparse 內存模式，則需要增加 section 字段表示 page 所處於的 mem_section：

當然上述形式如果開啓了 last_cpupid，則劃分如下：

除了上述四種形式，sparse 還支持沒有 node id 形式來支持非 NUMA 系統：

上述幾種形式字段的大小以及偏移每個架構都有不同，內核中對每個字段都提供了 PGOFF 宏，方便統一計算，宏定義位於 (include\linux\mm.h) 文件中 :

/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
#define SECTIONS_PGOFF        ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF        (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF        (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF    (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF        (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)

除了 PGOFF 還提供了 PGSHIFT 定義，如果一個字段的寬度爲 0，則其 PGSHIFT 也爲 0：

#define SECTIONS_PGSHIFT    (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT        (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT        (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT    (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT    (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

各個字段的 MASK 定義如下：

    #define ZONEID_PGSHIFT        (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

    #define ZONES_MASK        ((1UL << ZONES_WIDTH) - 1)
    #define NODES_MASK        ((1UL << NODES_WIDTH) - 1)
    #define SECTIONS_MASK        ((1UL << SECTIONS_WIDTH) - 1)
    #define LAST_CPUPID_MASK    ((1UL << LAST_CPUPID_SHIFT) - 1)
    #define KASAN_TAG_MASK        ((1UL << KASAN_TAG_WIDTH) - 1)
    #define ZONEID_MASK        ((1UL << ZONEID_SHIFT) - 1)

可以看到上述幾個宏最終都依賴於各個字段的 WIDTH 宏，WIDTH 宏代表每個字段佔有多少位，如果某個字段不存在則該字段的 WIDTH 爲 0：

SECTIONS 字段操作

section 字段寬度 SECTIONS_WIDTH 定義如下：

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTIONS_WIDTH        SECTIONS_SHIFT
#else
#define SECTIONS_WIDTH        0
#endif

只有在配置內存模型爲 sparse 且不支持 vmemmap 時，SECTIONS_WIDTH 才爲非零，此時取決於 SECTIONS_SHIFT:

#define SECTIONS_SHIFT    (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)

MAX_PHYSMEM_BITS 和 SECTION_SIZE_BITS 與具體芯片架構有關

內核還將獲取或者設置 page section 封裝成函數，設置 page section 函數爲：

   /*
    * Store @section in the SECTIONS field of page->flags.
    *
    * The field is first cleared, then @section (masked to SECTIONS_MASK)
    * is shifted to SECTIONS_PGSHIFT and OR-ed in. Other bits of flags
    * are left untouched. The two updates are plain, non-atomic
    * read-modify-write operations.
    */
   static inline void set_page_section(struct page *page, unsigned long section)
    {
        page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
        page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
    }

ZONES 字段

ZONES_WIDTH 定義如下：

#define ZONES_WIDTH        ZONES_SHIFT

ZONES_SHIFT 定義與具體的 zone 最大數量 MAX_NR_ZONES 有關：

#if MAX_NR_ZONES < 2
#define ZONES_SHIFT 0
#elif MAX_NR_ZONES <= 2
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
#define ZONES_SHIFT 2
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif

設置 page 中 zone 操作函數如下：

    /*
     * Store @zone in the ZONES field of page->flags.
     *
     * Clears the field, then ORs in @zone masked to ZONES_MASK and
     * shifted to ZONES_PGSHIFT. Bits outside the field are preserved.
     */
    static inline void set_page_zone(struct page *page, enum zone_type zone)
    {
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
        page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
    }

以及獲取 page 所屬 zone（struct zone 指針）的接口：

    /*
     * Return a pointer to the struct zone that @page belongs to.
     *
     * The node is looked up with NODE_DATA(page_to_nid(page)) and the
     * zone is the entry of that node's node_zones[] array indexed by
     * page_zonenum(page).
     */
    static inline struct zone *page_zone(const struct page *page)
    {
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
    }

NODES 字段操作

node 節點寬度 NODES_WIDTH 定義如下：

#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH        NODES_SHIFT
#else
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#error "Vmemmap: No space for nodes field in page flags"
#endif
#define NODES_WIDTH        0
#endif

這裏做了檢查防止使用的 bit 位總和超過 page flags 中可用的位數（BITS_PER_LONG - NR_PAGEFLAGS），如果沒有超過則使用 NODES_SHIFT 配置：

#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT     CONFIG_NODES_SHIFT
#else
#define NODES_SHIFT     0
#endif

NODES_SHIFT 的大小可以通過內核 CONFIG_NODES_SHIFT 來配置。

獲取 page 中的 node 字段操作如下：

/*
 * Return the node id stored in the NODES field of @page's flags.
 *
 * The const qualifier is cast away so the pointer can be handed to
 * PF_POISONED_CHECK() (defined elsewhere); the flags word of the page
 * it yields is shifted right by NODES_PGSHIFT and masked with
 * NODES_MASK.
 */
static inline int page_to_nid(const struct page *page)
{
    struct page *p = (struct page *)page;

    return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}

LAST_CPUPID 字段操作

last cpu pid 字段的寬度宏 LAST_CPUPID_WIDTH（上文 LAST_CPUPID_PGOFF 中使用）取決於 LAST_CPUPID_SHIFT，LAST_CPUPID_SHIFT 定義如下：

#ifdef CONFIG_NUMA_BALANCING
#define LAST__PID_SHIFT 8
#define LAST__PID_MASK  ((1 << LAST__PID_SHIFT)-1)

#define LAST__CPU_SHIFT NR_CPUS_BITS
#define LAST__CPU_MASK  ((1 << LAST__CPU_SHIFT)-1)

#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT)
#else
#define LAST_CPUPID_SHIFT 0
#endif

需要開啓 CONFIG_NUMA_BALANCING 宏，才支持。且 LAST_CPUPID_SHIFT 等於 LAST__PID_SHIFT 與 LAST__CPU_SHIFT（即 NR_CPUS_BITS）之和。

獲取 last cpu pid:

/*
 * Return the LAST_CPUPID field of @page's flags: the flags word is
 * shifted right by LAST_CPUPID_PGSHIFT and masked with
 * LAST_CPUPID_MASK.
 */
static inline int page_cpupid_last(struct page *page)
{
    return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}

reset last cpu pid:

/*
 * Reset the LAST_CPUPID field of @page's flags.
 *
 * "Reset" here means setting every bit of the field to 1: the mask is
 * shifted into position and OR-ed into flags, so no bit is cleared.
 */
static inline void page_cpupid_reset_last(struct page *page)
{
    page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}

struct page 成員詳細描述

管理 struct page 各個字段詳細描述：

flags 字段

flags 字段的各個標誌位基本都是固定的，每個標誌佔一個 bit 位，統一在 include\linux\page-flags.h 中進行管理，該頭文件中對 page flags 有一個詳細的描述，其中有一段：

 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *

page flags 主要劃分爲兩段，其中以 NR_PAGEFLAGS 爲分水線，NR_PAGEFLAGS 以下（從低位向上）爲真正的標誌位區域，NR_PAGEFLAGS 以上（從高位向下）的稱之爲可擴展部分：

（此處原文爲 page flags 佈局示意圖，轉碼後圖片丟失。）

爲了方便進行標誌位置位、清零以及查看是否置位等操作，內核做了一系列的宏定義，看起來比較複雜：

/*
 * Macros to create function definitions for page flags
 */
#define TESTPAGEFLAG(uname, lname, policy)                \
static __always_inline int Page##uname(struct page *page)        \
    { return test_bit(PG_##lname, &policy(page, 0)->flags); }

#define SETPAGEFLAG(uname, lname, policy)                \
static __always_inline void SetPage##uname(struct page *page)        \
    { set_bit(PG_##lname, &policy(page, 1)->flags); }

#define CLEARPAGEFLAG(uname, lname, policy)                \
static __always_inline void ClearPage##uname(struct page *page)        \
    { clear_bit(PG_##lname, &policy(page, 1)->flags); }

#define __SETPAGEFLAG(uname, lname, policy)                \
static __always_inline void __SetPage##uname(struct page *page)        \
    { __set_bit(PG_##lname, &policy(page, 1)->flags); }

#define __CLEARPAGEFLAG(uname, lname, policy)                \
static __always_inline void __ClearPage##uname(struct page *page)    \
    { __clear_bit(PG_##lname, &policy(page, 1)->flags); }

#define TESTSETFLAG(uname, lname, policy)                \
static __always_inline int TestSetPage##uname(struct page *page)    \
    { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }

#define TESTCLEARFLAG(uname, lname, policy)                \
static __always_inline int TestClearPage##uname(struct page *page)    \
    { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }

#define PAGEFLAG(uname, lname, policy)                    \
    TESTPAGEFLAG(uname, lname, policy)              \
    SETPAGEFLAG(uname, lname, policy)               \
    CLEARPAGEFLAG(uname, lname, policy)

#define __PAGEFLAG(uname, lname, policy)                \
    TESTPAGEFLAG(uname, lname, policy)              \
    __SETPAGEFLAG(uname, lname, policy)             \
    __CLEARPAGEFLAG(uname, lname, policy)

#define TESTSCFLAG(uname, lname, policy)                \
    TESTSETFLAG(uname, lname, policy)               \
    TESTCLEARFLAG(uname, lname, policy)

#define TESTPAGEFLAG_FALSE(uname)                    \
static inline int Page##uname(const struct page *page) { return 0; }

#define SETPAGEFLAG_NOOP(uname)                        \
static inline void SetPage##uname(struct page *page) {  }

#define CLEARPAGEFLAG_NOOP(uname)                    \
static inline void ClearPage##uname(struct page *page) {  }

#define __CLEARPAGEFLAG_NOOP(uname)                    \
static inline void __ClearPage##uname(struct page *page) {  }

#define TESTSETFLAG_FALSE(uname)                    \
static inline int TestSetPage##uname(struct page *page) { return 0; }

#define TESTCLEARFLAG_FALSE(uname)                    \
static inline int TestClearPage##uname(struct page *page) { return 0; }

#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)            \
    SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)

#define TESTSCFLAG_FALSE(uname)                        \
    TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)

__PAGEFLAG(Locked, locked, PF_NO_TAIL)
PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
PAGEFLAG(Referenced, referenced, PF_HEAD)
    TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
    __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
    __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
    TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
    TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)       /* Used by some filesystems */

/* Xen */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
    TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
    TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
    __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
    __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
    __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
    __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause releasepage() and co to be invoked
 */
PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
    __CLEARPAGEFLAG(Private, private, PF_ANY)
PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
    TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)

/*
 * Only test-and-set exist for PG_writeback.  The unconditional operators are
 * risky: they bypass page accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
    TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)

/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
    TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
    TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)

將上述一系列宏展開，主要有以下三種：

將頁面置位統一命名爲 SetPageXXX，其中 XXX 對應宏的 uname 參數，被操作的標誌位則是 PG_ 加上小寫的 lname 參數，例如 SetPageLRU 設置的是 PG_lru 標誌位，SetPageDirty 設置的是 PG_dirty 標誌位
ClearPageXXX 是將相應的標誌位清空
PageXXX，用於檢查頁面是否設置了該標誌位。

關於 page flags 爭論

關於 struct page flags 一直都有一個巨大爭論就是 flags 是否足夠？

由於 flags 除了劃分真正的狀態標誌位還需要劃分給 node，section、zone 使用，這樣 flags 顯得就很擁擠，關於這一討論可以詳細瞭解下：How many page flags do we really have?

第一個 Union

struct page 第一個 union 在 64 位系統下是一個 40 字節的聯合體，裏面包含 8 個部分，是記錄 page 的主要功能數據，每個部分都有個結構體進行說明

Page cache and anonymous pages

第一個結構體主要是對匿名頁和 page cache 的主要功能數據，主要結構成員如下：

    struct {    /* Page cache and anonymous pages */
        /**
         * @lru: Pageout list, eg. active_list protected by
         * pgdat->lru_lock.  Sometimes used as a generic list
         * by the page owner.
         */
        struct list_head lru;
        /* See page-flags.h for PAGE_MAPPING_FLAGS */
        struct address_space *mapping;
        pgoff_t index;      /* Our offset within mapping. */
        /**
         * @private: Mapping-private opaque data.
         * Usually used for buffer_heads if PagePrivate.
         * Used for swp_entry_t if PageSwapCache.
         * Indicates order in the buddy system if PageBuddy.
         */
        unsigned long private;
    };

主要成員說明：

struct list_head lru：爲 LRU 鏈表，該鏈表會根據頁面不同的用途掛載到不同的鏈表，如在空閒時刻，被 buddy 系統管理時，會掛接到 buddy 的 free 鏈表中。如果頁面被分配，則會根據頁面的激活狀態，掛接到 active list 鏈表中。

struct address_space *mapping：當頁面被映射時指向映射的地址空間。當爲匿名映射時，mapping 實際上指向的是 struct anon_vma * 結構。當爲文件映射時，mapping 指向的是 struct address_space * 結構。如果要判斷當前頁面是匿名映射還是文件映射，使用 PageAnon() 函數判斷，如果 mapping 低位中的 PAGE_MAPPING_ANON 標誌位被置位則爲匿名映射。以下爲 mapping 的幾個關鍵函數：

/*
 * Return non-zero if the PAGE_MAPPING_ANON bit is set in the mapping
 * pointer of @page. The test is performed on compound_head(page), so
 * a tail page is answered on behalf of its head page.
 */
static __always_inline int PageAnon(struct page *page)
{
    page = compound_head(page);
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

/*
 * Return the anon_vma associated with @page, or NULL.
 *
 * Works on compound_head(page). NULL is returned unless the
 * PAGE_MAPPING_FLAGS bits of page->mapping are exactly
 * PAGE_MAPPING_ANON; otherwise the result of __page_rmapping(page)
 * (defined elsewhere) is returned.
 */
struct anon_vma *page_anon_vma(struct page *page)
{
    unsigned long mapping;

    page = compound_head(page);
    mapping = (unsigned long)page->mapping;
    if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
        return NULL;
    return __page_rmapping(page);
}

/*
 * Return the address_space that @page belongs to, or NULL.
 *
 * Works on compound_head(page). Results, in the order they are checked:
 *   - slab page:        NULL
 *   - swap cache page:  swap_address_space() of the swp_entry_t whose
 *                       value is page_private(page)
 *   - PAGE_MAPPING_ANON bit set in page->mapping: NULL
 *   - otherwise:        page->mapping with the PAGE_MAPPING_FLAGS bits
 *                       masked off
 */
struct address_space *page_mapping(struct page *page)
{
    struct address_space *mapping;

    page = compound_head(page);

    /* This happens if someone calls flush_dcache_page on slab page */
    if (unlikely(PageSlab(page)))
        return NULL;

    if (unlikely(PageSwapCache(page))) {
        swp_entry_t entry;

        /* For swap cache pages the private field holds the swap entry. */
        entry.val = page_private(page);
        return swap_address_space(entry);
    }

    mapping = page->mapping;
    if ((unsigned long)mapping & PAGE_MAPPING_ANON)
        return NULL;

    /* Strip the flag bits that are encoded in the low bits of the pointer. */
    return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}

pgoff_t index：該字段是一個複用字段。當該頁面被文件映射時，代表在 mapping 中的偏移量。當頁面位於 per-cpu 空閒鏈表中時，保存的是頁遷移類型 migratetype(見 set_pcppage_migratetype() 函數）。
unsigned long private：私有數據

page_pool used by netstack

如果該頁被用作 DMA 映射，dma_addr_t 則代表的是映射的一個總線地址：

struct {    /* page_pool used by netstack */
        /**
         * @dma_addr: might require a 64-bit value even on
         * 32-bit architectures.
         */
        dma_addr_t dma_addr;
    };

slab, slob and slub

該頁面被 slab/slob/slub 所管理分配，即已經被 buddy 分配出去，進一步做小內存分配管理：

struct {    /* slab, slob and slub */
            union {
                struct list_head slab_list;
                struct {    /* Partial pages */
                    struct page *next;
#ifdef CONFIG_64BIT
                    int pages;  /* Nr of pages left */
                    int pobjects;   /* Approximate count */
#else
                    short int pages;
                    short int pobjects;
#endif
                };
            };
            struct kmem_cache *slab_cache; /* not slob */
            /* Double-word boundary */
            void *freelist;     /* first free object */
            union {
                void *s_mem;    /* slab: first object */
                unsigned long counters;     /* SLUB */
                struct {            /* SLUB */
                    unsigned inuse:16;
                    unsigned objects:15;
                    unsigned frozen:1;
                };
            };
        };

主要結構說明：

struct list_head slab_list: 指向的是 slab list 鏈表
struct page *next：在 slub 中分配使用
struct kmem_cache *slab_cache：指向的是 slab 緩存描述符
void *freelist：指向的是第一個空閒的 object。當一個頁被 buddy 分配出去由 slab 進行管理時，會將該內存劃分成相應大小的等份數組即 object 進行分配管理。freelist 指向的是第一個空閒的位置
void *s_mem：指向第一個 slab 對象的起始地址
unsigned long counters：被 slub 用作計數

Tail pages of compound page

該結構描述的是 compound pages 的 tail page，即除 head page 之外的所有頁，而不僅僅是最後一個頁 (特別注意雖然名爲 tail pages，但 compound_head() 對 head page 也會讀取該結構中的 compound_head 字段）。至於 compound pages 的主要功能有一段描述：

A compound page is simply a grouping of two or more physically contiguous pages into a unit that can, in many ways, be treated as a single, larger page. They are most commonly used to create huge pages

compound page 將多個連續的物理頁組裝聯合在一起組成一個更大頁，其最大的用途是可以創建一個 huge 頁，具體介紹可以參考：An introduction to compound pages [LWN.net]

此時該結構主要描述的是 compound page 的 tail page:

struct {    /* Tail pages of compound page */
    unsigned long compound_head;    /* Bit zero is set */

    /* First tail page only */
    unsigned char compound_dtor;
    unsigned char compound_order;
    atomic_t compound_mapcount;
};

主要結構成員：

unsigned long compound_head：指向 compound pages 的 head 頁。除了 head 頁之外，其他頁都是 tail 頁；tail 頁的 compound_head 保存 head 頁的地址並將 bit 0 置 1，據此可以判定該頁是 compound pages 的 tail 頁，可以看 PageTail 函數。head 頁自身該字段的 bit 0 不會被置位。獲取 compound pages 的 head 頁接口爲 compound_head()。
unsigned char compound_dtor：a destructor，每個 compound pages 都保存對應的 destructor，釋放該頁時通過 compound_dtor 從 compound_page_dtors 中獲取到對應的釋放函數，可以參見 destroy_compound_page() 函數：

/*
 * Invoke the destructor registered for the compound page headed by @page.
 *
 * The destructor index is read from page[1].compound_dtor and used to
 * pick a function from compound_page_dtors[], which is then called with
 * @page. VM_BUG_ON_PAGE() checks that the index is below
 * NR_COMPOUND_DTORS before the table is indexed.
 */
static inline void destroy_compound_page(struct page *page)
{
    VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
    compound_page_dtors[page[1].compound_dtor](page);
}

對應的 compound_page_dtors 定義如下

compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
    [NULL_COMPOUND_DTOR] = NULL,
    [COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
    [HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};

unsigned char compound_order：與 compound_dtor 一樣只保存在第一個 tail page（page[1]）中（見結構中 "First tail page only" 註釋），記錄 compound pages 的 order，即整個 compound pages 由 2^order 個頁組成
atomic_t compound_mapcount: compound page 作爲整體被用戶進程頁表映射的計數。當 page 屬於 compound pages 時，獲取映射計數不再只從第二個 union 結構中的 _mapcount 獲取，而是先從該字段中獲取。

Second tail page of compound page

爲了節省整個 struct page 空間，除了上述定義 compound tail 結構之外，還定義了第二種 compound tail page 結構用於擴展：

    struct {    /* Second tail page of compound page */
            unsigned long _compound_pad_1;  /* compound_head */
            atomic_t hpage_pinned_refcount;
            /* For both global and memcg */
            struct list_head deferred_list;
        };

Page table pages

該結構主要用於 page table, 結構成員如下：

struct {    /* Page table pages */
            unsigned long _pt_pad_1;    /* compound_head */
            pgtable_t pmd_huge_pte; /* protected by page->ptl */
            unsigned long _pt_pad_2;    /* mapping */
            union {
                struct mm_struct *pt_mm; /* x86 pgds only */
                atomic_t pt_frag_refcount; /* powerpc */
            };
#if ALLOC_SPLIT_PTLOCKS
            spinlock_t *ptl;
#else
            spinlock_t ptl;
#endif

ZONE_DEVICE pages

當該頁面屬於 ZONE_DEVICE 時：

   struct {    /* ZONE_DEVICE pages */
            /** @pgmap: Points to the hosting device page map. */
            struct dev_pagemap *pgmap;
            void *zone_device_data;
            /*
             * ZONE_DEVICE private pages are counted as being
             * mapped so the next 3 words hold the mapping, index,
             * and private fields from the source anonymous or
             * page cache page while the page is migrated to device
             * private memory.
             * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
             * use the mapping, index, and private fields when
             * pmem backed DAX files are mapped.
             */
        };

rcu_head

rcu_head 用於通過 RCU 機制延遲釋放頁面（見結構體中的註釋：You can use this to free a page by RCU）：

struct rcu_head rcu_head;

第二個 Union

struct page 的第二個 union 大小爲 4 個字節，主要成員包括如下：

union {        /* This union is 4 bytes in size. */
        /*
         * If the page can be mapped to userspace, encodes the number
         * of times this page is referenced by a page table.
         */
        atomic_t _mapcount;

        /*
         * If the page is neither PageSlab nor mappable to userspace,
         * the value stored here may help determine what this page
         * is used for.  See page-flags.h for a list of page types
         * which are currently stored here.
         */
        unsigned int page_type;

        unsigned int active;        /* SLAB */
        int units;          /* SLOB */
    };

atomic_t _mapcount:（_mapcount is the number of page-table entries pointing to the page）有多少個 page table 映射指向該頁面。每個用戶進程都擁有各自獨立的虛擬空間（64 位系統用戶空間一般有 256TB) 以及擁有獨立的頁表，所以有可能出現多個用戶進程空間同時映射到一個物理頁面情況。該計數代表被映射到多少個用戶進程。_mapcount 爲 -1，則代表沒有被 PTE 映射。等於 0 時表示只被一個進程（父進程）映射，當大於 0 時代表除了父進程還有其他進程映射了這個頁面。與該計數有關的一個重要特性就是 RMAP。判斷頁面是否被映射的函數爲 page_mapped，特別要說明的是如果該頁是 compound page 則先從 compound_mapcount 中判斷。

    /*
     * Return true if @page is mapped, i.e. a relevant map count is >= 0.
     *
     * Non-compound page: decided by its own _mapcount.
     * Compound page: true if the compound map count of the head page is
     * >= 0; otherwise false for PageHuge() pages, and for the remaining
     * compound pages true if any of the compound_nr() sub-pages has a
     * _mapcount >= 0.
     */
    bool page_mapped(struct page *page)
    {
        int i;

        if (likely(!PageCompound(page)))
            return atomic_read(&page->_mapcount) >= 0;
        page = compound_head(page);
        if (atomic_read(compound_mapcount_ptr(page)) >= 0)
            return true;
        if (PageHuge(page))
            return false;
        /* Check every sub-page of the compound page individually. */
        for (i = 0; i < compound_nr(page); i++) {
            if (atomic_read(&page[i]._mapcount) >= 0)
                return true;
        }
        return false;

    }

unsigned int page_type ：如果該頁面既不是 PageSlab 也不能映射到 user space，則該字段代表頁面類型即使用用途
unsigned int active ：表示 slab 中活躍對象
int units ：被 slob 使用

page type

page type 用於表示一個物理頁 page 使用類型，主要支持的 page type 如下：

#define PG_buddy    0x00000080
#define PG_offline    0x00000100
#define PG_kmemcg    0x00000200
#define PG_table    0x00000400
#define PG_guard    0x00000800

採用 bit map 表示方法:

PG_buddy：page 是否位於 buddy
PG_offline: page 是否處於離線（offline）狀態
PG_kmemcg：page 爲 kmemcg 使用
PG_table：page 作爲 page table 使用
PG_guard：page 作爲 guard 使用。

定義位於 include\linux\page-flags.h 文件中，該文件還定義了一系列宏：

/*
 * True when, in page->page_type, all PAGE_TYPE_BASE bits are set and
 * the @flag bit is clear. A page type is therefore "present" when its
 * bit is 0.
 */
#define PageType(page, flag)                        \
    ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)

/*
 * Return non-zero if page_type, interpreted as a signed int, is below
 * PAGE_MAPCOUNT_RESERVE.
 */
static inline int page_has_type(struct page *page)
{
    return (int)page->page_type < PAGE_MAPCOUNT_RESERVE;
}

/*
 * Generate the three accessors for page type PG_<lname>:
 *
 *   Page<uname>()        - test via PageType(page, PG_<lname>)
 *   __SetPage<uname>()   - mark the type by CLEARING the PG_<lname> bit
 *                          in page_type; asserts PageType(page, 0) first
 *   __ClearPage<uname>() - unmark the type by SETTING the PG_<lname> bit;
 *                          asserts Page<uname>(page) first
 *
 * Note the inverted encoding: "set" clears the bit, "clear" sets it.
 */
#define PAGE_TYPE_OPS(uname, lname)                    \
static __always_inline int Page##uname(struct page *page)        \
{                                    \
    return PageType(page, PG_##lname);              \
}                                    \
static __always_inline void __SetPage##uname(struct page *page)        \
{                                    \
    VM_BUG_ON_PAGE(!PageType(page, 0), page);           \
    page->page_type &= ~PG_##lname;                 \
}                                    \
static __always_inline void __ClearPage##uname(struct page *page)    \
{                                    \
    VM_BUG_ON_PAGE(!Page##uname(page), page);           \
    page->page_type |= PG_##lname;                  \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 */
PAGE_TYPE_OPS(Buddy, buddy)

PAGE_TYPE_OPS(Offline, offline)

PAGE_TYPE_OPS(Kmemcg, kmemcg)

PAGE_TYPE_OPS(Table, table)

PAGE_TYPE_OPS(Guard, guard)

例如以 PG_buddy 爲用例：

PageBuddy: 判斷該頁是否爲 PG_buddy 類型，是則返回 true，否則返回 false。
__SetPageBuddy: 將該頁標記爲 PG_buddy 類型（實現上是將 page_type 中的 PG_buddy 位清零）。
__ClearPageBuddy: 清除該頁的 PG_buddy 類型標記（實現上是將 page_type 中的 PG_buddy 位重新置 1）。

_refcount

_refcount 被用作引用計數管理，用於跟蹤內存使用狀況。初始化爲空閒狀態時計數爲 0，當被分配引用時計數會 + 1，如果該頁面被其他引用時也會 + 1。

特別要注意：如果該頁是一個 compound page，則計數只會記錄在 head page 中。內核中常用的計數函數：get_page() 計數 + 1 函數接口。put_page() 計數 - 1 接口：

/*
 * Take one reference on @page. For a compound page the reference is
 * taken on the head page (compound_head()), via page_ref_inc().
 */
static inline void get_page(struct page *page)
{
    page = compound_head(page);
    /*
     * Getting a normal page or the head of a compound page
     * requires to already have an elevated page->_refcount.
     */
    VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
    page_ref_inc(page);
}

/*
 * Drop one reference on @page (on the head page for compound pages).
 *
 * devmap-managed pages are handed to put_devmap_managed_page() and the
 * function returns early. For all other pages __put_page() is called
 * when put_page_testzero() reports true.
 */
static inline void put_page(struct page *page)
{
    page = compound_head(page);

    /*
     * For devmap managed pages we need to catch refcount transition from
     * 2 to 1, when refcount reach one it means the page is free and we
     * need to inform the device driver through callback. See
     * include/linux/memremap.h and HMM for details.
     */
    if (page_is_devmap_managed(page)) {
        put_devmap_managed_page(page);
        return;
    }

    if (put_page_testzero(page))
        __put_page(page);
}

上述兩個函數首先是調用 compound_head() 函數，如果是 compound page 則獲取首頁，如果不是則返回該頁：

/*
 * Return the head page of the compound page containing @page, or @page
 * itself when bit 0 of its compound_head word is clear.
 *
 * When bit 0 is set, the word with that bit removed (head - 1) is
 * returned as the struct page pointer of the head page. The word is
 * read once with READ_ONCE().
 */
static inline struct page *compound_head(struct page *page)
{
    unsigned long head = READ_ONCE(page->compound_head);

    if (unlikely(head & 1))
        return (struct page *) (head - 1);
    return page;
}

最終都會調用到以下幾個函數：

/*
 * Return the current _refcount of @page; for a compound page the count
 * of the head page is read.
 */
static inline int page_count(struct page *page)
{
    return atomic_read(&compound_head(page)->_refcount);
}

/*
 * Set the _refcount of @page to @v. If the page_ref_set tracepoint is
 * active, __page_ref_set() is called afterwards with the same values.
 * Operates on @page itself; compound_head() is not applied here.
 */
static inline void set_page_count(struct page *page, int v)
{
    atomic_set(&page->_refcount, v);
    if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
        __page_ref_set(page, v);
}

/* Initialise the reference count of @page to 1 via set_page_count(). */
static inline void init_page_count(struct page *page)
{
    set_page_count(page, 1);
}

/*
 * Atomically add @nr to the _refcount of @page. If the page_ref_mod
 * tracepoint is active, __page_ref_mod() is called with @nr.
 */
static inline void page_ref_add(struct page *page, int nr)
{
    atomic_add(nr, &page->_refcount);
    if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
        __page_ref_mod(page, nr);
}

....
/*
 * Atomically increment the _refcount of @page by one. If the
 * page_ref_mod tracepoint is active, __page_ref_mod() is called with 1.
 */
static inline void page_ref_inc(struct page *page)
{
    atomic_inc(&page->_refcount);
    if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
        __page_ref_mod(page, 1);
}

/*
 * Atomically decrement the _refcount of @page by one. If the
 * page_ref_mod tracepoint is active, __page_ref_mod() is called with -1.
 * No action is taken here if the count reaches zero.
 */
static inline void page_ref_dec(struct page *page)
{
    atomic_dec(&page->_refcount);
    if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
        __page_ref_mod(page, -1);
}
...

關於 page refcount 引用計數最終的函數定義及實現在 include\linux\page_ref.h 文件中。

compound pages VS ordinary allocations

compound pages 在申請內存時和正常的內存申請一樣，都是使用 order 申請一塊連續的內存，在內核中大部分的情況下是用不到 compound 特性的。如果該 pages 作爲 compound pages，那麼這些 pages 將被看作一個整體，如果有些模塊想操作 compound pages 中的一個頁，相當於操作整個 compound pages，不能被分裂。釋放時也是同樣如此，作爲一個整體釋放：

In most cases, compound pages are unnecessary and ordinary allocations can be used; calling code needs to remember how many pages it allocated, but otherwise the metadata that would be stored in a compound page is unneeded. A compound page is indicated, though, whenever it is important to treat the group of pages as a whole even if somebody references a single page within it. Transparent huge pages are a classic example; if user space attempts to change the protections on a portion of a huge page, the entire huge page will need to be located and broken up. Various drivers also use compound pages to ease the management of larger buffers.

struct page 歷史演進

struct page 作爲內核中管理物理內存的一個重要數據結構，經歷過多次整體技術演進，一個重要目的就是減小 struct page 結構的尺寸，儘量在較小的尺寸內表達儘量多的信息，所以該結構中大量使用了複用技術，需要了解每個結構的具體演進，才能不至於看代碼看得雲裏霧裏，其中一個比較重要的整改說明文章可以參考:《Cramming more into struct page》Cramming more into struct page [LWN.net]

參考資料

Cramming more into struct page [LWN.net]

How many page flags do we really have? [LWN.net]

An introduction to compound pages [LWN.net]

Minimizing the use of tail pages [LWN.net]

本文由 Readfog 進行 AMP 轉碼，版權歸原作者所有。
來源：https://mp.weixin.qq.com/s/LsPUS_23nqTmZjjgKjog_Q