/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 *
 * The fallback below ignores its argument and evaluates to 0.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)	(0)
#endif
/* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h>
*/ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped.
*/ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) staticinlinevoid __mm_zero_struct_page(struct page *page)
{ unsignedlong *_pp = (void *)page;
/* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
BUILD_BUG_ON(sizeof(struct page) & 7);
BUILD_BUG_ON(sizeof(struct page) < 56);
BUILD_BUG_ON(sizeof(struct page) > 96);
/*
 * Default maximum number of active map areas, this limits the number of vmas
 * per mm struct. Users can overwrite this number by sysctl but there is a
 * problem.
 *
 * When a program's coredump is generated as ELF format, a section is created
 * per vma. In ELF, the number of sections is represented in unsigned short.
 * This means the number of sections should be smaller than 65535 at coredump.
 * Because the kernel adds some informative sections to an image of the program
 * when generating a coredump, we need some margin. The number of extra
 * sections is 1-3 now and depends on arch. We use "5" as safe margin, here.
 *
 * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
 * is not a hard limit any more. Although some userspace tools can be surprised
 * by that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN	(5)
#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* to align the pointer to the (prev) page boundary */
#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)

/*
 * test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE;
 * the argument is cast to unsigned long so pointers can be passed directly.
 */
#define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
/* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions).
*/
/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE		0x00000000

#define VM_READ		0x00000001	/* currently active flags */
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
#define VM_MAYWRITE	0x00000020
#define VM_MAYEXEC	0x00000040
#define VM_MAYSHARE	0x00000080

#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
#ifdef CONFIG_MMU
#define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */
#else /* CONFIG_MMU */
#define VM_MAYOVERLAY	0x00000200	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
#define VM_UFFD_MISSING	0
#endif /* CONFIG_MMU */
#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */

#define VM_LOCKED	0x00002000
#define VM_IO		0x00004000	/* Memory mapped I/O or similar */

/* Used by sys_madvise() */
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

#define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
#define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
#define VM_SYNC		0x00800000	/* Synchronous page faults */
#define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
#define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
#define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY	0
#endif

#define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE	BIT(31)		/* KSM may merge identical pages */
/*
 * Bit numbers and masks for vm_flags bits 32-38. These are only defined
 * when CONFIG_ARCH_USES_HIGH_VMA_FLAGS is set.
 */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_6	38	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
#define VM_HIGH_ARCH_6	BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
 * support core mm.
 *
 * These VMAs will get a single end guard page. This helps userspace protect
 * itself from attacks. A single page is enough for current shadow stack archs
 * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
 * for more details on the guard size.
 */
# define VM_SHADOW_STACK	VM_HIGH_ARCH_5
#endif
#if defined(CONFIG_ARM64_GCS)
/*
 * arm64's Guarded Control Stack implements similar functionality and
 * has similar constraints to shadow stacks.
 */
# define VM_SHADOW_STACK	VM_HIGH_ARCH_6
#endif
/*
 * This flag is used to connect VFIO to arch specific KVM code. It
 * indicates that the memory under this VMA is safe for use with any
 * non-cachable memory type inside KVM. Some VFIO devices, on some
 * platforms, are thought to be unsafe and can cause machine crashes
 * if KVM does not lock down the memory type.
 *
 * Uses bit 39 on CONFIG_64BIT builds; otherwise it is VM_NONE.
 */
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED_BIT	39
#define VM_ALLOW_ANY_UNCACHED		BIT(VM_ALLOW_ANY_UNCACHED_BIT)
#else
#define VM_ALLOW_ANY_UNCACHED		VM_NONE
#endif
/*
 * Special vmas that are non-mergable, non-mlock()able.
 */
#define VM_SPECIAL \
	(VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED \
	(VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit its parent */
#define VM_INIT_DEF_MASK	VM_NOHUGEPAGE

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK \
	(VM_LOCKED | VM_LOCKONFAULT)
/*
 * Arch-specific flags to clear when updating VM flags on protection change.
 * VM_ARCH_CLEAR falls back to VM_NONE when it has not already been defined.
 */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR	VM_NONE
#endif
#define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */
/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers: the OR of FAULT_FLAG_ALLOW_RETRY,
 * FAULT_FLAG_KILLABLE and FAULT_FLAG_INTERRUPTIBLE.
 */
#define FAULT_FLAG_DEFAULT \
	(FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | FAULT_FLAG_INTERRUPTIBLE)
/** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise.
*/ staticinlinebool fault_flag_allow_retry_first(enum fault_flag flags)
{ return (flags & FAULT_FLAG_ALLOW_RETRY) &&
(!(flags & FAULT_FLAG_TRIED));
}
/* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible.
*/ struct vm_fault { conststruct { struct vm_area_struct *vma; /* Target VMA */
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */ unsignedlong address; /* Faulting virtual address - masked */ unsignedlong real_address; /* Faulting virtual address - unmasked */
}; enum fault_flag flags; /* FAULT_FLAG_xxx flags
* XXX: should really be 'const' */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
pud_t *pud; /* Pointer to pud entry matching * the 'address'
*/ union {
pte_t orig_pte; /* Value of PTE at the time of fault */
pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only.
*/
};
struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR).
*/ /* These three entries are valid only while holding ptl lock */
pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated.
*/
spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd.
*/
pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context.
*/
};
/* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs.
*/ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock.
*/ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsignedlong addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed.
*/ int (*mprotect)(struct vm_area_struct *vma, unsignedlong start, unsignedlong end, unsignedlong newflags);
vm_fault_t (*fault)(struct vm_fault *vmf);
vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsignedint order);
vm_fault_t (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff); unsignedlong (*pagesize)(struct vm_area_struct * area);
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
/* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping.
*/ int (*access)(struct vm_area_struct *vma, unsignedlong addr, void *buf, int len, int write);
/* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this
* vma to be dumped unconditionally. */ constchar *(*name)(struct vm_area_struct *vma);
#ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy.
*/ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
/* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy.
*/ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsignedlong addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page.
*/ struct page *(*find_special_page)(struct vm_area_struct *vma, unsignedlong addr);
};
/*
 * These must be here rather than mmap_lock.h as dependent on vm_fault type,
 * declared in this header.
 */
#ifdef CONFIG_PER_VMA_LOCK
/*
 * NOTE(review): the #else/#endif matching this #ifdef is not visible in this
 * part of the file - confirm it is still present further down.
 */

/*
 * Drop whichever lock the fault was taken under: the per-VMA read lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, otherwise the mmap read lock of
 * the VMA's mm.
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
		vma_end_read(vmf->vma);
	else
		mmap_read_unlock(vmf->vma->vm_mm);
}
/*
 * Use when VMA is not part of the VMA tree and needs no locking.
 * Overwrites the VMA's private __vm_flags field with @flags.
 */
static inline void vm_flags_init(struct vm_area_struct *vma,
				 vm_flags_t flags)
{
	ACCESS_PRIVATE(vma, __vm_flags) = flags;
}
/*
 * Use when VMA is part of the VMA tree and modifications need coordination
 * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
 * it should be locked explicitly beforehand.
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
				  vm_flags_t flags)
{
	/* The caller must already hold the VMA write lock. */
	vma_assert_write_locked(vma);
	vm_flags_init(vma, flags);
}
/*
 * Use only if VMA is not part of the VMA tree or has no other users and
 * therefore needs no locking.
 *
 * Bits in @set are ORed in first, then bits in @clear are masked out, so a
 * bit present in both ends up cleared.
 */
static inline void __vm_flags_mod(struct vm_area_struct *vma,
				  vm_flags_t set, vm_flags_t clear)
{
	vm_flags_init(vma, (vma->vm_flags | set) & ~clear);
}
/*
 * Use only when the order of set/clear operations is unimportant, otherwise
 * use vm_flags_{set|clear} explicitly.
 *
 * Calls vma_start_write() on the VMA before modifying the flags.
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
				vm_flags_t set, vm_flags_t clear)
{
	vma_start_write(vma);
	__vm_flags_mod(vma, set, clear);
}
/* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task.
*/ staticinlinebool vma_is_initial_heap(conststruct vm_area_struct *vma)
{ return vma->vm_start < vma->vm_mm->brk &&
vma->vm_end > vma->vm_mm->start_brk;
}
/* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task.
*/ staticinlinebool vma_is_initial_stack(conststruct vm_area_struct *vma)
{ /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go.
*/ return vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack;
}
staticinlinestruct vm_area_struct *vma_next(struct vma_iterator *vmi)
{ /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry.
*/ return mas_find(&vmi->mas, ULONG_MAX);
}
/* Iterate @__vma over the VMAs returned by vma_next() until it yields NULL */
#define for_each_vma(__vmi, __vma)					\
	while (((__vma) = vma_next(&(__vmi))) != NULL)

/* The MM code likes to work with exclusive end addresses */
#define for_each_vma_range(__vmi, __vma, __end)				\
	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
#ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault.
*/ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else staticinlinebool vma_is_shmem(struct vm_area_struct *vma) { returnfalse; } staticinlinebool vma_is_anon_shmem(struct vm_area_struct *vma) { returnfalse; } #endif
/*
 * Declaration only; the definition is not part of this header.
 * NOTE(review): presumably returns non-zero when @vma holds the current
 * task's stack - confirm against the definition.
 */
int vma_is_stack_for_current(struct vm_area_struct *vma);
/*
 * flush_tlb_range() takes a vma, not a mm, and can care about flags.
 *
 * Expands to a brace initializer that sets only .vm_mm and .vm_flags.
 */
#define TLB_FLUSH_VMA(mm,flags)			\
	{					\
		.vm_mm = (mm),			\
		.vm_flags = (flags),		\
	}
/*
 * compound_order() can be called without holding a reference, which means
 * that niceties like page_folio() don't work.  These callers should be
 * prepared to handle wild return values.  For example, PG_head may be
 * set before the order is initialised, or this may be a tail page.
 * See compaction.c for some good examples.
 *
 * Returns 0 when PG_head is not set, otherwise folio_large_order().
 */
static inline unsigned int compound_order(struct page *page)
{
	/* Plain cast rather than page_folio(): see the comment above. */
	struct folio *folio = (struct folio *)page;

	if (!test_bit(PG_head, &folio->flags))
		return 0;
	return folio_large_order(folio);
}
/**
 * folio_order - The allocation order of a folio.
 * @folio: The folio.
 *
 * A folio is composed of 2^order pages.  See get_order() for the definition
 * of order.
 *
 * Return: The order of the folio; 0 if the folio is not large.
 */
static inline unsigned int folio_order(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return 0;
	return folio_large_order(folio);
}
/**
 * folio_reset_order - Reset the folio order and derived _nr_pages
 * @folio: The folio.
 *
 * Reset the order and derived _nr_pages to 0.  Must only be used in the
 * process of splitting large folios.
 *
 * Warns once and does nothing if @folio is not a large folio.
 */
static inline void folio_reset_order(struct folio *folio)
{
	if (WARN_ON_ONCE(!folio_test_large(folio)))
		return;
	/* Clear the low 8 bits of _flags_1. */
	folio->_flags_1 &= ~0xffUL;
#ifdef NR_PAGES_IN_LARGE_FOLIO
	folio->_nr_pages = 0;
#endif
}
#include <linux/huge_mm.h>
/* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them.
*/
/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 *
 * The debug check VM_BUG_ON_PAGE fires if the refcount is already zero
 * before the decrement.
 */
static inline int put_page_testzero(struct page *page)
{
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
	return page_ref_dec_and_test(page);
}
/* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings.
*/ staticinlinebool get_page_unless_zero(struct page *page)
{ return page_ref_add_unless(page, 1, 0);
}
/*
 * Declaration only; the definition is not part of this header.
 * NOTE(review): presumably checks the range [offset, offset + size) against
 * resources matching @flags and @desc - confirm against the definition.
 */
int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
		      unsigned long desc);
/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);
/* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required.
*/ #ifdef CONFIG_MMU externbool is_vmalloc_addr(constvoid *x); externint is_vmalloc_or_module_addr(constvoid *x); #else staticinlinebool is_vmalloc_addr(constvoid *x)
{ returnfalse;
} staticinlineint is_vmalloc_or_module_addr(constvoid *x)
{ return 0;
} #endif
/* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives.
*/ staticinlineint folio_entire_mapcount(conststruct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) return 0; return atomic_read(&folio->_entire_mapcount) + 1;
}
/** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly on folio reference. * * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped.
*/ staticinlineint folio_mapcount(conststruct folio *folio)
{ int mapcount;
if (likely(!folio_test_large(folio))) {
mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount))
mapcount = 0; return mapcount;
} return folio_large_mapcount(folio);
}
/** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables.
*/ staticinlinebool folio_mapped(conststruct folio *folio)
{ return folio_mapcount(folio) >= 1;
}
/* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD.
*/ staticinlinebool page_mapped(conststruct page *page)
{ return folio_mapped(page_folio(page));
}
/*
 * Returns the number of bytes in this potentially compound page,
 * i.e. PAGE_SIZE shifted left by compound_order().
 */
static inline unsigned long page_size(struct page *page)
{
	return PAGE_SIZE << compound_order(page);
}
/*
 * Returns the number of bits needed for the number of bytes in a page,
 * i.e. PAGE_SHIFT plus compound_order().
 */
static inline unsigned int page_shift(struct page *page)
{
	return PAGE_SHIFT + compound_order(page);
}
/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Passing a tail page trips VM_BUG_ON_PGFLAGS.
 *
 * Return: The value of compound_order() for @page.
 */
static inline unsigned int thp_order(struct page *page)
{
	VM_BUG_ON_PGFLAGS(PageTail(page), page);
	return compound_order(page);
}
/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Return: Number of bytes in this page.
 */
static inline unsigned long thp_size(struct page *page)
{
	return PAGE_SIZE << thp_order(page);
}
#ifdef CONFIG_MMU
/*
 * NOTE(review): the #endif matching this #ifdef is not visible in this part
 * of the file - confirm it is still present further down.
 */

/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 *
 * Returns @pte unchanged when VM_WRITE is not set in vma->vm_flags.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte, vma);
	return pte;
}
/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A folio may belong to an inode's memory mapping. In this case, * folio->mapping points to the inode, and folio->index is the file * offset of the folio, in units of PAGE_SIZE. 
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
*/
/*
 * 127: arbitrary random number, small enough to assemble well.
 *
 * True when the refcount, viewed as unsigned int, is 0 or within 127 of
 * wrapping around: adding 127 then yields a value in 0..127.
 */
#define folio_ref_zero_or_close_to_overflow(folio) \
	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
/**
 * folio_get - Increment the reference count on a folio.
 * @folio: The folio.
 *
 * Context: May be called in any context, as long as you know that
 * you have a refcount on the folio.  If you do not already have one,
 * folio_try_get() may be the right interface for you to use.
 */
static inline void folio_get(struct folio *folio)
{
	/* Debug check: the count must be neither zero nor about to wrap. */
	VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
	folio_ref_inc(folio);
}
/*
 * Take a reference on the folio containing @page.
 *
 * Warns once and returns without taking a reference if that folio is a slab
 * folio or a large kmalloc folio.
 */
static inline void get_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return;
	if (WARN_ON_ONCE(folio_test_large_kmalloc(folio)))
		return;
	folio_get(folio);
}
/**
 * folio_put - Decrement the reference count on a folio.
 * @folio: The folio.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put() unless you can be sure that it wasn't the
 * last reference.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put(struct folio *folio)
{
	if (folio_put_testzero(folio))
		__folio_put(folio);
}
/**
 * folio_put_refs - Reduce the reference count on a folio.
 * @folio: The folio.
 * @refs: The amount to subtract from the folio's reference count.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put_refs() unless you can be sure that these weren't
 * the last references.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put_refs(struct folio *folio, int refs)
{
	if (folio_ref_sub_and_test(folio, refs))
		__folio_put(folio);
}
/*
 * union release_pages_arg - an array of pages or folios
 *
 * release_pages() releases a simple array of multiple pages, and
 * accepts various different forms of said page array: either
 * a regular old boring array of pages, an array of folios, or
 * an array of encoded page pointers.
 *
 * The transparent union syntax for this kind of "any of these
 * argument types" is all kinds of ugly, so look away.
 */
typedef union {
	struct page **pages;
	struct folio **folios;
	struct encoded_page **encoded_pages;
} release_pages_arg __attribute__ ((__transparent_union__));

void release_pages(release_pages_arg, int nr);
/**
 * folios_put - Decrement the reference count on an array of folios.
 * @folios: The folios.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need to
 * reinitialise it.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folios_put(struct folio_batch *folios)
{
	/* Delegates to folios_put_refs() with a NULL second argument. */
	folios_put_refs(folios, NULL);
}
if (folio_test_slab(folio) || folio_test_large_kmalloc(folio)) return;
folio_put(folio);
}
/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in folio_try_get_rcu()
 * provides safe operation for get_user_pages(), folio_mkclean() and
 * other calls that race to set up page table entries.
 */
#define GUP_PIN_COUNTING_BIAS	(1U << 10)
#ifndef CONFIG_MMU staticinlinebool is_nommu_shared_mapping(vm_flags_t flags)
{ /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later.
*/ return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
} #endif
/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
	/* Extract the ZONEID field from the page flags word. */
	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
/* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff.
*/
staticinline u8 page_kasan_tag(conststruct page *page)
{
u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
tag ^= 0xff;
}
/** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio.
*/ staticinlineunsignedlong folio_pfn(conststruct folio *folio)
{ return page_to_pfn(&folio->page);
}
/** * folio_mk_pte - Create a PTE for this folio * @folio: The folio to create a PTE for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_ptes(). * * Return: A page table entry suitable for mapping this folio.
*/ staticinline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
{ return pfn_pte(folio_pfn(folio), pgprot);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/**
 * folio_mk_pmd - Create a PMD for this folio
 * @folio: The folio to create a PMD for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_pmd_at().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
{
	/* Build the PMD from the folio's first PFN, then mark it huge. */
	return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD /** * folio_mk_pud - Create a PUD for this folio * @folio: The folio to create a PUD for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_pud_at(). * * Return: A page table entry suitable for mapping this folio.
*/ staticinline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot)
{ return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
} #endif/* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif/* CONFIG_TRANSPARENT_HUGEPAGE */ #endif/* CONFIG_MMU */
/**
 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
 * @folio: The folio.
 *
 * This function checks if a folio has been pinned via a call to
 * a function in the pin_user_pages() family.
 *
 * For small folios, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal folio references".
 *
 * False positives are OK, because: a) it's unlikely for a folio to
 * get that many refcounts, and b) all the callers of this routine are
 * expected to be able to deal gracefully with a false positive.
 *
 * For most large folios, the result will be exactly correct. That's because
 * we have more tracking data available: the _pincount field is used
 * instead of the GUP_PIN_COUNTING_BIAS scheme.
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the folio has been "dma-pinned".
 *         False, if the folio is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
	/* Folios with a dedicated pin counter give an exact answer. */
	if (folio_has_pincount(folio))
		return atomic_read(&folio->_pincount) > 0;

	/*
	 * folio_ref_count() is signed. If that refcount overflows, then
	 * folio_ref_count() returns a negative value, and callers will avoid
	 * further incrementing the refcount.
	 *
	 * Here, for that overflow case, use the sign bit to count a little
	 * bit higher via unsigned math, and thus still get an accurate result.
	 */
	return ((unsigned int)folio_ref_count(folio)) >=
		GUP_PIN_COUNTING_BIAS;
}
/*
 * This should most likely only be called during fork() to see whether we
 * should break the cow immediately for an anon page on the src mm.
 *
 * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
					   struct folio *folio)
{
	/* The sequence count must be odd here, i.e. a write side is active. */
	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

	/* An mm without MMF_HAS_PINNED set never needs the COW break. */
	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
		return false;

	return folio_maybe_dma_pinned(folio);
}
/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
	return is_zero_pfn(page_to_pfn(page));
}
/** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages.
*/ staticinlinebool is_zero_folio(conststruct folio *folio)
{ return is_zero_page(&folio->page);
}
/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION staticinlinebool folio_is_longterm_pinnable(struct folio *folio)
{ #ifdef CONFIG_CMA int mt = folio_migratetype(folio);
if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) returnfalse; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) returntrue;
/* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) returnfalse;
/* * Filesystems can only tolerate transient delays to truncate and * hole-punch operations
*/ if (folio_is_fsdax(folio)) returnfalse;
/* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio);
/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline long folio_nr_pages(const struct folio *folio)
{
	/* A folio that is not large consists of exactly one page. */
	if (!folio_test_large(folio))
		return 1;
	return folio_large_nr_pages(folio);
}
/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#define MAX_FOLIO_NR_PAGES	(1UL << PUD_ORDER)
#else
#define MAX_FOLIO_NR_PAGES	MAX_ORDER_NR_PAGES
#endif
/*
 * compound_nr() returns the number of pages in this potentially compound
 * page.  compound_nr() can be called on a tail page, and is defined to
 * return 1 in that case.
 */
static inline long compound_nr(struct page *page)
{
	struct folio *folio = (struct folio *)page;

	/* Anything without PG_head set (including tail pages) counts as 1. */
	if (!test_bit(PG_head, &folio->flags))
		return 1;
	return folio_large_nr_pages(folio);
}
/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * If you have physically contiguous memory which may span more than
 * one folio (eg a &struct bio_vec), use this function to move from one
 * folio to the next.  Do not use it if the memory is only virtually
 * contiguous as the folios are almost certainly not adjacent to each
 * other.  This is the folio equivalent to writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
	/* Step past every page of this folio and reinterpret as a folio. */
	return (struct folio *)folio_page(folio, folio_nr_pages(folio));
}
/** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio.
*/ staticinlineunsignedint folio_shift(conststruct folio *folio)
{ return PAGE_SHIFT + folio_order(folio);
}
/** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio.
*/ staticinline size_t folio_size(conststruct folio *folio)
{ return PAGE_SIZE << folio_order(folio);
}
/**
 * folio_maybe_mapped_shared - Whether the folio is mapped into the page
 *			       tables of more than one MM
 * @folio: The folio.
 *
 * This function checks if the folio maybe currently mapped into more than one
 * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
 * MM ("mapped exclusively").
 *
 * For KSM folios, this function also returns "mapped shared" when a folio is
 * mapped multiple times into the same MM, because the individual page mappings
 * are independent.
 *
 * For small anonymous folios and anonymous hugetlb folios, the return
 * value will be exactly correct: non-KSM folios can only be mapped at most once
 * into an MM, and they cannot be partially mapped. KSM folios are
 * considered shared even if mapped multiple times into the same MM.
 *
 * For other folios, the result can be fuzzy:
 *    #. For partially-mappable large folios (THP), the return value can wrongly
 *       indicate "mapped shared" (false positive) if a folio was mapped by
 *       more than two MMs at one point in time.
 *    #. For pagecache folios (including hugetlb), the return value can wrongly
 *       indicate "mapped shared" (false positive) when two VMAs in the same MM
 *       cover the same file range.
 *
 * Further, this function only considers current page table mappings that
 * are tracked using the folio mapcount(s).
 *
 * This function does not consider:
 *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
 *       pagecache, temporary unmapping for migration).
 *    #. If the folio is mapped differently (VM_PFNMAP).
 *    #. If hugetlb page table sharing applies. Callers might want to check
 *       hugetlb_pmd_shared().
 *
 * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
	int mapcount = folio_mapcount(folio);

	/* Only partially-mappable folios require more care. */
	if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
		return mapcount > 1;

	/*
	 * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
	 * simply assume "mapped shared", nobody should really care
	 * about this for arbitrary kernel allocations.
	 */
	if (!IS_ENABLED(CONFIG_MM_ID))
		return true;

	/*
	 * A single mapping implies "mapped exclusively", even if the
	 * folio flag says something different: it's easier to handle this
	 * case here instead of on the RMAP hot path.
	 */
	if (mapcount <= 1)
		return false;
	return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
}
/** * folio_expected_ref_count - calculate the expected folio refcount * @folio: the folio * * Calculate the expected folio refcount, taking references from the pagecache, * swapcache, PG_private and page table mappings into account. Useful in * combination with folio_ref_count() to detect unexpected references (e.g., * GUP or other temporary references). * * Does currently not consider references from the LRU cache. If the folio * was isolated from the LRU (which is the case during migration or split), * the LRU cache does not apply. * * Calling this function on an unmapped folio -- !folio_mapped() -- that is * locked will return a stable result. * * Calling this function on a mapped folio will not result in a stable result, * because nothing stops additional page table mappings from coming (e.g., * fork()) or going (e.g., munmap()). * * Calling this function without the folio lock will also not result in a * stable result: for example, the folio might get dropped from the swapcache * concurrently. * * However, even when called without the folio lock or on a mapped folio, * this function can be used to detect unexpected references early (for example, * if it makes sense to even lock the folio and unmap it). * * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * * Returns the expected folio refcount.
*/ staticinlineint folio_expected_ref_count(conststruct folio *folio)
{ constint order = folio_order(folio); int ref_count = 0;
if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) return 0;
if (folio_test_anon(folio)) { /* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else { /* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order; /* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
/* One reference per page table mapping. */ return ref_count + folio_mapcount(folio);
}
/* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure.
*/ staticinlinebool page_is_pfmemalloc(conststruct page *page)
{ /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information.
*/ return (uintptr_t)page->lru.next & BIT(1);
}
/* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure.
*/ staticinlinebool folio_is_pfmemalloc(conststruct folio *folio)
{ /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information.
*/ return (uintptr_t)folio->lru.next & BIT(1);
}
/*
 * Only to be called by the page allocator on a freshly allocated
 * page.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
	/* Overwrites lru.next entirely with the marker value BIT(1). */
	page->lru.next = (void *)BIT(1);
}
/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 */
struct zap_details {
	struct folio *single_folio;	/* Locked folio to be unmapped */
	bool even_cows;			/* Zap COWed private pages too? */
	bool reclaim_pt;		/* Need reclaim page tables? */
	zap_flags_t zap_flags;		/* Extra flags for zapping */
};
/*
 * Whether to drop the pte markers, for example, the uffd-wp information for
 * file-backed memory.  This should only be specified when we will completely
 * drop the page in the mm, either by truncation or unmapping of the vma.  By
 * default, the flag is not set.
 */
#define ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
#define ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))
#ifdef CONFIG_SCHED_MM_CID
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);

/* Return the mm_cid value stored in task @t. */
static inline int task_mm_cid(struct task_struct *t)
{
	return t->mm_cid;
}
#else
/* Without CONFIG_SCHED_MM_CID the hooks below are empty stubs. */
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }

static inline int task_mm_cid(struct task_struct *t)
{
	/*
	 * Use the processor id as a fall-back when the mm cid feature is
	 * disabled. This provides functional per-cpu data structure accesses
	 * in user-space, although it won't provide the memory usage benefits.
	 */
	return raw_smp_processor_id();
}
#endif
long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked);
/* * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
*/ staticinlinestruct page *get_user_page_vma_remote(struct mm_struct *mm, unsignedlong addr, int gup_flags, struct vm_area_struct **vmap)
{ struct page *page; struct vm_area_struct *vma; int got;
if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) return ERR_PTR(-EINVAL);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However
 * for now all the callers only use one of the flags at a time.
 */
/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define  MM_CP_TRY_CHANGE_WRITABLE	   (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for write protecting */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
					    MM_CP_UFFD_WP_RESOLVE)
/*
 * get_user_pages_fast_only() doesn't attempt to fault and will return short.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages);
/*
 * Return the non-negative sum of @mm's rss_stat[@member] per-CPU counter,
 * as computed by percpu_counter_sum_positive().
 */
static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
{
	return percpu_counter_sum_positive(&mm->rss_stat[member]);
}
void mm_trace_rss_stat(struct mm_struct *mm, int member);
/* Add @value to @mm's rss_stat[@member] counter, then emit the rss trace. */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
	percpu_counter_add(&mm->rss_stat[member], value);

	mm_trace_rss_stat(mm, member);
}
/* Increment @mm's rss_stat[@member] counter, then emit the rss trace. */
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
	percpu_counter_inc(&mm->rss_stat[member]);

	mm_trace_rss_stat(mm, member);
}
/* Decrement @mm's rss_stat[@member] counter, then emit the rss trace. */
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
	percpu_counter_dec(&mm->rss_stat[member]);

	mm_trace_rss_stat(mm, member);
}
/*
 * Optimized variant when folio is already known not to be anon.
 *
 * Returns MM_SHMEMPAGES for swap-backed folios and MM_FILEPAGES otherwise.
 */
static inline int mm_counter_file(struct folio *folio)
{
	if (folio_test_swapbacked(folio))
		return MM_SHMEMPAGES;
	return MM_FILEPAGES;
}
/** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself.
*/ staticinlinevoid pagetable_free(struct ptdesc *pt)
{ struct page *page = ptdesc_page(pt);
staticinlinebool ptlock_init(struct ptdesc *ptdesc)
{ /* * prep_new_page() initialize page->private (and therefore page->ptl) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl.
*/
VM_BUG_ON_PAGE(*(unsignedlong *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) returnfalse;
spin_lock_init(ptlock_ptr(ptdesc)); returntrue;
}
#else/* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm.
*/ staticinline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{ return &mm->page_table_lock;
} staticinline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{ return &mm->page_table_lock;
} staticinlinevoid ptlock_cache_init(void) {} staticinlinebool ptlock_init(struct ptdesc *ptdesc) { returntrue; } staticinlinevoid ptlock_free(struct ptdesc *ptdesc) {} #endif/* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock.
*/ staticinline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{ return &mm->page_table_lock;
}
/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
					int poison, const char *s);

extern void adjust_managed_page_count(struct page *page, long count);

extern void reserve_bootmem_region(phys_addr_t start,
				   phys_addr_t end, int nid);

/* Free the reserved page into the buddy system, so it gets managed. */
void free_reserved_page(struct page *page);
/* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. * Return pages freed into the buddy system.
*/ staticinlineunsignedlong free_initmem_default(int poison)
{ externchar __init_begin[], __init_end[];
/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register range of page frames backed by
 * physical memory with memblock_add[_node]() before calling
 * free_area_init() passing in the PFN each zone ends at. At a basic
 * usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *							 max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *	memblock_add_node(base, size, nid, MEMBLOCK_NONE)
 * free_area_init(max_zone_pfns);
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
					   unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
				  unsigned long *start_pfn,
				  unsigned long *end_pfn);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct *find_vma(struct mm_struct *mm,
				       unsigned long addr);
extern struct vm_area_struct *find_vma_prev(struct mm_struct *mm,
					    unsigned long addr,
					    struct vm_area_struct **pprev);

/*
 * Look up the first VMA which intersects the interval [start_addr, end_addr)
 * NULL if none.  Assume start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
					     unsigned long start_addr,
					     unsigned long end_addr);
/** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise.
*/ staticinline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsignedlong addr)
{ return mtree_load(&mm->mm_mt, addr);
}
/* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers.
*/ staticinline vm_fault_t vmf_fs_error(int err)
{ if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS;
}
/*
 * Translate a vm_fault_t bitmask into a negative errno, or 0 if none of the
 * checked error bits is set.
 *
 * The checks are ordered: OOM first, then hardware poison (which yields
 * -EHWPOISON only when @foll_flags has FOLL_HWPOISON, -EFAULT otherwise),
 * then SIGBUS/SIGSEGV.
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
	if (vm_fault & VM_FAULT_OOM)
		return -ENOMEM;
	if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
		return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
	if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
		return -EFAULT;
	return 0;
}
/* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required.
*/ staticinlinebool gup_can_follow_protnone(struct vm_area_struct *vma, unsignedint flags)
{ /* * If callers don't want to honor NUMA hinting faults, no need to * determine if we would actually have to trigger a NUMA hinting fault.
*/ if (!(flags & FOLL_HONOR_NUMA_FAULT)) returntrue;
/* * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. * * Requiring a fault here even for inaccessible VMAs would mean that * FOLL_FORCE cannot make any progress, because handle_mm_fault() * refuses to process NUMA hinting faults in inaccessible VMAs.
*/ return !vma_is_accessible(vma);
}
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);

/* Report the early (plain boolean) page-poisoning setting. */
static inline bool page_poisoning_enabled(void)
{
	return _page_poisoning_enabled_early;
}

/*
 * For use in fast paths after init_mem_debugging() has run, or when a
 * false negative result is not harmful when called too early.
 */
static inline bool page_poisoning_enabled_static(void)
{
	return static_branch_unlikely(&_page_poisoning_enabled);
}

/* Poison @numpages pages starting at @page if poisoning is enabled. */
static inline void kernel_poison_pages(struct page *page, int numpages)
{
	if (page_poisoning_enabled_static())
		__kernel_poison_pages(page, numpages);
}

/* Unpoison @numpages pages starting at @page if poisoning is enabled. */
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
	if (page_poisoning_enabled_static())
		__kernel_unpoison_pages(page, numpages);
}
#else
/* Without CONFIG_PAGE_POISONING everything below is a no-op stub. */
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
#endif
/* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early.
*/ staticinlinebool debug_pagealloc_enabled_static(void)
{ if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) returnfalse;
/* * To support DEBUG_PAGEALLOC architecture must ensure that * __kernel_map_pages() never fails
*/ externvoid __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC staticinlinevoid debug_pagealloc_map_pages(struct page *page, int numpages)
{ if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 1);
}
staticinlinevoid debug_pagealloc_unmap_pages(struct page *page, int numpages)
{ if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 0);
}
void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
			    struct vmem_altmap *altmap, unsigned long ptpfn,
			    unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
			      struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
		     unsigned long addr, unsigned long next);
int vmemmap_check_pmd(pmd_t *pmd, int node,
		      unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
			       int node, struct vmem_altmap *altmap);
int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
			       int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
		     struct vmem_altmap *altmap);
int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
			 unsigned long headsize);
int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
		     unsigned long headsize);
void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
			   unsigned long headsize);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap);
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Return altmap->reserve + altmap->free, or 0 when @altmap is NULL.
 */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
	/* number of pfns from base where pfn_to_page() is valid */
	if (altmap)
		return altmap->reserve + altmap->free;
	return 0;
}
if (!pgmap || !is_power_of_2(sizeof(struct page))) returnfalse;
nr_pages = pgmap_vmemmap_nr(pgmap);
nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * For vmemmap optimization with DAX we need minimum 2 vmemmap * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
*/ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
} /* * If we don't have an architecture override, use the generic rule
*/ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif
/** * vma_is_special_huge - Are transhuge page-table entries considered special? * @vma: Pointer to the struct vm_area_struct to consider * * Whether transhuge page-table entries are considered "special" following * the definition in vm_normal_page(). * * Return: true if transhuge page-table entries should be considered special, * false otherwise.
*/ staticinlinebool vma_is_special_huge(conststruct vm_area_struct *vma)
{ return vma_is_dax(vma) || (vma->vm_file &&
(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
}
int reserve_mem_find_by_name(const char *name, phys_addr_t *start,
			     phys_addr_t *size);
int reserve_mem_release_by_name(const char *name);
#ifdef CONFIG_64BIT
int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
#else
/* Stub used without CONFIG_64BIT: ignores its arguments and reports success. */
static inline int do_mseal(unsigned long start, size_t len_in,
			   unsigned long flags)
{
	/* noop on 32 bit */
	return 0;
}
#endif
/*
 * user_alloc_needs_zeroing checks if a user folio from page allocator needs to
 * be zeroed or not.
 */
static inline bool user_alloc_needs_zeroing(void)
{
	/*
	 * for user folios, arch with cache aliasing requires cache flush and
	 * arc changes folio->flags to make icache coherent with dcache, so
	 * always return true when either cache is aliasing to make the caller
	 * use clear_user_page()/clear_user_highpage().
	 *
	 * Otherwise zeroing is needed only when the init_on_alloc static
	 * branch is off.
	 */
	return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() ||
	       !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
				    &init_on_alloc);
}
int arch_get_shadow_stack_status(struct task_struct *t, unsignedlong __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsignedlong status); int arch_lock_shadow_stack_status(struct task_struct *t, unsignedlong status);
/*
 * mseal of userspace process's system mappings.
 *
 * VM_SEALED_SYSMAP expands to VM_SEALED when CONFIG_MSEAL_SYSTEM_MAPPINGS is
 * defined and to VM_NONE otherwise.
 */
#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
#define VM_SEALED_SYSMAP	VM_SEALED
#else
#define VM_SEALED_SYSMAP	VM_NONE
#endif
/*
 * DMA mapping IDs for page_pool
 *
 * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
 * stashes it in the upper bits of page->pp_magic. We always want to be able to
 * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
 * pages can have arbitrary kernel pointers stored in the same field as pp_magic
 * (since it overlaps with page->lru.next), so we must ensure that we cannot
 * mistake a valid kernel pointer with any of the values we write into this
 * field.
 *
 * On architectures that set POISON_POINTER_DELTA, this is already ensured,
 * since this value becomes part of PP_SIGNATURE; meaning we can just use the
 * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
 * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
 * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is
 * known at compile-time.
 *
 * If the value of PAGE_OFFSET is not known at compile time, or if it is too
 * small to leave at least 8 bits available above PP_SIGNATURE, we define the
 * number of bits to be 0, which turns off the DMA index tracking altogether
 * (see page_pool_register_dma_index()).
 */
#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
#if POISON_POINTER_DELTA > 0
/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
 * index to not overlap with that if set
 */
#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
#else
/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */
#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8))
#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \
			    PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \
			    !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \
			      MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0)
#endif /* POISON_POINTER_DELTA > 0 */
/*
 * PP_MAGIC_MASK selects the bits of page->pp_magic that are compared in
 * page_pool_page_is_pp().
 *
 * PP_SIGNATURE is OR'ed into page->pp_magic after allocation rather than
 * assigned, so that bit 0 (head page of a compound page), bit 1 (pfmemalloc
 * page) and the DMA index bits are preserved. Those bits are therefore
 * excluded from the mask. page_is_pfmemalloc() is checked in
 * __page_pool_put_page() to avoid recycling the pfmemalloc page.
 */
#define PP_MAGIC_MASK (~(PP_DMA_INDEX_MASK | 0x3UL))
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.