/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 *
 * The initial value is assembled at compile time: the "always" and
 * "madvise" bits are only included when the matching Kconfig option is
 * set, while the defrag-on-madvise, khugepaged-defrag and use-zero-page
 * bits are always included.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma))
supported_orders = THP_ORDERS_ALL_ANON; elseif (vma_is_special_huge(vma))
supported_orders = THP_ORDERS_ALL_SPECIAL; else
supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
orders &= supported_orders; if (!orders) return 0;
if (!vma->vm_mm) /* vdso */ return 0;
if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) return in_pf ? orders : 0;
/* * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set.
*/ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return 0;
/* * Check alignment for file vma and size for both file and anon vma by * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault * handlers.
*/ if (!in_pf) { int order = highest_order(orders); unsignedlong addr;
while (orders) {
addr = vma->vm_end - (PAGE_SIZE << order); if (thp_vma_suitable_order(vma, addr, order)) break;
order = next_order(&orders, order);
}
if (!orders) return 0;
}
/* * Enabled via shmem mount options or sysfs settings. * Must be done before hugepage flags check since shmem has its * own flags.
*/ if (!in_pf && shmem_file(vma->vm_file)) return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
!enforce_sysfs);
if (!vma_is_anonymous(vma)) { /* * Enforce sysfs THP requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders().
*/ if (enforce_sysfs &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always()))) return 0;
/* * Trust that ->huge_fault() handlers know what they are doing * in fault path.
*/ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) return orders; return 0;
}
if (vma_is_temporary_stack(vma)) return 0;
/* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. * * Allow page fault since anon_vma may be not initialized until * the first page fault.
*/ if (!vma->anon_vma) return (smaps || in_pf) ? orders : 0;
return orders;
}
/*
 * Take a reference on the huge zero folio, allocating and publishing it
 * first if nobody currently holds one.
 *
 * Returns true when a reference was taken, false when the folio had to be
 * allocated and the allocation failed.
 */
static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	/* Fast path: the refcount is already non-zero, just bump it. */
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	/* Zeroed, PMD-order allocation with __GFP_MOVABLE masked out. */
	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	/* Ensure zero folio won't have large_rmappable flag set. */
	folio_clear_large_rmappable(zero_folio);
	preempt_disable();
	/*
	 * Publish our folio only if huge_zero_folio is still NULL. A non-NULL
	 * old value means somebody else installed one first: drop ours and
	 * start over so the fast path can pick up theirs.
	 */
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}
/*
 * Drop one reference from huge_zero_refcount. Hitting zero here is treated
 * as a fatal bug.
 */
static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
put_huge_zero_page();
return READ_ONCE(huge_zero_folio);
}
/*
 * Call put_huge_zero_page() for @mm, but only when MMF_HUGE_ZERO_PAGE is
 * set in mm->flags; otherwise this is a no-op.
 */
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	/* Flag clear: nothing to give back for this mm. */
	if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return;

	put_huge_zero_page();
}
/*
 * Shrinker "count" callback for the huge zero page.
 *
 * Returns HPAGE_PMD_NR when huge_zero_refcount is exactly 1, and 0
 * otherwise. Neither @shrink nor @sc is consulted.
 */
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
staticint sysfs_add_group(struct kobject *kobj, conststruct attribute_group *grp)
{ int ret = -ENOENT;
/* * If the group is named, try to merge first, assuming the subdirectory * was already created. This avoids the warning emitted by * sysfs_create_group() if the directory already exists.
*/ if (grp->name)
ret = sysfs_merge_group(kobj, grp); if (ret)
ret = sysfs_create_group(kobj, grp);
staticint __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{ int err; struct thpsize *thpsize; unsignedlong orders; int order;
/* * Default to setting PMD-sized THP to inherit the global setting and * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time * constant so we have to do this here.
*/ if (!anon_orders_configured)
huge_anon_orders_inherit = BIT(PMD_ORDER);
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) {
pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM;
}
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) {
pr_err("failed to register transparent hugepage group\n"); goto delete_obj;
}
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) {
pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group;
}
orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
order = highest_order(orders); while (orders) {
thpsize = thpsize_create(order, *hugepage_kobj); if (IS_ERR(thpsize)) {
pr_err("failed to create thpsize for order %d\n", order);
err = PTR_ERR(thpsize); goto remove_all;
}
list_add(&thpsize->node, &thpsize_list);
order = next_order(&orders, order);
}
/* * hugepages can't be allocated by the buddy allocator
*/
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs;
err = khugepaged_init(); if (err) goto err_slab;
err = thp_shrinker_init(); if (err) goto err_shrinker;
/* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys.
*/ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0; return 0;
}
err = start_stop_khugepaged(); if (err) goto err_khugepaged;
/* * When a folio is not zeroed during allocation (__GFP_ZERO not used) * or user folios require special handling, folio_zero_user() is used to * make sure that the page corresponding to the faulting address will be * hot in the cache after zeroing.
*/ if (user_alloc_needs_zeroing())
folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write.
*/
__folio_mark_uptodate(folio); return folio;
}
/* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise * fail if not immediately available * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately * available * never: never stall for any thp allocation
*/
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{ constbool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
/* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
/* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
/* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT |
(vma_madvised ? __GFP_DIRECT_RECLAIM :
__GFP_KSWAPD_RECLAIM);
/* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT |
(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK;
ret = vmf_anon_prepare(vmf); if (ret) return ret;
khugepaged_enter_vma(vma, vma->vm_flags);
/* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS;
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM;
}
/* * If we had pud_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS;
pmd = pmdp_get_lockless(src_pmd); if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); /* * No need to recheck the pmd, it can't change with write * mmap lock held here. * * Meanwhile, making sure it's not a CoW VMA with writable * mapping, otherwise it means either the anon page wrongly * applied special bit, or we made the PRIVATE mapping be * able to wrongly write to the backend MMIO.
*/
VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); goto set_pmd;
}
/* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0;
pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out;
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd)); if (!is_readable_migration_entry(entry)) {
entry = make_readable_migration_entry(
swp_offset(entry));
pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd))
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); if (!userfaultfd_wp(dst_vma))
pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0; goto out_unlock;
} #endif
if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable); goto out_unlock;
} /* * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table.
*/ if (is_huge_zero_pmd(pmd)) { /* * mm_get_huge_zero_folio() will never allocate a new * folio here, since we already have a zero page to * copy. It just takes a reference.
*/
mm_get_huge_zero_folio(dst_mm); goto out_zero_page;
}
/* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse;
if (!folio_trylock(folio)) {
folio_get(folio);
spin_unlock(vmf->ptl);
folio_lock(folio);
spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
folio_unlock(folio);
folio_put(folio); return 0;
}
folio_put(folio);
}
/* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) {
folio_unlock(folio); goto reuse;
}
/* * See do_wp_page(): we can only reuse the folio exclusively if * there are no additional references. Note that we always drain * the LRU cache immediately after adding a THP.
*/ if (folio_ref_count(folio) >
1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; if (folio_test_swapcache(folio))
folio_free_swap(folio); if (folio_ref_count(folio) == 1) {
pmd_t entry;
if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
spin_unlock(vmf->ptl); return 0;
}
pmd = pmd_modify(old_pmd, vma->vm_page_prot);
/* * Detect now whether the PMD could be writable; this information * is only valid while holding the PT lock.
*/
writable = pmd_write(pmd); if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
can_change_pmd_writable(vma, vmf->address, pmd))
writable = true;
folio = vm_normal_folio_pmd(vma, haddr, pmd); if (!folio) goto out_map;
nid = folio_nid(folio);
target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
&last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
flags |= TNF_MIGRATE_FAIL; goto out_map;
} /* The folio is isolated and isolation code holds a folio reference. */
spin_unlock(vmf->ptl);
writable = false;
if (!migrate_misplaced_folio(folio, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0;
}
/* * Return true if we do MADV_FREE successfully on entire pmd page. * Otherwise, return false.
*/ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsignedlong addr, unsignedlong next)
{
spinlock_t *ptl;
pmd_t orig_pmd; struct folio *folio; struct mm_struct *mm = tlb->mm; bool ret = false;
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked;
orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto out;
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(orig_pmd)); goto out;
}
folio = pmd_folio(orig_pmd); /* * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio.
*/ if (folio_maybe_mapped_shared(folio)) goto out;
if (!folio_trylock(folio)) goto out;
/* * If the user wants to discard only part of a THP, split it so that * MADV_FREE will deactivate only those pages.
*/ if (next - addr != HPAGE_PMD_SIZE) {
folio_get(folio);
spin_unlock(ptl);
split_folio(folio);
folio_unlock(folio);
folio_put(folio); goto out_unlocked;
}
if (folio_test_dirty(folio))
folio_clear_dirty(folio);
folio_unlock(folio);
if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else { if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
/* * Use flush_needed to indicate whether the PMD entry * is present, instead of checking pmd_present() again.
*/ if (flush_needed && pmd_young(orig_pmd) &&
likely(vma_has_recency(vma)))
folio_mark_accessed(folio);
}
spin_unlock(ptl); if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
} return 1;
}
#ifndef pmd_move_must_withdraw
/*
 * Default implementation, used unless pmd_move_must_withdraw is already
 * defined as a macro.
 *
 * Returns non-zero when the two PMD lock pointers differ and @vma is
 * anonymous, zero otherwise.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif
/* * The destination pmd shouldn't be established, free_pgtables() * should have released it; but move_page_tables() might have already * inserted a page table, if racing against shmem/file collapse.
*/ if (!pmd_none(*new_pmd)) {
VM_BUG_ON(pmd_trans_huge(*new_pmd)); returnfalse;
}
/* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma); if (old_ptl) {
new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); if (pmd_present(pmd))
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so * just be safe and disable write
*/ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(swp_offset(entry)); else
entry = make_readable_migration_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
} else {
newpmd = *pmd;
}
if (prot_numa) { struct folio *folio; bool toptier; /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and * local/remote hits to the zero page are not interesting.
*/ if (is_huge_zero_pmd(*pmd)) goto unlock;
if (pmd_protnone(*pmd)) goto unlock;
folio = pmd_folio(*pmd);
toptier = node_is_toptier(folio_nid(folio)); /* * Skip scanning top tier node if normal numa * balancing is disabled
*/ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
toptier) goto unlock;
if (folio_use_access_time(folio))
folio_xchg_access_time(folio,
jiffies_to_msecs(jiffies));
} /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED * which is also under mmap_read_lock(mm): * * CPU0: CPU1: * change_huge_pmd(prot_numa=1) * pmdp_huge_get_and_clear_notify() * madvise_dontneed() * zap_pmd_range() * pmd_trans_huge(*pmd) == 0 (without ptl) * // skip the pmd * set_pmd_at(); * // pmd is re-established * * The race makes MADV_DONTNEED miss the huge pmd and don't clear it * which may break userspace. * * pmdp_invalidate_ad() is required to make sure we don't miss * dirty/young flags set by hardware.
*/
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
entry = pmd_modify(oldpmd, newprot); if (uffd_wp)
entry = pmd_mkuffd_wp(entry); elseif (uffd_wp_resolve) /* * Leave the write bit to be handled by PF interrupt * handler, then things like COW could be properly * handled.
*/
entry = pmd_clear_uffd_wp(entry);
/* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
can_change_pmd_writable(vma, addr, entry))
entry = pmd_mkwrite(entry, vma);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
/*
 * Change the protection of a huge PUD mapping to @newprot.
 *
 * Returns:
 *
 * - 0: if pud leaf changed from under us
 * - 1: if pud can be skipped
 * - HPAGE_PUD_NR: if pud was successfully processed
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pud_t *pudp, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t oldpud, entry;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PUD_SIZE);

	/* NUMA balancing doesn't apply to dax */
	if (cp_flags & MM_CP_PROT_NUMA)
		return 1;

	/*
	 * Huge entries on userfault-wp only works with anonymous, while we
	 * don't have anonymous PUDs yet.
	 */
	if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
		return 1;

	/* No lock returned: the entry is no longer a huge PUD leaf. */
	ptl = __pud_trans_huge_lock(pudp, vma);
	if (!ptl)
		return 0;

	/*
	 * Can't clear PUD or it can race with concurrent zapping.  See
	 * change_huge_pmd().
	 */
	oldpud = pudp_invalidate(vma, addr, pudp);
	entry = pud_modify(oldpud, newprot);
	set_pud_at(mm, addr, pudp, entry);
	tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);

	spin_unlock(ptl);
	return HPAGE_PUD_NR;
}
#endif
#ifdef CONFIG_USERFAULTFD /* * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by * the caller, but it must return after releasing the page_table_lock. * Just move the page from src_pmd to dst_pmd if possible. * Return zero if succeeded in moving the page, -EAGAIN if it needs to be * repeated by the caller, or other errors in case of failure.
*/ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsignedlong dst_addr, unsignedlong src_addr)
{
pmd_t _dst_pmd, src_pmdval; struct page *src_page; struct folio *src_folio; struct anon_vma *src_anon_vma;
spinlock_t *src_ptl, *dst_ptl;
pgtable_t src_pgtable; struct mmu_notifier_range range; int err = 0;
/* * split_huge_page walks the anon_vma chain without the page * lock. Serialize against it with the anon_vma lock, the page * lock is not enough.
*/
src_anon_vma = folio_get_anon_vma(src_folio); if (!src_anon_vma) {
err = -EAGAIN; goto unlock_folio;
}
anon_vma_lock_write(src_anon_vma);
} else
src_anon_vma = NULL;
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) {
set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
err = -EBUSY; goto unlock_ptls;
}
/*
 * Lock @pmd's page table lock and check whether the entry is a swap pmd or
 * a transparent huge pmd.
 *
 * On a match the lock pointer is returned and the lock is left held, so the
 * caller is responsible for unlocking it. Otherwise the lock is released
 * again and NULL is returned.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *lock = pmd_lock(vma->vm_mm, pmd);

	/* Uncommon case: neither a swap pmd nor a huge pmd, back out. */
	if (unlikely(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd))) {
		spin_unlock(lock);
		return NULL;
	}

	return lock;
}
/* * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. * * Note that if it returns page table lock pointer, this routine returns without * unlocking page table lock. So callers must unlock it.
*/
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
spinlock_t *ptl;
/* * Leave pmd empty until pte is filled note that it is fine to delay * notification until mmu_notifier_invalidate_range_end() as we are * replacing a zero pmd write protected page with a zero pte write * protected page. * * See Documentation/mm/mmu_notifier.rst
*/
old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
if (!vma_is_anonymous(vma)) {
old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it
*/ if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd); if (!vma_is_dax(vma) && vma_is_special_huge(vma)) return; if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below * inside __split_huge_pmd() ? * * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful * to invalidate secondary mmu at this time.
*/ return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
pmd_migration = is_pmd_migration_entry(*pmd); if (unlikely(pmd_migration)) {
swp_entry_t entry;
old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry); if (PageAnon(page))
anon_exclusive = is_readable_exclusive_migration_entry(entry);
young = is_migration_entry_young(entry);
dirty = is_migration_entry_dirty(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else { /* * Up to this point the pmd is present and huge and userland has * the whole access to the hugepage during the split (which * happens in place). If we overwrite the pmd with the not-huge * version pointing to the pte here (which of course we could if * all CPUs were bug free), userland could trigger a small page * size TLB miss on the small sized TLB while the hugepage TLB * entry is still established in the huge TLB. Some CPU doesn't * like that. See * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum * 383 on page 105. Intel should be safe but is also warns that * it's only safe if the permission and cache attributes of the * two entries loaded in the two TLB is identical (which should * be the case here). But it is generally safer to never allow * small and huge TLB entries for the same virtual address to be * loaded simultaneously. So instead of doing "pmd_populate(); * flush_pmd_tlb_range();" we first mark the current pmd * notpresent (atomically because here the pmd_trans_huge must * remain set at all times on the pmd until the split is * complete for this pmd), then we flush the SMP TLB and finally * we write the non-huge version of the pmd entry with * pmd_populate.
*/
old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page); if (pmd_dirty(old_pmd)) {
dirty = true;
folio_set_dirty(folio);
}
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
/* * Without "freeze", we'll simply split the PMD, propagating the * PageAnonExclusive() flag for each PTE by setting it for * each subpage -- no need to (temporarily) clear. * * With "freeze" we want to replace mapped pages by * migration entries right away. This is only possible if we * managed to clear PageAnonExclusive() -- see * set_pmd_migration_entry(). * * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. * * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
*/
anon_exclusive = PageAnonExclusive(page); if (freeze && anon_exclusive &&
folio_try_share_anon_rmap_pmd(folio, page))
freeze = false; if (!freeze) {
rmap_t rmap_flags = RMAP_NONE;
/* * Withdraw the table only after we mark the pmd entry invalid. * This is critical for some architectures (Power).
*/
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
/* * Note that NUMA hinting access restrictions are not transferred to * avoid any possibility of altering permissions across VMAs.
*/ if (freeze || pmd_migration) { for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry;
swp_entry_t swp_entry;
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i)); elseif (anon_exclusive)
swp_entry = make_readable_exclusive_migration_entry(
page_to_pfn(page + i)); else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i)); if (young)
swp_entry = make_migration_entry_young(swp_entry); if (dirty)
swp_entry = make_migration_entry_dirty(swp_entry);
entry = swp_entry_to_pte(swp_entry); if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry); if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); if (write)
entry = pte_mkwrite(entry, vma); if (!young)
entry = pte_mkold(entry); /* NOTE: this may set soft-dirty too on some archs */ if (dirty)
entry = pte_mkdirty(entry); if (soft_dirty)
entry = pte_mksoft_dirty(entry); if (uffd_wp)
entry = pte_mkuffd_wp(entry);
for (i = 0; i < HPAGE_PMD_NR; i++)
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
staticinlinevoid split_huge_pmd_if_needed(struct vm_area_struct *vma, unsignedlong address)
{ /* * If the new address isn't hpage aligned and it could previously * contain an hugepage: check if we need to split an huge pmd.
*/ if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
ALIGN(address, HPAGE_PMD_SIZE)))
split_huge_pmd_address(vma, address, false);
}
/*
 * Run the split-if-needed check at the boundaries involved in adjusting
 * @vma: at @start and @end within @vma, and, when @next is non-NULL, at
 * @end within @next as well.
 */
void vma_adjust_trans_huge(struct vm_area_struct *vma,
			   unsigned long start,
			   unsigned long end,
			   struct vm_area_struct *next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/* If we're incrementing next->vm_start, we might need to split it. */
	if (next)
		split_huge_pmd_if_needed(next, end);
}
if (folio_test_pmd_mappable(folio))
ttu_flags |= TTU_SPLIT_HUGE_PMD;
/* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. * If that is ever changed (perhaps for mlock), update remap_page().
*/ if (folio_test_anon(folio))
try_to_migrate(folio, ttu_flags); else
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
/* * Order reads for folio refcount and dirty flag * (see comments in __remove_mapping()).
*/
smp_rmb();
/* * If the folio or its PMD is redirtied at this point, or if there * are unexpected references, we will give up to discard this folio * and remap it. * * The only folio refs must be one from isolation plus the rmap(s).
*/ if (pmd_dirty(orig_pmd))
folio_set_dirty(folio); if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
folio_set_swapbacked(folio);
set_pmd_at(mm, addr, pmdp, orig_pmd); returnfalse;
}
staticvoid remap_page(struct folio *folio, unsignedlong nr, int flags)
{ int i = 0;
/* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) {
remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
i += folio_nr_pages(folio); if (i >= nr) break;
folio = folio_next(folio);
}
}
if (list) { /* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
folio_get(new_folio);
list_add_tail(&new_folio->lru, list);
} else { /* head is still on lru (and we have it frozen) */
VM_WARN_ON(!folio_test_lru(folio)); if (folio_test_unevictable(folio))
new_folio->mlock_count = 0; else
list_add_tail(&new_folio->lru, &folio->lru);
folio_set_lru(new_folio);
}
}
/* Racy check whether the huge page can be split */ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{ int extra_pins;
/*
 * Check the @nr_pages consecutive pages starting at @page.
 *
 * Returns true as soon as one page tests positive for PageHWPoison(),
 * false if none does (including when @nr_pages is 0).
 */
static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
{
	for (; nr_pages; page++, nr_pages--)
		if (PageHWPoison(page))
			return true;
	return false;
}
/* * It splits @folio into @new_order folios and copies the @folio metadata to * all the resulting folios.
*/ staticvoid __split_folio_to_order(struct folio *folio, int old_order, int new_order)
{ /* Scan poisoned pages when split a poisoned folio to large folios */ constbool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; long new_nr_pages = 1 << new_order; long nr_pages = 1 << old_order; long i;
folio_clear_has_hwpoisoned(folio);
/* Check first new_nr_pages since the loop below skips them */ if (handle_hwpoison &&
page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages))
folio_set_has_hwpoisoned(folio); /* * Skip the first new_nr_pages, since the new folio from them have all * the flags from the original folio.
*/ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { struct page *new_head = &folio->page + i; /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head().
*/ struct folio *new_folio = (struct folio *)new_head;
/* * Clone page flags before unfreezing refcount. * * After successful get_page_unless_zero() might follow flags change, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
new_folio->flags |= (folio->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) | #ifdef CONFIG_ARCH_USES_PG_ARCH_2
(1L << PG_arch_2) | #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3
(1L << PG_arch_3) | #endif
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));
if (handle_hwpoison &&
page_range_has_hwpoisoned(new_head, new_nr_pages))
folio_set_has_hwpoisoned(new_folio);
/* * page->private should not be set in tail pages. Fix up and warn once * if private is unexpectedly set.
*/ if (unlikely(new_folio->private)) {
VM_WARN_ON_ONCE_PAGE(true, new_head);
new_folio->private = NULL;
}
if (folio_test_swapcache(folio))
new_folio->swap.val = folio->swap.val + i;
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
/* * Clear PageTail before unfreezing page refcount. * * After successful get_page_unless_zero() might follow put_page() * which needs correct compound_head().
*/
clear_compound_head(new_head); if (new_order) {
prep_compound_page(new_head, new_order);
folio_set_large_rmappable(new_folio);
}
if (folio_test_young(folio))
folio_set_young(new_folio); if (folio_test_idle(folio))
folio_set_idle(new_folio); #ifdef CONFIG_MEMCG
new_folio->memcg_data = folio->memcg_data; #endif
if (new_order)
folio_set_order(folio, new_order); else
ClearPageCompound(&folio->page);
}
/*
 * __split_unmapped_folio() - split an unmapped @folio to lower-order folios.
 * @folio: the to-be-split folio
 * @new_order: the smallest order of the after-split folios (a buddy
 *             allocator like split generates folios with orders from
 *             @folio's order - 1 down to @new_order).
 * @split_at: in buddy allocator like split, the folio containing @split_at
 *            will be split until its order becomes @new_order.
 * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
 * @mapping: @folio->mapping
 * @uniform_split: if the split is uniform or not (buddy allocator like split)
 *
 * The split is done in one of two ways:
 *
 * 1. uniform split: the given @folio is split into multiple @new_order small
 *    folios, where all small folios have the same order. This is done when
 *    @uniform_split is true.
 * 2. buddy allocator like (non-uniform) split: the given @folio is split into
 *    half and the half containing @split_at is split into half again, until
 *    the order of the folio containing @split_at becomes @new_order. This is
 *    done when @uniform_split is false.
 *
 * The high level flow for these two methods is:
 *
 * 1. uniform split: a single __split_folio_to_order() is called to split the
 *    @folio into @new_order, then we traverse all the resulting folios one by
 *    one in PFN ascending order and perform stats, unfreeze, adding to list,
 *    and file mapping index operations.
 * 2. non-uniform split: in general, folio_order - @new_order calls to
 *    __split_folio_to_order() are made in a for loop to split the @folio
 *    to one lower order at a time. The resulting small folios are processed
 *    like what is done during the traversal in 1, except the one containing
 *    @split_at, which is split in the next for loop iteration.
 *
 * After splitting, the caller's folio reference will be transferred to the
 * folio containing @split_at. The caller needs to unlock and/or free
 * after-split folios if necessary.
 *
 * For !uniform_split, when -ENOMEM is returned, the original folio might be
 * split. The caller needs to check the input folio.
 *
 * Return: 0 on success, otherwise the xas_error() value observed after
 * xas_try_split() in the non-uniform case.
 */
static int __split_unmapped_folio(struct folio *folio, int new_order,
		struct page *split_at, struct xa_state *xas,
		struct address_space *mapping, bool uniform_split)
{
	int order = folio_order(folio);
	int start_order = uniform_split ? new_order : order - 1;
	bool stop_split = false;
	struct folio *next;
	int split_order;
	int ret = 0;

	/* The original large anon folio goes away: drop it from the stats. */
	if (folio_test_anon(folio))
		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);

	/*
	 * split to new_order one order at a time. For uniform split,
	 * folio is split to new_order directly.
	 */
	for (split_order = start_order;
	     split_order >= new_order && !stop_split;
	     split_order--) {
		struct folio *end_folio = folio_next(folio);
		int old_order = folio_order(folio);
		struct folio *new_folio;

		/* order-1 anonymous folio is not supported */
		if (folio_test_anon(folio) && split_order == 1)
			continue;
		if (uniform_split && split_order != new_order)
			continue;

		if (mapping) {
			/*
			 * uniform split has xas_split_alloc() called before
			 * irq is disabled to allocate enough memory, whereas
			 * non-uniform split can handle ENOMEM.
			 */
			if (uniform_split) {
				xas_split(xas, folio, old_order);
			} else {
				xas_set_order(xas, folio->index, split_order);
				xas_try_split(xas, folio, old_order);
				if (xas_error(xas)) {
					ret = xas_error(xas);
					stop_split = true;
				}
			}
		}

		/*
		 * NOTE(review): the header comment describes a call to
		 * __split_folio_to_order() for each split step, but no such
		 * call is visible in this body. Confirm against the reference
		 * tree that nothing was lost here.
		 */

		/*
		 * Iterate through after-split folios and update folio stats.
		 * But in buddy allocator like split, the folio
		 * containing the specified page is skipped until its order
		 * is new_order, since the folio will be worked on in next
		 * iteration.
		 */
		for (new_folio = folio; new_folio != end_folio; new_folio = next) {
			next = folio_next(new_folio);

			/*
			 * for buddy allocator like split, new_folio containing
			 * @split_at page could be split again, thus do not
			 * change stats yet. Wait until new_folio's order is
			 * @new_order or stop_split is set to true by the above
			 * xas_try_split() failure.
			 */
			if (new_folio == page_folio(split_at)) {
				folio = new_folio;
				if (split_order != new_order && !stop_split)
					continue;
			}
			if (folio_test_anon(new_folio))
				mod_mthp_stat(folio_order(new_folio),
					      MTHP_STAT_NR_ANON, 1);
		}
	}

	return ret;
}
bool non_uniform_split_supported(struct folio *folio, unsignedint new_order, bool warns)
{ if (folio_test_anon(folio)) { /* order-1 is not supported for anonymous THP. */
VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); if (new_order == 1) returnfalse;
} elseif (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
!mapping_large_folio_support(folio->mapping)) { /* * No split if the file system does not support large folio. * Note that we might still have THPs in such mappings due to * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping * does not actually support large folios properly.
*/
VM_WARN_ONCE(warns, "Cannot split file folio to non-0 order"); returnfalse;
}
/* Only swapping a whole PMD-mapped folio is supported */ if (folio_test_swapcache(folio)) {
VM_WARN_ONCE(warns, "Cannot split swapcache folio to non-0 order"); returnfalse;
}
returntrue;
}
/* See comments in non_uniform_split_supported() */ bool uniform_split_supported(struct folio *folio, unsignedint new_order, bool warns)
{ if (folio_test_anon(folio)) {
VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); if (new_order == 1) returnfalse;
} elseif (new_order) { if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
!mapping_large_folio_support(folio->mapping)) {
VM_WARN_ONCE(warns, "Cannot split file folio to non-0 order"); returnfalse;
}
}
if (new_order && folio_test_swapcache(folio)) {
VM_WARN_ONCE(warns, "Cannot split swapcache folio to non-0 order"); returnfalse;
}
returntrue;
}
/*
 * __folio_split: split a folio at @split_at to a @new_order folio
 * @folio: folio to split
 * @new_order: the order of the new folio
 * @split_at: a page within the new folio
 * @lock_at: a page within @folio to be left locked to caller
 * @list: after-split folios will be put on it if non NULL
 * @uniform_split: perform uniform split or not (non-uniform split)
 *
 * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
 * It is in charge of checking whether the split is supported or not and
 * preparing @folio for __split_unmapped_folio().
 *
 * After splitting, the after-split folio containing @lock_at remains locked
 * and others are unlocked:
 * 1. for uniform split, @lock_at points to one of @folio's subpages;
 * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
 *
 * Error codes visible in this body:
 *   -EINVAL  @split_at or @lock_at is not in @folio, @new_order is not below
 *            the folio's order, the requested split type is not supported,
 *            or @new_order is below the mapping's minimum folio order;
 *   -EBUSY   huge zero folio, folio under writeback, anon_vma could not be
 *            obtained, folio has no mapping, or filemap_release_folio()
 *            failed;
 *   -EAGAIN  can_split_folio() refused, or the folio is no longer the entry
 *            found in the page cache.
 *
 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
 * split but not to @new_order, the caller needs to check)
 *
 * NOTE(review): several names used below (gfp's value, swap_cache, lruvec,
 * expected_refs) have no visible declaration or assignment in this block,
 * and one spin_unlock() has no visible matching spin_lock(). It looks like
 * lines are missing from this copy; verify against the reference tree before
 * relying on the flow documented here.
 */ staticint __folio_split(struct folio *folio, unsignedint new_order, struct page *split_at, struct page *lock_at, struct list_head *list, bool uniform_split)
{ struct deferred_split *ds_queue = get_deferred_split_queue(folio);
/* end_folio and order are sampled up front, before any splitting is done. */
XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct folio *end_folio = folio_next(folio); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; int order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; int remap_flags = 0; int extra_pins, ret;
pgoff_t end; bool is_hzp;
/* Both @split_at and @lock_at must be pages of @folio. */
if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL;
/* A split must strictly lower the order. */
if (new_order >= folio_order(folio)) return -EINVAL;
if (uniform_split && !uniform_split_supported(folio, new_order, true)) return -EINVAL;
if (!uniform_split &&
!non_uniform_split_supported(folio, new_order, true)) return -EINVAL;
is_hzp = is_huge_zero_folio(folio); if (is_hzp) {
pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY;
}
if (folio_test_writeback(folio)) return -EBUSY;
if (is_anon) { /*
 * The caller does not necessarily hold an mmap_lock that would
 * prevent the anon_vma disappearing so we first take a
 * reference to it and then lock the anon_vma for write. This
 * is similar to folio_lock_anon_vma_read except the write lock
 * is taken to serialise against parallel split or collapse
 * operations.
 */
anon_vma = folio_get_anon_vma(folio); if (!anon_vma) {
ret = -EBUSY; goto out;
}
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else { unsignedint min_order;
gfp_t gfp;
mapping = folio->mapping;
/* Truncated ? */ /*
 * TODO: add support for large shmem folio in swap cache.
 * When shmem is in swap cache, mapping is NULL and
 * folio_test_swapcache() is true.
 */ if (!mapping) {
ret = -EBUSY; goto out;
}
/* Never split below the minimum folio order of the mapping. */
min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) {
ret = -EINVAL; goto out;
}
/*
 * NOTE(review): gfp is read here and in xas_split_alloc() below, but no
 * assignment to it is visible in this block - confirm its initialisation.
 */
if (!filemap_release_folio(folio, gfp)) {
ret = -EBUSY; goto out;
}
/* Uniform split preallocates the xarray nodes before irqs are disabled. */
if (uniform_split) {
xas_set_order(&xas, folio->index, new_order);
xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) {
ret = xas_error(&xas); goto out;
}
}
anon_vma = NULL;
i_mmap_lock_read(mapping);
/*
 * __split_unmapped_folio() may need to trim off pages beyond
 * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
 * seqlock, which cannot be nested inside the page tree lock.
 * So note end now: i_size itself may be changed at any moment,
 * but folio lock is good enough to serialize the trimming.
 */
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping))
end = shmem_fallocend(mapping->host, end);
}
/*
 * Racy check if we can split the page, before unmap_folio() will
 * split PMDs
 */ if (!can_split_folio(folio, 1, &extra_pins)) {
ret = -EAGAIN; goto out_unlock;
}
unmap_folio(folio);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable(); if (mapping) { /*
 * Check if the folio is present in page cache.
 * We assume all tail are present too, if folio is there.
 */
xas_lock(&xas);
xas_reset(&xas); if (xas_load(&xas) != folio) {
ret = -EAGAIN; goto fail;
}
}
/*
 * NOTE(review): the spin_unlock() of ds_queue->split_queue_lock below has
 * no visible matching spin_lock() in this block, and extra_pins is never
 * read after can_split_folio() - presumably a lock plus refcount freeze
 * step belongs here; confirm.
 */
/* Take the folio off the deferred split queue, if it is queued. */
if (folio_order(folio) > 1 &&
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
} /*
 * Reinitialize page_deferred_list after removing the
 * page from the split_queue, otherwise a subsequent
 * split will see list corruption when checking the
 * page_deferred_list.
 */
list_del_init(&folio->_deferred_list);
}
/* NOTE(review): nr is not used anywhere in the visible block. */
spin_unlock(&ds_queue->split_queue_lock); if (mapping) { int nr = folio_nr_pages(folio);
ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
mapping, uniform_split);
/*
 * Unfreeze after-split folios and put them back to the right
 * list. @folio should be kept frozen until page cache
 * entries are updated with all the other after-split folios
 * to prevent others seeing stale page cache entries.
 * As a result, new_folio starts from the next folio of
 * @folio.
 *
 * NOTE(review): next is not assigned inside the visible loop body,
 * although the loop advances with new_folio = next - confirm.
 */ for (new_folio = folio_next(folio); new_folio != end_folio;
new_folio = next) { unsignedlong nr_pages = folio_nr_pages(new_folio);
/*
 * Anonymous folio with swap cache.
 * NOTE: shmem in swap cache is not supported yet.
 *
 * NOTE(review): swap_cache has no visible declaration in this block.
 */ if (swap_cache) {
__xa_store(&swap_cache->i_pages,
swap_cache_index(new_folio->swap),
new_folio, 0); continue;
}
/* Anonymous folio without swap cache */ if (!mapping) continue;
/* Add the new folio to the page cache. */ if (new_folio->index < end) {
__xa_store(&mapping->i_pages, new_folio->index,
new_folio, 0); continue;
}
/* Drop folio beyond EOF: ->index >= end */ if (shmem_mapping(mapping))
nr_shmem_dropped += nr_pages; elseif (folio_test_clear_dirty(new_folio))
folio_account_cleaned(
new_folio, inode_to_wb(mapping->host));
__filemap_remove_folio(new_folio, NULL);
folio_put_refs(new_folio, nr_pages);
} /*
 * Unfreeze @folio only after all page cache entries, which
 * used to point to it, have been updated with new folios.
 * Otherwise, a parallel folio_try_get() can grab @folio
 * and its caller can see stale page cache entries.
 *
 * NOTE(review): expected_refs and lruvec have no visible declaration in
 * this block.
 */
expected_refs = folio_expected_ref_count(folio) + 1;
folio_ref_unfreeze(folio, expected_refs);
unlock_page_lruvec(lruvec);
if (swap_cache)
xa_unlock(&swap_cache->i_pages);
} else {
/*
 * NOTE(review): as written this branch pairs with "if (mapping)" above and
 * unlocks split_queue_lock a second time; it presumably belongs to a
 * failed-freeze path that is not visible here - confirm.
 */
spin_unlock(&ds_queue->split_queue_lock);
ret = -EAGAIN;
}
fail: if (mapping)
xas_unlock(&xas);
local_irq_enable();
/* Uncharge the shmem pages that were dropped beyond EOF. */
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
/*
 * Unlock all after-split folios except the one containing
 * @lock_at page. If @folio is not split, it will be kept locked.
 */ for (new_folio = folio; new_folio != end_folio; new_folio = next) {
next = folio_next(new_folio); if (new_folio == page_folio(lock_at)) continue;
folio_unlock(new_folio); /*
 * Subpages may be freed if there wasn't any mapping
 * like if add_to_swap() is running on a lru page that
 * had its mapping zapped. And freeing these pages
 * requires taking the lru_lock so we do the put_page
 * of the tail pages after the split is complete.
 */
free_folio_and_swap_cache(new_folio);
}
out_unlock: if (anon_vma) {
anon_vma_unlock_write(anon_vma);
put_anon_vma(anon_vma);
} if (mapping)
i_mmap_unlock_read(mapping);
out:
/* Account the result; the vm event is only counted for PMD-order folios. */
xas_destroy(&xas); if (order == HPAGE_PMD_ORDER)
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret;
}
/* * This function splits a large folio into smaller folios of order @new_order. * @page can point to any page of the large folio to split. The split operation * does not change the position of @page. * * Prerequisites: * * 1) The caller must hold a reference on the @page's owning folio, also known * as the large folio. * * 2) The large folio must be locked. * * 3) The folio must not be pinned. Any unexpected folio references, including * GUP pins, will result in the folio not getting split; instead, the caller * will receive an -EAGAIN. * * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not * supported for non-file-backed folios, because folio->_deferred_list, which * is used by partially mapped folios, is stored in subpage 2, but an order-1 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, * since they do not use _deferred_list. * * After splitting, the caller's folio reference will be transferred to @page, * resulting in a raised refcount of @page after this call. The other pages may * be freed if they are not mapped. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * * Pages in @new_order will inherit the mapping, flags, and so on from the * huge page. * * Returns 0 if the huge page was split successfully. * * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if * the folio was concurrently removed from the page cache. * * Returns -EBUSY when trying to split the huge zeropage, if the folio is * under writeback, if fs-specific folio metadata cannot currently be * released, or if some unexpected race happened (e.g., anon VMA disappeared, * truncation). * * Callers should ensure that the order respects the address space mapping * min-order if one is set for non-anonymous folios. * * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios.
*/ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsignedint new_order)
{ struct folio *folio = page_folio(page);
/* * folio_split: split a folio at @split_at to a @new_order folio * @folio: folio to split * @new_order: the order of the new folio * @split_at: a page within the new folio * * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be * split but not to @new_order, the caller needs to check) * * It has the same prerequisites and returns as * split_huge_page_to_list_to_order(). * * Split a folio at @split_at to a new_order folio, leave the * remaining subpages of the original folio as large as possible. For example, * in the case of splitting an order-9 folio at its third order-3 subpages to * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio. * After the split, there will be a group of folios with different orders and * the new folio containing @split_at is marked in bracket: * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8]. * * After split, folio is left locked for caller.
*/ int folio_split(struct folio *folio, unsignedint new_order, struct page *split_at, struct list_head *list)
{ return __folio_split(folio, new_order, split_at, &folio->page, list, false);
}
int min_order_for_split(struct folio *folio)
{ if (folio_test_anon(folio)) return 0;
if (!folio->mapping) { if (folio_test_pmd_mappable(folio))
count_vm_event(THP_SPLIT_PAGE_FAILED); return -EBUSY;
}
/* * __folio_unqueue_deferred_split() is not to be called directly: * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h * limits its calls to those folios which may have a _deferred_list for * queueing THP splits, and that list is (racily observed to be) non-empty. * * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is * zero: because even when split_queue_lock is held, a non-empty _deferred_list * might be in use on deferred_split_scan()'s unlocked on-stack list. * * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is * therefore important to unqueue deferred split before changing folio memcg.
*/ bool __folio_unqueue_deferred_split(struct folio *folio)
{ struct deferred_split *ds_queue; unsignedlong flags; bool unqueued = false;
/* * Order 1 folios have no space for a deferred list, but we also * won't waste much memory by not adding them to the deferred list.
*/ if (folio_order(folio) <= 1) return;
if (!partially_mapped && !split_underused_thp) return;
/* * Exclude swapcache: originally to avoid a corrupt deferred split * queue. Nowadays that is fully prevented by memcg1_swapout(); * but if page reclaim is already handling the same folio, it is * unnecessary to handle it again in the shrinker, so excluding * swapcache here may still be a useful optimization.
*/ if (folio_test_swapcache(folio)) return;
spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) {
folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio))
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
/*
 * thp_underused() - decide whether @folio is mostly made of zero-filled pages.
 * @folio: folio whose subpages are compared against the zero page
 *
 * Each subpage is compared with ZERO_PAGE(0). The scan stops early as soon
 * as either threshold is crossed.
 *
 * Return: true once more than khugepaged_max_ptes_none subpages are identical
 * to the zero page. false when khugepaged_max_ptes_none equals
 * HPAGE_PMD_NR - 1, when the folio contains a hwpoisoned page, when at least
 * HPAGE_PMD_NR - khugepaged_max_ptes_none subpages are not zero-filled, or
 * when the scan ends without crossing the zero-page threshold.
 */
static bool thp_underused(struct folio *folio)
{
	int num_zero_pages = 0, num_filled_pages = 0;
	int i;

	/* With this setting the zero-page threshold below is never used. */
	if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
		return false;

	if (folio_contain_hwpoisoned_page(folio))
		return false;

	for (i = 0; i < folio_nr_pages(folio); i++) {
		if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) {
			if (++num_zero_pages > khugepaged_max_ptes_none)
				return true;
		} else {
			/*
			 * Another path for early exit once the number
			 * of non-zero filled pages exceeds threshold.
			 */
			if (++num_filled_pages >=
			    HPAGE_PMD_NR - khugepaged_max_ptes_none)
				return false;
		}
	}

	return false;
}
#ifdef CONFIG_MEMCG if (sc->memcg)
ds_queue = &sc->memcg->deferred_split_queue; #endif
spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */
list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
_deferred_list) { if (folio_try_get(folio)) {
list_move(&folio->_deferred_list, &list);
} else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--;
} if (!--sc->nr_to_scan) break;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
if (!folio_test_partially_mapped(folio)) {
underused = thp_underused(folio); if (!underused) goto next;
} if (!folio_trylock(folio)) goto next; if (!split_folio(folio)) {
did_split = true; if (underused)
count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
split++;
}
folio_unlock(folio);
next: /* * split_folio() removes folio from list on success. * Only add back to the queue if folio is partially mapped. * If thp_underused returns false, or if split_folio fails * in the case it was underused, then consider it used and * don't add it back to split_queue.
*/ if (did_split) {
; /* folio already removed from list */
} elseif (!folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
removed++;
} else { /* * That unlocked list_del_init() above would be unsafe, * unless its folio is separated from any earlier folios * left on the list (which may be concurrently unqueued) * by one safe folio with refcount still raised.
*/
swap(folio, prev);
} if (folio)
folio_put(folio);
}
/* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us.
*/ if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split;
}
mmap_read_lock(mm); /* * always increase addr by PAGE_SIZE, since we could have a PTE page * table filled with PTE-mapped THPs, each of which is distinct.
*/ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct folio_walk fw; struct folio *folio; struct address_space *mapping; unsignedint target_order = new_order;
if (!vma) break;
/* skip special VMA and hugetlb VMA */ if (vma_not_suitable_for_thp_split(vma)) {
addr = vma->vm_end; continue;
}
folio = folio_walk_start(&fw, vma, addr, 0); if (!folio) continue;
if (target_order >= folio_order(folio)) goto next;
total++; /* * For folios with private, split_huge_page_to_list_to_order() * will try to drop it before split and then check if the folio * can be split or not. So skip the check here.
*/ if (!folio_test_private(folio) &&
!can_split_folio(folio, 0, NULL)) goto next;
if (!folio_trylock(folio)) goto next;
folio_get(folio);
folio_walk_end(&fw, vma);
if (!folio_test_anon(folio) && folio->mapping != mapping) goto unlock;
if (in_folio_offset < 0 ||
in_folio_offset >= folio_nr_pages(folio)) { if (!split_folio_to_order(folio, target_order))
split++;
} else { struct page *split_at = folio_page(folio,
in_folio_offset); if (!folio_split(folio, target_order, split_at, NULL))
split++;
}
tok = strsep(&tok_buf, ","); if (tok && tok_buf) {
strscpy(file_path, tok);
} else {
ret = -EINVAL; goto out;
}
ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
&new_order, &in_folio_offset); if (ret != 2 && ret != 3 && ret != 4) {
ret = -EINVAL; goto out;
}
ret = split_huge_pages_in_file(file_path, off_start, off_end,
new_order, in_folio_offset); if (!ret)
ret = input_len;
goto out;
}
ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
&vaddr_end, &new_order, &in_folio_offset); if (ret == 1 && pid == 1) {
split_huge_pages_all();
ret = strlen(input_buf); goto out;
} elseif (ret != 3 && ret != 4 && ret != 5) {
ret = -EINVAL; goto out;
}
ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
in_folio_offset); if (!ret)
ret = strlen(input_buf);
out:
mutex_unlock(&split_debug_mutex); return ret;
/* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
trace_remove_migration_pmd(address, pmd_val(pmde));
} #endif
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.62 Sekunden
(vorverarbeitet am 2026-04-25)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.