// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */
/*
 * struct scan_control - parameters and running totals for one reclaim
 * invocation.
 *
 * The caller fills in the targets and permissions; the scan code reads
 * them and updates the counters (nr_scanned, nr_reclaimed) and the
 * status bits as it goes.
 */
struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long anon_cost;
	unsigned long file_cost;

	/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
	int *proactive_swappiness;

	/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped folios be reclaimed? */
	unsigned int may_unmap:1;

	/* Can folios be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* Not allow cache_trim_mode to be turned on as part of reclaim? */
	unsigned int no_cache_trim_mode:1;

	/* Has cache_trim_mode failed at least once? */
	unsigned int cache_trim_mode_failed:1;

	/* Proactive reclaim invoked by userspace */
	unsigned int proactive:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the protected
	 * memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	/* Shared cgroup tree walk failed, rescan the whole tree */
	unsigned int memcg_full_walk:1;

	/*
	 * NOTE(review): undocumented in the original; presumably set when
	 * reclaim runs on behalf of hibernation - confirm against the users.
	 */
	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file folios on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate folios for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCHW
/*
 * prefetchw_prev_lru_folio - prefetch-for-write a field of a neighbouring
 * folio on an LRU list.
 * @_folio: folio currently being looked at
 * @_base: list head the walk is bounded by
 * @_field: member of struct folio whose address is handed to prefetchw()
 *
 * Nothing is prefetched when @_folio's lru.prev link already points at
 * @_base.  Otherwise the neighbour is looked up with lru_to_folio() on
 * @_folio's own lru node (presumably yielding the previous folio on the
 * list - confirm against lru_to_folio()).
 *
 * Every use of a macro argument is parenthesised so that an expression
 * argument (e.g. a conditional or a dereference) expands as intended.
 */
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != (_base)) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&((_folio)->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
/* No prefetchw() on this architecture: expand to an empty statement. */
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
/*
 * Global swappiness setting.
 * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
 * Initialised to 60 here.
 */
int vm_swappiness = 60;
#ifdef CONFIG_MEMCG
/*
 * Returns true for reclaim through cgroup limits or cgroup interfaces,
 * i.e. whenever the scan has a target memory cgroup set.
 */
static bool cgroup_reclaim(struct scan_control *sc)
{
	/* A non-NULL target pointer converts to true. */
	return sc->target_mem_cgroup;
}
/*
 * Returns true for reclaim on the root cgroup. This is true for direct
 * allocator reclaim (no target memcg at all) and reclaim through cgroup
 * interfaces on the root cgroup.
 */
static bool root_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup ||
	       mem_cgroup_is_root(sc->target_mem_cgroup);
}
/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_folio_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 *
 * Return: true when the reclaim is not cgroup reclaim, or when
 * CONFIG_CGROUP_WRITEBACK is built in and the memory controller is on the
 * default hierarchy; false otherwise.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	if (!cgroup_reclaim(sc))
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
 * and including the specified highidx
 * @zone: The current zone in the iterator
 * @pgdat: The pgdat which node_zones are being iterated
 * @idx: The index variable
 * @highidx: The index of the highest zone to return
 *
 * This macro iterates through all managed zones up to and including the specified highidx.
 * The zone iterator enters an invalid state after macro call and must be reinitialized
 * before it can be used again.
 *
 * The expansion deliberately ends in "if (...) continue; else" so that the
 * statement or block written after the macro becomes the else-branch: it
 * runs only for zones for which managed_zone() is true, and an "else"
 * written by the caller cannot accidentally pair with the macro's "if".
 */
#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
	    (idx) <= (highidx);					\
	    (idx)++, (zone)++)					\
		if (!managed_zone(zone))			\
			continue;				\
		else
/*
 * set_task_reclaim_state - install or clear a task's reclaim_state pointer.
 * @task: task whose ->reclaim_state is updated
 * @rs: new reclaim state, or NULL to clear it
 *
 * Warns (once) when the call would overwrite an existing non-NULL state or
 * clear a state that is already NULL; the assignment is performed either way.
 */
static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}
/*
 * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
 * scan_control->nr_reclaimed.
 *
 * Only acts when the current task has a reclaim_state and the scan is root
 * reclaim; in that case the pending count is moved into @sc and reset to 0.
 */
static void flush_reclaim_state(struct scan_control *sc)
{
	/*
	 * Currently, reclaim_state->reclaimed includes three types of pages
	 * freed outside of vmscan:
	 * (1) Slab pages.
	 * (2) Clean file pages from pruned inodes (on highmem systems).
	 * (3) XFS freed buffer pages.
	 *
	 * For all of these cases, we cannot universally link the pages to a
	 * single memcg. For example, a memcg-aware shrinker can free one object
	 * charged to the target memcg, causing an entire page to be freed.
	 * If we count the entire page as reclaimed from the memcg, we end up
	 * overestimating the reclaimed amount (potentially under-reclaiming).
	 *
	 * Only count such pages for global reclaim to prevent under-reclaiming
	 * from the target memcg; preventing unnecessary retries during memcg
	 * charging and false positives from proactive reclaim.
	 *
	 * For uncommon cases where the freed pages were actually mostly
	 * charged to the target memcg, we end up underestimating the reclaimed
	 * amount. This should be fine. The freed pages will be uncharged
	 * anyway, even if they are not counted here properly, and we will be
	 * able to make forward progress in charging (which is usually in a
	 * retry loop).
	 *
	 * We can go one step further, and report the uncharged objcg pages in
	 * memcg reclaim, to make reporting more accurate and reduce
	 * underestimation, but it's probably not worth the complexity for now.
	 */
	if (current->reclaim_state && root_reclaim(sc)) {
		sc->nr_reclaimed += current->reclaim_state->reclaimed;
		current->reclaim_state->reclaimed = 0;
	}
}
if (!numa_demotion_enabled) returnfalse; if (sc && sc->no_demotion) returnfalse;
demotion_nid = next_demotion_node(nid); if (demotion_nid == NUMA_NO_NODE) returnfalse;
/* If demotion node isn't in the cgroup's mems_allowed, fall back */ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
/*
 * can_reclaim_anon_pages - can anonymous memory be reclaimed at all?
 * @memcg: memory cgroup being reclaimed, or NULL for non-memcg reclaim
 * @nid: node the pages live on
 * @sc: scan control, passed through to can_demote()
 *
 * Return: true if swap space is available (globally when @memcg is NULL,
 * otherwise within @memcg), or failing that whatever can_demote() reports
 * for @nid.
 */
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
					  int nid,
					  struct scan_control *sc)
{
	if (memcg == NULL) {
		/*
		 * For non-memcg reclaim, is there
		 * space in any swap device?
		 */
		if (get_nr_swap_pages() > 0)
			return true;
	} else {
		/* Is the memcg below its swap limit? */
		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
			return true;
	}

	/*
	 * The page can not be swapped.
	 *
	 * Can it be reclaimed from this node via demotion?
	 */
	return can_demote(nid, sc, memcg);
}
/*
 * zone_reclaimable_pages - estimate how many pages of @zone could be
 * reclaimed.
 *
 * Sums the file LRU counters, adds the anon LRU counters when
 * can_reclaim_anon_pages() allows it, and falls back to the free page
 * count when that sum is zero.
 *
 * This misses isolated folios which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated folios will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
	/*
	 * If there are no reclaimable file-backed or anonymous pages,
	 * ensure zones with sufficient free pages are not skipped.
	 * This prevents zones like DMA32 from being ignored in reclaim
	 * scenarios where they can still help alleviate memory pressure.
	 */
	if (nr == 0)
		nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);

	return nr;
}
/** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/ staticunsignedlong lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{ unsignedlong size = 0; int zid; struct zone *zone;
if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; if (sc->proactive) return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
/*
 * is_page_cache_freeable - does @folio have only the expected references?
 *
 * Return: non-zero when the reference count, less one if the folio has
 * private data, equals 1 + folio_nr_pages(folio); zero otherwise.
 */
static inline int is_page_cache_freeable(struct folio *folio)
{
	/*
	 * A freeable page cache folio is referenced only by the caller
	 * that isolated the folio, the page cache and optional filesystem
	 * private data at folio->private.
	 */
	return folio_ref_count(folio) - folio_test_private(folio) ==
		1 + folio_nr_pages(folio);
}
/*
 * We detected a synchronous write error writing a folio out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the folio and once
 * that folio is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping folio_lock() here because we know the caller has
 * __GFP_FS.
 *
 * @mapping: address space the folio was being written for
 * @folio: folio whose writeout failed
 * @error: error code to record on @mapping
 */
static void handle_write_error(struct address_space *mapping,
			       struct folio *folio, int error)
{
	folio_lock(folio);
	/* Only record the error if the folio still belongs to @mapping. */
	if (folio_mapping(folio) == mapping)
		mapping_set_error(mapping, error);
	folio_unlock(folio);
}
staticbool skip_throttle_noprogress(pg_data_t *pgdat)
{ int reclaimable = 0, write_pending = 0; int i; struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM.
*/ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) returntrue;
/* * If there are a lot of dirty/writeback folios then do not * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback.
*/
for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
} if (2 * write_pending <= reclaimable) returntrue;
/* * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads).
*/ if (!current_is_kswapd() &&
current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
cond_resched(); return;
}
/* * These figures are pulled out of thin air. * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many * parallel reclaimers which is a short-lived event so the timeout is * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU.
*/ switch(reason) { case VMSCAN_THROTTLE_WRITEBACK:
timeout = HZ/10;
if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
WRITE_ONCE(pgdat->nr_reclaim_start,
node_page_state(pgdat, NR_THROTTLED_WRITTEN));
}
break; case VMSCAN_THROTTLE_CONGESTED:
fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: if (skip_throttle_noprogress(pgdat)) {
cond_resched(); return;
}
/*
 * Account for folios written if tasks are throttled waiting on dirty
 * folios to clean. If enough folios have been cleaned since throttling
 * started then wakeup the throttled tasks.
 *
 * @pgdat: node whose throttled-writeback accounting is updated
 * @folio: folio that has just been written
 * @nr_throttled: number of currently throttled tasks; scales the wakeup
 *                threshold (SWAP_CLUSTER_MAX per task)
 */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
			      int nr_throttled)
{
	unsigned long nr_written;

	node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);

	/*
	 * This is an inaccurate read as the per-cpu deltas may not
	 * be synchronised. However, given that the system is
	 * writeback throttled, it is not worth taking the penalty
	 * of getting an accurate count. At worst, the throttle
	 * timeout guarantees forward progress.
	 */
	nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
		READ_ONCE(pgdat->nr_reclaim_start);

	if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
		wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}
/* possible outcome of pageout() */
typedef enum {
	/* failed to write folio out, folio is locked */
	PAGE_KEEP,
	/* move folio to the active list, folio is locked */
	PAGE_ACTIVATE,
	/* folio has been sent to the disk successfully, folio is unlocked */
	PAGE_SUCCESS,
	/* folio is clean and locked */
	PAGE_CLEAN,
} pageout_t;
/* * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled * or we failed to allocate contiguous swap entries, in which case * the split out folios get added back to folio_list.
*/ if (shmem_mapping(mapping))
res = shmem_writeout(folio, plug, folio_list); else
res = swap_writeout(folio, plug);
if (res < 0)
handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) {
folio_clear_reclaim(folio); return PAGE_ACTIVATE;
}
/* synchronous write? */ if (!folio_test_writeback(folio))
folio_clear_reclaim(folio);
/*
 * pageout is called by shrink_folio_list() for each dirty folio.
 *
 * Decides whether @folio may be written from reclaim context and, if so,
 * hands it to writeout().  Returns one of the pageout_t outcomes.
 */
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
			 struct swap_iocb **plug, struct list_head *folio_list)
{
	/*
	 * We no longer attempt to writeback filesystem folios here, other
	 * than tmpfs/shmem.  That's taken care of in page-writeback.
	 * If we find a dirty filesystem folio at the end of the LRU list,
	 * typically that means the filesystem is saturating the storage
	 * with contiguous writes and telling it to write a folio here
	 * would only make the situation worse by injecting an element
	 * of random access.
	 *
	 * If the folio is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(folio))
		return PAGE_KEEP;

	if (!mapping) {
		/*
		 * Some data journaling orphaned folios can have
		 * folio->mapping == NULL while being dirty with clean buffers.
		 * If the private buffers can be dropped, the folio is clean.
		 */
		if (folio_test_private(folio) && try_to_free_buffers(folio)) {
			folio_clear_dirty(folio);
			pr_info("%s: orphaned folio\n", __func__);
			return PAGE_CLEAN;
		}
		return PAGE_KEEP;
	}

	/* Neither shmem-backed nor anonymous: leave it to the flushers. */
	if (!(shmem_mapping(mapping) || folio_test_anon(folio)))
		return PAGE_ACTIVATE;

	/* Nothing left to write once the dirty state has been claimed. */
	if (!folio_clear_dirty_for_io(folio))
		return PAGE_CLEAN;

	return writeout(folio, mapping, plug, folio_list);
}
/* * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0.
*/ staticint __remove_mapping(struct address_space *mapping, struct folio *folio, bool reclaimed, struct mem_cgroup *target_memcg)
{ int refcount; void *shadow = NULL;
if (!folio_test_swapcache(folio))
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has * a ref to the folio, it may be possible that they dirty it then * drop the reference. So if the dirty flag is tested before the * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); * !folio_test_dirty(folio) [good] * folio_set_dirty(folio); * folio_put(folio); * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the folio->flags * load is not satisfied before that of folio->_refcount. * * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required.
*/
refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) {
folio_ref_unfreeze(folio, refcount); goto cannot_free;
}
if (folio_test_swapcache(folio)) {
swp_entry_t swap = folio->swap;
free_folio = mapping->a_ops->free_folio; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. * * But don't store shadows in an address space that is * already exiting. This is not just an optimization, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. * * We also don't store shadows for DAX mappings because the * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space.
*/ if (reclaimed && folio_is_file_lru(folio) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(folio, target_memcg);
__filemap_remove_folio(folio, shadow);
xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (free_folio)
free_folio(folio);
}
return 1;
cannot_free:
xa_unlock_irq(&mapping->i_pages); if (!folio_test_swapcache(folio))
spin_unlock(&mapping->host->i_lock); return 0;
}
/**
 * remove_mapping() - Attempt to remove a folio from its mapping.
 * @mapping: The address space.
 * @folio: The folio to remove.
 *
 * If the folio is dirty, under writeback or if someone else has a ref
 * on it, removal will fail.
 * Return: The number of pages removed from the mapping.  0 if the folio
 * could not be removed.
 * Context: The caller should have a single refcount on the folio and
 * hold its lock.
 */
long remove_mapping(struct address_space *mapping, struct folio *folio)
{
	/* Removal failed: the folio stays where it is. */
	if (!__remove_mapping(mapping, folio, false, NULL))
		return 0;

	/*
	 * Unfreezing the refcount with 1 effectively
	 * drops the pagecache ref for us without requiring another
	 * atomic operation.
	 */
	folio_ref_unfreeze(folio, 1);
	return folio_nr_pages(folio);
}
/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Add previously isolated @folio to appropriate LRU list.
 * The folio may still be unevictable for other reasons.
 *
 * The folio is added to the LRU first and only then is the reference
 * taken at isolation time dropped.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void folio_putback_lru(struct folio *folio)
{
	folio_add_lru(folio);
	folio_put(folio);		/* drop ref from isolate */
}
#ifdef CONFIG_LRU_GEN /* * Only used on a mapped folio in the eviction (rmap walk) path, where promotion * needs to be done by taking the folio off the LRU list and then adding it back * with PG_active set. In contrast, the aging (page table walk) path uses * folio_update_gen().
*/ staticbool lru_gen_set_refs(struct folio *folio)
{ /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); returnfalse;
}
/* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. * Let the folio, now marked Mlocked, be moved to the unevictable list.
*/ if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE;
/* * There are two cases to consider. * 1) Rmap lock contention: rotate. * 2) Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process.
*/ if (referenced_ptes == -1) return FOLIOREF_KEEP;
if (lru_gen_enabled()) { if (!referenced_ptes) return FOLIOREF_RECLAIM;
if (referenced_ptes) { /* * All mapped folios start out with page table * references from the instantiating fault, so we need * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * * Note: the mark is set for activated folios as well * so that recently deactivated but used folios are * quickly recovered.
*/
folio_set_referenced(folio);
if (referenced_folio || referenced_ptes > 1) return FOLIOREF_ACTIVATE;
/* * Activate file-backed executable folios after first usage.
*/ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return FOLIOREF_ACTIVATE;
return FOLIOREF_KEEP;
}
/* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) return FOLIOREF_RECLAIM_CLEAN;
return FOLIOREF_RECLAIM;
}
/* Check if a folio is dirty or under writeback */ staticvoid folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback)
{ struct address_space *mapping;
/* * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed.
*/ if (!folio_is_file_lru(folio) ||
(folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
*dirty = false;
*writeback = false; return;
}
/* By default assume that the folio flags are accurate */
*dirty = folio_test_dirty(folio);
*writeback = folio_test_writeback(folio);
/* Verify dirty/writeback state if the filesystem supports it */ if (!folio_test_private(folio)) return;
allowed_mask = mtc->nmask; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are * low on free memory on target node. If we don't do this and if * we have free memory on the slower(lower) memtier, we would start * allocating pages from slower(lower) memory tiers without even forcing * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers.
*/
mtc->nmask = NULL;
mtc->gfp_mask |= __GFP_THISNODE;
dst = alloc_migration_target(src, (unsignedlong)mtc); if (dst) return dst;
/*
 * Take folios on @demote_folios and attempt to demote them to another node.
 * Folios which are not demoted are left on @demote_folios.
 *
 * @demote_folios: list of folios to try to migrate
 * @pgdat: node the folios currently live on
 *
 * Return: the success count reported by migrate_pages(), or 0 when the
 * list is empty or @pgdat has no demotion target.
 */
static unsigned int demote_folio_list(struct list_head *demote_folios,
				      struct pglist_data *pgdat)
{
	int target_nid = next_demotion_node(pgdat->node_id);
	unsigned int nr_succeeded;
	nodemask_t allowed_mask;

	struct migration_target_control mtc = {
		/*
		 * Allocate from 'node', or fail quickly and quietly.
		 * When this happens, 'page' will likely just be discarded
		 * instead of migrated.
		 */
		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
			__GFP_NOMEMALLOC | GFP_NOWAIT,
		.nid = target_nid,
		.nmask = &allowed_mask,
		.reason = MR_DEMOTION,
	};

	if (list_empty(demote_folios))
		return 0;

	if (target_nid == NUMA_NO_NODE)
		return 0;

	/* Fill in the mask that mtc.nmask points at before migrating. */
	node_get_allowed_targets(pgdat, &allowed_mask);

	/* Demotion ignores all cpuset and mempolicy settings */
	migrate_pages(demote_folios, alloc_demote_folio, NULL,
		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
		      &nr_succeeded);

	return nr_succeeded;
}
/*
 * may_enter_fs - may reclaim of @folio call into the filesystem / do I/O?
 * @folio: folio under consideration
 * @gfp_mask: allocation context of the reclaimer
 *
 * Return: true if @gfp_mask has __GFP_FS, or if @folio is in the swap cache,
 * @gfp_mask has __GFP_IO and the folio's swap flags do not include
 * SWP_FS_OPS; false otherwise.
 */
static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
{
	if (gfp_mask & __GFP_FS)
		return true;
	if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
		return false;
	/*
	 * We can "enter_fs" for swap-cache with only __GFP_IO
	 * providing this isn't SWP_FS_OPS.
	 * ->flags can be updated non-atomically (scan_swap_map_slots),
	 * but that will never affect SWP_FS_OPS, so the data_race
	 * is safe.
	 */
	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}
if (folio_contain_hwpoisoned_page(folio)) { /* * unmap_poisoned_folio() can't handle large * folio, just skip it. memory_failure() will * handle it if the UCE is triggered again.
*/ if (folio_test_large(folio)) goto keep_locked;
/* Account the number of base pages */
sc->nr_scanned += nr_pages;
if (unlikely(!folio_evictable(folio))) goto activate_locked;
if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked;
/* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing * folios if the tail of the LRU is all dirty unqueued folios.
*/
folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback)
stat->nr_dirty += nr_pages;
if (dirty && !writeback)
stat->nr_unqueued_dirty += nr_pages;
/* * Treat this folio as congested if folios are cycling * through the LRU so quickly that the folios marked * for immediate reclaim are making it to the end of * the LRU a second time.
*/ if (writeback && folio_test_reclaim(folio))
stat->nr_congested += nr_pages;
/* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number * of folios under writeback and this folio has both * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an * indefinite stall if it is impossible to writeback * the folio due to I/O error or disconnected storage * so instead note that the LRU is being scanned too * quickly and the caller can stall after the folio * list has been processed. * * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs), or the folio belongs to a mapping where * waiting on writeback during reclaim may lead to a deadlock. * In this case mark the folio for immediate reclaim and * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a folio that already has the * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the folios to get them out of * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. 
* Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk.
*/ if (folio_test_writeback(folio)) {
mapping = folio_mapping(folio);
/* Case 1 above */ if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate += nr_pages; goto activate_locked;
/* Case 2 above */
} elseif (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
!may_enter_fs(folio, sc->gfp_mask) ||
(mapping &&
mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have * just cleared the reclaim flag, then * setting the reclaim flag here ends up * interpreted as the readahead flag - but * that does not matter enough to care. * What we do want is for this folio to * have the reclaim flag set next time * memcg reclaim reaches the tests above, * so it will then wait for writeback to * avoid OOM; and it's also appropriate * in global reclaim.
*/
folio_set_reclaim(folio);
stat->nr_writeback += nr_pages; goto activate_locked;
/* Case 3 above */
} else {
folio_unlock(folio);
folio_wait_writeback(folio); /* then go back and try same folio again */
list_add_tail(&folio->lru, folio_list); continue;
}
}
if (!ignore_references)
references = folio_check_references(folio, sc);
switch (references) { case FOLIOREF_ACTIVATE: goto activate_locked; case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages; goto keep_locked; case FOLIOREF_RECLAIM: case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
}
/* * Before reclaiming the folio, try to relocate * its contents to another node.
*/ if (do_demote_pass &&
(thp_migration_supported() || !folio_test_large(folio))) {
list_add(&folio->lru, &demote_folios);
folio_unlock(folio); continue;
}
/* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree folio could be freed directly
*/ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ if (!can_split_folio(folio, 1, NULL)) goto activate_locked; /* * Split partially mapped folios right away. * We can free the unmapped pages without IO.
*/ if (data_race(!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio)) &&
split_folio_to_list(folio, folio_list)) goto activate_locked;
} if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages >= HPAGE_PMD_NR) {
count_memcg_folio_events(folio,
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
} #endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) goto activate_locked_split;
} /* * Normally the folio will be dirtied in unmap because its * pte should be dirty. A special case is MADV_FREE page. The * page's pte could have dirty bit cleared but the folio's * SwapBacked flag is still set because clearing the dirty bit * and SwapBacked flag has no lock protected. For such folio, * unmap will not set dirty bit for it, so folio reclaim will * not write the folio out. This can cause data corruption when * the folio is swapped in later. Always setting the dirty flag * for the folio solves the problem.
*/
folio_mark_dirty(folio);
}
}
/* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted * then.
*/ if ((nr_pages > 1) && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/* * The folio is mapped into the page tables of one or more * processes. Try to unmap it here.
*/ if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = folio_test_swapbacked(folio);
if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD; /* * Without TTU_SYNC, try_to_unmap will only begin to * hold PTL from the first present PTE within a large * folio. Some initial PTEs might be skipped due to * races with parallel PTE writes in which PTEs can be * cleared temporarily before being written new present * values. This will lead to a large folio is still * mapped while some subpages have been partially * unmapped after try_to_unmap; TTU_SYNC helps * try_to_unmap acquire PTL from the first PTE, * eliminating the influence of temporary PTE values.
*/ if (folio_test_large(folio))
flags |= TTU_SYNC;
try_to_unmap(folio, flags); if (folio_mapped(folio)) {
stat->nr_unmap_fail += nr_pages; if (!was_swapbacked &&
folio_test_swapbacked(folio))
stat->nr_lazyfree_fail += nr_pages; goto activate_locked;
}
}
/* * Folio is unmapped now so it cannot be newly pinned anymore. * No point in trying to reclaim folio if it is pinned. * Furthermore we don't want to reclaim underlying fs metadata * if the folio is pinned and thus potentially modified by the * pinning process as that may upset the filesystem.
*/ if (folio_maybe_dma_pinned(folio)) goto activate_locked;
mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only * write folios when we've encountered many * dirty folios, and when we've already scanned * the rest of the LRU for clean folios and see * the same dirty folios again (with the reclaim * flag set).
*/ if (folio_is_file_lru(folio) &&
(!current_is_kswapd() ||
!folio_test_reclaim(folio) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty
*/
node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
nr_pages);
folio_set_reclaim(folio);
goto activate_locked;
}
if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked;
/* * Folio is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after I/O * starts and then write it out here.
*/
try_to_unmap_flush_dirty(); switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: /* * If shmem folio is split when writeback to swap, * the tail pages will make their own pass through * this function and be accounted then.
*/ if (nr_pages > 1 && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
} goto activate_locked; case PAGE_SUCCESS: if (nr_pages > 1 && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
stat->nr_pageout += nr_pages;
if (folio_test_writeback(folio)) goto keep; if (folio_test_dirty(folio)) goto keep;
/* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the folio.
*/ if (!folio_trylock(folio)) goto keep; if (folio_test_dirty(folio) ||
folio_test_writeback(folio)) goto keep_locked;
mapping = folio_mapping(folio);
fallthrough; case PAGE_CLEAN:
; /* try to free the folio below */
}
}
/* * If the folio has buffers, try to free the buffer * mappings associated with this folio. If we succeed * we try to free the folio as well. * * We do this even if the folio is dirty. * filemap_release_folio() does not perform I/O, but it * is possible for a folio to have the dirty flag set, * but it is actually clean (all its buffers are clean). * This happens if the buffers were written out directly, * with submit_bh(). ext3 will do this, as well as * the blockdev mapping. filemap_release_folio() will * discover that cleanness and will drop the buffers * and mark the folio clean - it can be freed. * * Rarely, folios can have buffers and no ->mapping. * These are the folios which were not successfully * invalidated in truncate_cleanup_folio(). We try to * drop those buffers here and if that worked, and the * folio is no longer mapped into process address space * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable.
*/ if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) {
folio_unlock(folio); if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU).
*/
nr_reclaimed += nr_pages; continue;
}
}
}
if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* * The folio has only one reference left, which is * from the isolation. After the caller puts the * folio back on the lru and drops the reference, the * folio will be freed anyway. It doesn't matter * which lru it goes on. So we don't bother checking * the dirty flag here.
*/
count_vm_events(PGLAZYFREED, nr_pages);
count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
} elseif (!mapping || !__remove_mapping(mapping, folio, true,
sc->target_mem_cgroup)) goto keep_locked;
folio_unlock(folio);
free_it: /* * Folio may get swapped out as a whole, need to account * all pages in it.
*/
nr_reclaimed += nr_pages;
activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages.
*/ if (nr_pages > 1) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) &&
(mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
folio_free_swap(folio);
VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio);
folio_set_active(folio);
stat->nr_activate[type] += nr_pages;
count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
}
keep_locked:
folio_unlock(folio);
keep:
list_add(&folio->lru, &ret_folios);
VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
folio_test_unevictable(folio), folio);
} /* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
nr_demoted = demote_folio_list(&demote_folios, pgdat);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */
list_splice_init(&demote_folios, folio_list);
/* * goto retry to reclaim the undemoted folios in folio_list if * desired. * * Reclaiming directly from top tier nodes is not often desired * due to it breaking the LRU ordering: in general memory * should be reclaimed from lower tier nodes and demoted from * top tier nodes. * * However, disabling reclaim from top tier nodes entirely * would cause ooms in edge scenarios where lower tier memory * is unreclaimable for whatever reason, eg memory being * mlocked or too hot to reclaim. We can disable reclaim * from top tier nodes in proactive reclaim though as that is * not real memory pressure.
*/ if (!sc->proactive) {
do_demote_pass = false; goto retry;
}
}
list_for_each_entry_safe(folio, next, folio_list, lru) { /* TODO: these pages should not even appear in this list. */ if (page_has_movable_ops(&folio->page)) continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
!folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
}
/* * We should be safe here since we are only dealing with file pages and * we are not kswapd and therefore cannot write dirty file pages. But * call memalloc_noreclaim_save() anyway, just in case these conditions * change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
&stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to * discard so isolated count will be mismatched. * Compensate the isolated count for both LRU lists.
*/
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
stat.nr_lazyfree_fail);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)stat.nr_lazyfree_fail); return nr_reclaimed;
}
/* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check.
*/ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsignedlong *nr_zone_taken)
{ int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue;
/* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst.
*/ staticunsignedlong isolate_lru_folios(unsignedlong nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsignedlong *nr_scanned, struct scan_control *sc, enum lru_list lru)
{ struct list_head *src = &lruvec->lists[lru]; unsignedlong nr_taken = 0; unsignedlong nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsignedlong nr_skipped[MAX_NR_ZONES] = { 0, }; unsignedlong skipped = 0, total_scan = 0, scan = 0; unsignedlong nr_pages; unsignedlong max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
/* Using max_nr_skipped to prevent hard LOCKUP*/ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
(folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
max_nr_skipped++; goto move;
}
/* * Do not count skipped folios because that makes the function * return with no isolated folios if the LRU mostly contains * ineligible folios. This causes the VM to not reclaim any * folios, triggering a premature OOM. * Account all pages in a folio.
*/
scan += nr_pages;
if (!folio_test_lru(folio)) goto move; if (!sc->may_unmap && folio_mapped(folio)) goto move;
/* * Be careful not to clear the lru flag until after we're * sure the folio is not being freed elsewhere -- the * folio release code relies on it.
*/ if (unlikely(!folio_try_get(folio))) goto move;
if (!folio_test_clear_lru(folio)) { /* Another thread is already isolating this folio */
folio_put(folio); goto move;
}
/* * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles.
*/ if (!list_empty(&folios_skipped)) { int zid;
list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue;
/** * folio_isolate_lru() - Try to isolate a folio from its LRU list. * @folio: Folio to isolate from its LRU list. * * Isolate a @folio from an LRU list and adjust the vmstat statistic * corresponding to whatever LRU list the folio was on. * * The folio will have its LRU flag cleared. If it was found on the * active list, it will have the Active flag set. If it was found on the * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * * Return: true if the folio was removed from an LRU list. * false if the folio was not on an LRU list.
*/ bool folio_isolate_lru(struct folio *folio)
{ bool ret = false;
VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
if (folio_test_clear_lru(folio)) { struct lruvec *lruvec;
/* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM.
*/ staticbool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc)
{ unsignedlong inactive, isolated; bool too_many;
/* * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they * won't get blocked by normal direct-reclaimers, forming a circular * deadlock.
*/ if (gfp_has_io_fs(sc->gfp_mask))
inactive >>= 3;
too_many = isolated > inactive;
/* Wake up tasks throttled due to too_many_isolated. */ if (!too_many)
wake_throttle_isolated(pgdat);
return too_many;
}
/* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * * Returns the number of pages moved to the given lruvec.
*/ staticunsignedint move_folios_to_lru(struct lruvec *lruvec, struct list_head *list)
{ int nr_pages, nr_moved = 0; struct folio_batch free_folios;
folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list);
/* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock * folio_set_lru() * list_add(&folio->lru,) * list_add(&folio->lru,)
*/
folio_set_lru(folio);
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
/* * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration).
*/
VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages; if (folio_test_active(folio))
workingset_age_nonresident(lruvec, nr_pages);
}
if (free_folios.nr) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
spin_lock_irq(&lruvec->lru_lock);
}
return nr_moved;
}
/*
 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
 * we should not throttle. Otherwise it is safe to do so.
 *
 * Returns 0 when the current task has PF_LOCAL_THROTTLE set, 1 otherwise.
 */
static int current_may_throttle(void)
{
	return !(current->flags & PF_LOCAL_THROTTLE);
}
/* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages
*/ staticunsignedlong shrink_inactive_list(unsignedlong nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(folio_list); unsignedlong nr_scanned; unsignedint nr_reclaimed = 0; unsignedlong nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) return 0;
/* wait a bit for the reclaimer. */
stalled = true;
reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX;
}
/* * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep.
*/ if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN); /* * For cgroupv1 dirty throttling is achieved by waking up * the kernel flusher here and later waiting on folios * which are in writeback to finish (see shrink_folio_list()). * * Flusher may not be able to issue writeback quickly * enough for cgroupv1 writeback throttling to work * on a large system.
*/ if (!writeback_throttling_sane(sc))
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/* * shrink_active_list() moves folios from the active LRU to the inactive LRU. * * We move them the other way if the folio is referenced by one or more * processes. * * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the folios are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each folio. It's impossible to balance * this, so instead we remove the folios from the LRU while processing them. * It is safe to rely on the active flag against the non-LRU folios in here * because nobody will play with that bit on a non-LRU folio. * * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway.
*/ staticvoid shrink_active_list(unsignedlong nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{ unsignedlong nr_taken; unsignedlong nr_scanned;
vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec);
if (unlikely(!folio_evictable(folio))) {
folio_putback_lru(folio); continue;
}
if (unlikely(buffer_heads_over_limit)) { if (folio_needs_release(folio) &&
folio_trylock(folio)) {
filemap_release_folio(folio, 0);
folio_unlock(folio);
}
}
/* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup,
&vm_flags) != 0) { /* * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here.
*/ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
nr_rotated += folio_nr_pages(folio);
list_add(&folio->lru, &l_active); continue;
}
}
folio_clear_active(folio); /* we are de-activating */
folio_set_workingset(folio);
list_add(&folio->lru, &l_inactive);
}
/* * Move folios back to the lru list.
*/
spin_lock_irq(&lruvec->lru_lock);
/* * The inactive anon list should be small enough that the VM never has * to do too much work. * * The inactive file list should be small enough to leave most memory * to the established workingset on the scan-resistant active list, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB
*/ staticbool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
{ enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsignedlong inactive, active; unsignedlong inactive_ratio; unsignedlong gb;
/* * Flush the memory cgroup stats in rate-limited way as we don't need * most accurate stats here. We may switch to regular stats flushing * in the future once it is cheap enough.
*/
mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup);
/* * Determine the scan balance between anon and file LRUs.
*/
spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
spin_unlock_irq(&target_lruvec->lru_lock);
/* * Target desirable inactive:active list ratios for the anon * and file LRU lists.
*/ if (!sc->force_deactivate) { unsignedlong refaults;
/* * When refaults are being observed, it means a new * workingset is being established. Deactivate to get * rid of any stale active pages quickly.
*/
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_ANON); if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON; else
sc->may_deactivate &= ~DEACTIVATE_ANON;
/* * If we have plenty of inactive file pages that aren't * thrashing, try to reclaim those first before touching * anonymous pages.
*/
file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
!sc->no_cache_trim_mode)
sc->cache_trim_mode = 1; else
sc->cache_trim_mode = 0;
/* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip * the scan balance towards the file LRU. And as the file LRU * shrinks, so does the window for rotation from references. * This means we have a runaway feedback loop where a tiny * thrashing file LRU becomes infinitely more attractive than * anon pages. Try to detect this based on file LRU size.
*/ if (!cgroup_reclaim(sc)) { unsignedlong total_high_wmark = 0; unsignedlong free, anon; int z; struct zone *zone;
/* * Consider anon: if that's low too, this isn't a * runaway file reclaim problem, but rather just * extreme pressure. Reclaim as per usual then.
*/
anon = node_page_state(pgdat, NR_INACTIVE_ANON);
/* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost.
*/
total_cost = sc->anon_cost + sc->file_cost;
anon_cost = total_cost + sc->anon_cost;
file_cost = total_cost + sc->file_cost;
total_cost = anon_cost + file_cost;
ap = swappiness * (total_cost + 1);
ap /= anon_cost + 1;
if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min * setting. * * This is important, as otherwise scanning aggression * becomes extremely binary -- from nothing as we * approach the memory protection threshold, to totally * nominal as we exceed it. This results in requiring * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. * * If there is any protection in place, we reduce scan * pressure by how much of the total memory used is * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low * protection. If that fails to reclaim enough pages to * satisfy the reclaim goal, we come back and override * the best-effort low protection. However, we still * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much * memory they are using, reducing the scan pressure * again by how much of the total memory used is under * hard protection.
*/ unsignedlong cgroup_size = mem_cgroup_size(memcg); unsignedlong protection;
/* memory.low scaling, make sure we retry before OOM */ if (!sc->memcg_low_reclaim && low > min) {
protection = low;
sc->memcg_low_skipped = 1;
} else {
protection = min;
}
/* * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable.
*/
scan = max(scan, SWAP_CLUSTER_MAX);
} return scan;
}
/* * Determine how aggressively the anon and file LRU lists should be * scanned. * * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
*/ staticvoid get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsignedlong *nr)
{ struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE; goto out;
}
/* * Global reclaim will swap to prevent OOM even with no * swappiness, but memcg users want to use this knob to * disable swapping for individual groups completely when * using the memory controller's swap limit feature would be * too expensive.
*/ if (cgroup_reclaim(sc) && !swappiness) {
scan_balance = SCAN_FILE; goto out;
}
/* Proactive reclaim initiated by userspace for anonymous memory only */ if (swappiness == SWAPPINESS_ANON_ONLY) {
WARN_ON_ONCE(!sc->proactive);
scan_balance = SCAN_ANON; goto out;
}
/* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping).
*/ if (!sc->priority && swappiness) {
scan_balance = SCAN_EQUAL; goto out;
}
/* * If the system is almost out of file pages, force-scan anon.
*/ if (sc->file_is_tiny) {
scan_balance = SCAN_ANON; goto out;
}
/* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now to make sure * a streaming file access pattern doesn't cause swapping.
*/ if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE; goto out;
}
/* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache.
*/ if (!scan && !mem_cgroup_online(memcg))
scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) { case SCAN_EQUAL: /* Scan lists relative to size */ break; case SCAN_FRACT: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. * Make sure we don't miss the last page on * the offlined memory cgroups because of a * round-off error.
*/
scan = mem_cgroup_online(memcg) ?
div64_u64(scan * fraction[file], denominator) :
DIV64_U64_ROUND_UP(scan * fraction[file],
denominator); break; case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file)
scan = 0; break; default: /* Look ma, no brain */
BUG();
}
nr[lru] = scan;
}
}
/*
 * Anonymous LRU management is a waste if there is
 * ultimately no way to reclaim the memory.
 *
 * @lruvec: LRU vector whose node and memcg are passed to can_demote()
 * @sc:     scan control of the current reclaim invocation
 *
 * Returns true if any swap is configured (total_swap_pages > 0); otherwise
 * returns whatever can_demote() reports for this lruvec's node and memcg.
 */
static bool can_age_anon_pages(struct lruvec *lruvec,
			       struct scan_control *sc)
{
	/* Aging the anon LRU is valuable if swap is present: */
	if (total_swap_pages > 0)
		return true;

	/* Also valuable if anon pages can be demoted: */
	return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
			  lruvec_memcg(lruvec));
}
/*
 * Get the min/max evictable type based on swappiness.
 *
 * min_type() evaluates to 1 when swappiness is zero and to 0 otherwise.
 * max_type() evaluates to 1 when swappiness is below SWAPPINESS_ANON_ONLY
 * and to 0 otherwise.
 * NOTE(review): presumably type 0 is anon and type 1 is file -- confirm.
 *
 * Each directive must sit on its own line: with both on one line the second
 * "#define" becomes part of min_type()'s replacement list.
 */
#define min_type(swappiness) (!(swappiness))
#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
/*
 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 * bits in a bitmap, k is the number of hash functions and n is the number of
 * inserted items.
 *
 * Page table walkers use one of the two filters to reduce their search space.
 * To get rid of non-leaf entries that no longer have enough leaf entries, the
 * aging uses the double-buffering technique to flip to the other filter each
 * time it produces a new generation. For non-leaf entries that have enough
 * leaf entries, the aging carries them over to the next generation in
 * walk_pmd_range(); the eviction also reports them when walking the rmap
 * in lru_gen_look_around().
 *
 * For future optimizations:
 * 1. It's not necessary to keep both filters all the time. The spare one can be
 *    freed after the RCU grace period and reallocated if needed again.
 * 2. And when reallocating, it's worth scaling its size according to the number
 *    of inserted entries in the other filter, to reduce the memory overhead on
 *    small systems and false positives on large systems.
 * 3. Jenkins' hash function is an alternative to Knuth's.
 *
 * NOTE(review): BLOOM_FILTER_SHIFT presumably is the shift in "m=1<<15"
 * above -- confirm at the use sites.
 */
#define BLOOM_FILTER_SHIFT 15
/****************************************************************************** * mm_struct list
******************************************************************************/
/* * mm_state->seq is incremented after each iteration of mm_list. There * are three interesting cases for this page table walker: * 1. It tries to start a new iteration with a stale max_seq: there is * nothing left to do. * 2. It started the next iteration: it needs to reset the Bloom filter * so that a fresh set of PTE tables can be recorded. * 3. It ended the current iteration: it needs to reset the mm stats * counters and tell its caller to increment max_seq.
*/
spin_lock(&mm_list->lock);
VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
if (walk->seq <= mm_state->seq) goto done;
if (!mm_state->head)
mm_state->head = &mm_list->fifo;
if (mm_state->head == &mm_list->fifo)
first = true;
do {
mm_state->head = mm_state->head->next; if (mm_state->head == &mm_list->fifo) {
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
last = true; break;
}
/* force scan for those added after the last iteration */ if (!mm_state->tail || mm_state->tail == mm_state->head) {
mm_state->tail = mm_state->head->next;
walk->force_scan = true;
}
} while (!(mm = get_next_mm(walk)));
done: if (*iter || last)
reset_mm_stats(walk, last);
spin_unlock(&mm_list->lock);
if (mm && first)
reset_bloom_filter(mm_state, walk->seq + 1);
/*
 * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
 *
 * The P term is refaulted/(evicted+protected) from a tier in the generation
 * currently being evicted; the I term is the exponential moving average of the
 * P term over the generations previously evicted, using the smoothing factor
 * 1/2; the D term isn't supported.
 *
 * The setpoint (SP) is always the first tier of one type; the process variable
 * (PV) is either any tier of the other type or any other tier of the same
 * type.
 *
 * The error is the difference between the SP and the PV; the correction is to
 * turn off protection when SP>PV or turn on protection when SP<PV.
 *
 * For future optimizations:
 * 1. The D term may discount the other two terms over time so that long-lived
 *    generations can resist stale information.
 */
struct ctrl_pos {
	/* numerator of the refaulted/total ratio compared between SP and PV */
	unsigned long refaulted;
	/* denominator of the refaulted/total ratio */
	unsigned long total;
	/* multiplier applied to the other position's side of the comparison */
	int gain;
};
staticvoid read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos)
{ int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]);
for (tier = 0; tier < MAX_NR_TIERS; tier++) { if (carryover) { unsignedlong sum;
sum = lrugen->avg_refaulted[type][tier] +
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
/*
 * Compare a process variable (PV) against the setpoint (SP).
 *
 * @sp: setpoint position
 * @pv: process variable position
 *
 * Return true if the PV has a limited number of refaults or a lower
 * refaulted/total than the SP.
 */
static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
{
	/* Too few refaults on the PV side to be significant. */
	if (pv->refaulted < MIN_LRU_BATCH)
		return true;

	/*
	 * Cross-multiplied form of
	 *   pv->refaulted / pv->total * sp->gain <=
	 *   (sp->refaulted + 1) / (sp->total + MIN_LRU_BATCH) * pv->gain
	 * so that no division is needed.
	 */
	return pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
	       (sp->refaulted + 1) * pv->total * pv->gain;
}
/****************************************************************************** * the aging
******************************************************************************/
/* promote pages accessed through page tables */ staticint folio_update_gen(struct folio *folio, int gen)
{ unsignedlong new_flags, old_flags = READ_ONCE(folio->flags);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
/* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return -1;
}
do { /* lru_gen_del_folio() has isolated this page? */ if (!(old_flags & LRU_GEN_MASK)) return -1;
staticvoid update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, int old_gen, int new_gen)
{ int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio);
if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) returntrue;
if (vma == get_gate_vma(vma->vm_mm)) returntrue;
if (vma_is_anonymous(vma)) return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) returntrue;
mapping = vma->vm_file->f_mapping; if (mapping_unevictable(mapping)) returntrue;
if (shmem_mapping(mapping)) return !walk->swappiness;
if (walk->swappiness > MAX_SWAPPINESS) returntrue;
/* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio;
}
/* * Some userspace memory allocators map many single-page VMAs. Instead of * returning back to the PGD table for each of such VMAs, finish an entire PMD * table to reduce zigzags and improve cache performance.
*/ staticbool get_next_vma(unsignedlong mask, unsignedlong size, struct mm_walk *args, unsignedlong *vm_start, unsignedlong *vm_end)
{ unsignedlong start = round_up(*vm_end, size); unsignedlong end = (start | ~mask) + 1;
VMA_ITERATOR(vmi, args->mm, start);
/* * Finish an entire PMD in two passes: the first only reaches to PTE * tables to avoid taking the PMD lock; the second, if necessary, takes * the PMD lock to clear the accessed bit in PMD entries.
*/
pmd = pmd_offset(pud, start & PUD_MASK);
restart: /* walk_pte_range() may call get_next_vma() */
vma = args->vma; for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
pmd_t val = pmdp_get_lockless(pmd + i);
next = pmd_addr_end(addr, end);
if (!pmd_present(val) || is_huge_zero_pmd(val)) {
walk->mm_stats[MM_LEAF_TOTAL]++; continue;
}
/* another thread might have called inc_max_seq() */ if (walk->seq != max_seq) break;
/* the caller might be holding the lock for write */ if (mmap_read_trylock(mm)) {
err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
mmap_read_unlock(mm);
}
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(walk);
spin_unlock_irq(&lruvec->lru_lock);
}
staticbool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{ int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
/* For file type, skip the check if swappiness is anon only */ if (type && (swappiness == SWAPPINESS_ANON_ONLY)) goto done;
/* For anon type, skip the check if swappiness is zero (file only) */ if (!type && !swappiness) goto done;
/* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio);
/* find the oldest populated generation */
for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
for (zone = 0; zone < MAX_NR_ZONES; zone++) { if (!list_empty(&lrugen->folios[gen][type][zone])) goto next;
}
min_seq[type]++;
seq_inc_flag = true;
}
next:
;
}
/* * If min_seq[type] of both anonymous and file is not increased, * we can directly return false to avoid unnecessary checking * overhead later.
*/ if (!seq_inc_flag) return success;
/* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { unsignedlong seq = lrugen->max_seq - MIN_NR_GENS;
/* * Update the active/inactive LRU sizes for compatibility. Both sides of * the current max_seq need to be covered, since max_seq+1 can overlap * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do * overlap, cold/hot inversion happens.
*/
prev = lru_gen_from_seq(lrugen->max_seq - 1);
next = lru_gen_from_seq(lrugen->max_seq + 1);
for (type = 0; type < ANON_AND_FILE; type++) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { enum lru_list lru = type * LRU_INACTIVE_FILE; long delta = lrugen->nr_pages[prev][type][zone] -
lrugen->nr_pages[next][type][zone];
if (!mm_state) return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) returnfalse;
/* * If the hardware doesn't automatically set the accessed bit, fallback * to lru_gen_look_around(), which only clears the accessed bit in a * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults.
*/ if (!should_walk_mmu()) {
success = iterate_mm_list_nowalk(lruvec, seq); goto done;
}
walk = set_mm_walk(NULL, true); if (!walk) {
success = iterate_mm_list_nowalk(lruvec, seq); goto done;
}
do {
success = iterate_mm_list(walk, &mm); if (mm)
walk_mm(mm, walk);
} while (mm);
done: if (success) {
success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
return success;
}
/****************************************************************************** * working set protection
******************************************************************************/
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; /* * Determine the initial priority based on * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, * where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
/* * The estimation is based on LRU pages only, so cap it to prevent * overshoots of shrinker objects by large margins.
*/
sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
}
if (!reclaimable)
reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min.
*/ if (!reclaimable && mutex_trylock(&oom_lock)) { struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
out_of_memory(&oc);
mutex_unlock(&oom_lock);
}
}
/****************************************************************************** * rmap/PT walk feedback
******************************************************************************/
/* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging.
*/ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{ int i; bool dirty; unsignedlong start; unsignedlong end; struct lru_gen_mm_walk *walk; struct folio *last = NULL; int young = 1;
pte_t *pte = pvmw->pte; unsignedlong addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec); int gen = lru_gen_from_seq(max_seq);
/*
 * Operation codes for the memcg LRU; see the comment on MEMCG_NR_GENS.
 *
 * The values are spelled out explicitly; they are the same ones the compiler
 * would assign implicitly (0, 1, 2, ...).
 */
enum {
	/* not handled by lru_gen_rotate_memcg() */
	MEMCG_LRU_NOP	= 0,
	/* also recorded as a lrugen.seg value */
	MEMCG_LRU_HEAD	= 1,
	/* also recorded as a lrugen.seg value */
	MEMCG_LRU_TAIL	= 2,
	/* selects the generation of the current memcg_lru.seq */
	MEMCG_LRU_OLD	= 3,
	/* selects the generation of memcg_lru.seq + 1 */
	MEMCG_LRU_YOUNG	= 4,
};
staticvoid lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{ int seg; int old, new; unsignedlong flags; int bin = get_random_u32_below(MEMCG_NR_BINS); struct pglist_data *pgdat = lruvec_pgdat(lruvec);
/* see the comment on MEMCG_NR_GENS */ if (op == MEMCG_LRU_HEAD)
seg = MEMCG_LRU_HEAD; elseif (op == MEMCG_LRU_TAIL)
seg = MEMCG_LRU_TAIL; elseif (op == MEMCG_LRU_OLD) new = get_memcg_gen(pgdat->memcg_lru.seq); elseif (op == MEMCG_LRU_YOUNG) new = get_memcg_gen(pgdat->memcg_lru.seq + 1); else
VM_WARN_ON_ONCE(true);
/* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}
#endif/* CONFIG_MEMCG */
/****************************************************************************** * the eviction
******************************************************************************/
staticbool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, int tier_idx)
{ bool success; bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen;
staticint scan_folios(unsignedlong nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list)
{ int i; int gen; enum vm_event_item item; int sorted = 0; int scanned = 0; int isolated = 0; int skipped = 0; int remaining = min(nr_to_scan, MAX_LRU_BATCH); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec);
VM_WARN_ON_ONCE(!list_empty(list));
if (get_nr_gens(lruvec, type) == MIN_NR_GENS) return 0;
gen = lru_gen_from_seq(lrugen->min_seq[type]);
for (i = MAX_NR_ZONES; i > 0; i--) {
LIST_HEAD(moved); int skipped_zone = 0; int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone];
while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int delta = folio_nr_pages(folio);
if (!remaining || isolated >= MIN_LRU_BATCH) break;
}
item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
count_memcg_events(memcg, item, isolated);
count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); if (type == LRU_GEN_FILE)
sc->nr.file_taken += isolated; /* * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress.
*/ return isolated || !remaining ? scanned : 0;
}
staticint get_tier_idx(struct lruvec *lruvec, int type)
{ int tier; struct ctrl_pos sp, pv;
/* * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier.
*/
read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) {
read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break;
}
if (swappiness <= MIN_SWAPPINESS + 1) return LRU_GEN_FILE;
if (swappiness >= MAX_SWAPPINESS) return LRU_GEN_ANON; /* * Compare the sum of all tiers of anon with that of file to determine * which type to scan.
*/
read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
return positive_ctrl_err(&sp, &pv);
}
staticint isolate_folios(unsignedlong nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list)
{ int i; int type = get_type_to_scan(lruvec, swappiness);
for_each_evictable_type(i, swappiness) { int scanned; int tier = get_tier_idx(lruvec, type);
*nr_to_scan = 0; /* have to run aging, since eviction is not possible anymore */ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) returntrue;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
}
}
*nr_to_scan = size; /* better to run aging even though eviction is still possible */ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
/* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim.
*/ staticlong get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{ bool success; unsignedlong nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1;
/* don't abort memcg reclaim to ensure fairness */ if (!root_reclaim(sc)) returnfalse;
if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) returntrue;
/* check the order to exclude compaction-induced reclaim */ if (!current_is_kswapd() || sc->order) returnfalse;
mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
WMARK_PROMO : WMARK_HIGH;
for (i = 0; i <= sc->reclaim_idx; i++) { struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; unsignedlong size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) returnfalse;
}
/* kswapd should abort if all eligible zones are safe */ returntrue;
}
/*
 * Evict folios from @lruvec in a loop until one of the following holds:
 * get_nr_to_scan() reports nothing (left) to scan, evict_folios() returns
 * zero, the accumulated amount reaches the current scan target, or
 * should_abort_scan() asks to stop.
 *
 * Returns true if the last value from get_nr_to_scan() was negative, i.e.,
 * whether this lruvec should be rotated.
 */
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	long nr_to_scan;
	unsigned long scanned = 0;
	int swappiness = get_swappiness(lruvec, sc);

	while (true) {
		int delta;

		/* the target is re-evaluated on every iteration */
		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
		if (nr_to_scan <= 0)
			break;

		delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
		if (!delta)
			break;

		/* nr_to_scan is positive here, so the mixed compare is safe */
		scanned += delta;
		if (scanned >= nr_to_scan)
			break;

		if (should_abort_scan(lruvec, sc))
			break;

		cond_resched();
	}

	/*
	 * If too many file cache in the coldest generation can't be evicted
	 * due to being dirty, wake up the flusher.
	 */
	if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
		wakeup_flusher_threads(WB_REASON_VMSCAN);

	/* whether this lruvec should be rotated */
	return nr_to_scan < 0;
}
/* lru_gen_age_node() called mem_cgroup_calculate_protection() */ if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG;
if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL;
/* * Unmapped clean folios are already prioritized. Scanning for more of * them is likely futile and can cause high reclaim latency when there * is a large number of memcgs.
*/ if (!sc->may_writepage || !sc->may_unmap) goto done;
lru_add_drain();
blk_start_plug(&plug);
set_mm_walk(pgdat, sc->proactive);
set_initial_priority(pgdat, sc);
if (current_is_kswapd())
sc->nr_reclaimed = 0;
if (mem_cgroup_disabled())
shrink_one(&pgdat->__lruvec, sc); else
shrink_many(pgdat, sc);
if (current_is_kswapd())
sc->nr_reclaimed += reclaimed;
clear_mm_walk();
blk_finish_plug(&plug);
done: if (sc->nr_reclaimed > reclaimed)
pgdat->kswapd_failures = 0;
}
/****************************************************************************** * state change
******************************************************************************/
if (lru_gen_enabled() && !root_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc); return;
}
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
/* * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal * event that can occur when there is little memory pressure e.g. * multiple streaming readers/writers. Hence, we do not abort scanning * when the requested number of pages are reclaimed when scanning at * DEF_PRIORITY on the assumption that the fact we are direct * reclaiming implies that kswapd is not keeping up and it is best to * do a batch of work at once. For memcg reclaim one check is made to * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass.
*/
proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue;
/* * For kswapd and memcg, reclaim at least the number of pages * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target.
*/
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
/* * It's just vindictive to attack the larger once the smaller * has gone to zero. And given the way we stop scanning the * smaller below, this makes sure that we only make one nudge * towards proportionality once we've got nr_to_reclaim.
*/ if (!nr_file || !nr_anon) break;
/* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio.
*/ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
/*
 * Use reclaim/compaction for costly allocs or under memory pressure.
 *
 * Returns true only for a non-zero order request whose gfp_mask allows
 * compaction, and which is either above PAGE_ALLOC_COSTLY_ORDER or being
 * reclaimed at a priority below DEF_PRIORITY - 2.
 */
static bool in_reclaim_compaction(struct scan_control *sc)
{
	/* compaction must be allowed, and only applies to order > 0 */
	if (!gfp_compaction_allowed(sc->gfp_mask) || !sc->order)
		return false;

	return sc->order > PAGE_ALLOC_COSTLY_ORDER ||
	       sc->priority < DEF_PRIORITY - 2;
}
/* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages.
*/ staticinlinebool should_continue_reclaim(struct pglist_data *pgdat, unsignedlong nr_reclaimed, struct scan_control *sc)
{ unsignedlong pages_for_compaction; unsignedlong inactive_lru_pages; int z; struct zone *zone;
/* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) returnfalse;
/* * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX * number of pages that were scanned. This will return to the caller * with the risk reclaim/compaction and the resulting allocation attempt * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL * allocations through requiring that the full LRU list has been scanned * first, by assuming that zero delta of sc->nr_scanned means full LRU * scan, but that approximation was wrong, and there were corner cases * where always a non-zero amount of pages were scanned.
*/ if (!nr_reclaimed) returnfalse;
/* If compaction would go ahead or the allocation would succeed, stop */
for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { unsignedlong watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0)) returnfalse;
if (compaction_suitable(zone, sc->order, watermark,
sc->reclaim_idx)) returnfalse;
}
/* * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
/* * In most cases, direct reclaimers can do partial walks * through the cgroup tree, using an iterator state that * persists across invocations. This strikes a balance between * fairness and allocation latency. * * For kswapd, reliable forward progress is more important * than a quick return to idle. Always do full walks.
*/ if (current_is_kswapd() || sc->memcg_full_walk)
partial = NULL;
/* * This loop can become CPU-bound when target memcgs * aren't eligible for reclaim - either because they * don't have any reclaimable pages, or because their * memory is explicitly protected. Avoid soft lockups.
*/
cond_resched();
if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM.
*/ continue;
} elseif (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as * there is an unprotected supply * of reclaimable memory from other cgroups.
*/ if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1; continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
}
/* Record the subtree's reclaim efficiency */ if (!sc->proactive)
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned, nr_node_reclaimed);
if (nr_node_reclaimed)
reclaimable = true;
if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate * is exceeding the page laundering rate. Either the * global limits are not being effective at throttling * processes due to the page distribution throughout * zones or there is heavy usage of a slow backing * device. The only option is to throttle from reclaim * context which is not ideal as there is no guarantee * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * * Once a node is flagged PGDAT_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below.
*/ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty &&
sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so forcibly stall * until some pages complete writeback.
*/ if (sc->nr.immediate)
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/* * Tag a node/memcg as congested if all the dirty pages were marked * for writeback and immediate reclaim (counted in nr.congested). * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle().
*/ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
if (current_is_kswapd())
set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
}
/* * Stall direct reclaim for IO completions if the lruvec is * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly.
*/ if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
(test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) goto again;
/* * Kswapd gives up on balancing particular nodes after too * many failures to reclaim anything from them and goes to * sleep. On reclaim progress, reset the failure counter. A * successful direct reclaim run will revive a dormant kswapd.
*/ if (reclaimable)
pgdat->kswapd_failures = 0; elseif (sc->cache_trim_mode)
sc->cache_trim_mode_failed = 1;
}
/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction. Return false if we
 * should reclaim first.
 */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long watermark;

	/* compaction is not allowed for this gfp_mask */
	if (!gfp_compaction_allowed(sc->gfp_mask))
		return false;

	/* Allocation can already succeed, nothing to do */
	if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
			      sc->reclaim_idx, 0))
		return true;

	/*
	 * Direct reclaim usually targets the min watermark, but compaction
	 * takes time to run and there are potentially other callers using the
	 * pages just freed. So target a higher buffer to give compaction a
	 * reasonable chance of completing and allocating the pages.
	 *
	 * Note that we won't actually reclaim the whole buffer in one attempt
	 * as the target watermark in should_continue_reclaim() is lower. But if
	 * we are already above the high+gap watermark, don't reclaim at all.
	 */
	watermark = high_wmark_pages(zone);

	return compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx);
}
/*
 * Decide, after a reclaim pass over @pgdat, whether to wake the tasks
 * throttled for lack of progress or to throttle the current task.
 */
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
{
	/*
	 * If reclaim is making progress greater than 12.5% efficiency (more
	 * than one page reclaimed per eight scanned) then wake the NOPROGRESS
	 * throttled tasks.
	 */
	if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
		wait_queue_head_t *wqh;

		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
		if (waitqueue_active(wqh))
			wake_up(wqh);

		return;
	}

	/*
	 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
	 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
	 * under writeback and marked for immediate reclaim at the tail of the
	 * LRU.
	 */
	if (current_is_kswapd() || cgroup_reclaim(sc))
		return;

	/* Throttle if making no progress at high priorities. */
	if (sc->priority == 1 && !sc->nr_reclaimed)
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	gfp_t orig_mask;
	pg_data_t *last_pgdat = NULL;
	pg_data_t *first_pgdat = NULL;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	orig_mask = sc->gfp_mask;
	if (buffer_heads_over_limit) {
		sc->gfp_mask |= __GFP_HIGHMEM;
		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
	}

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					sc->reclaim_idx, sc->nodemask) {
		/*
		 * Take care memory controller reclaiming has small influence
		 * to global LRU.
		 */
		if (!cgroup_reclaim(sc)) {
			if (!cpuset_zone_allowed(zone,
						 GFP_KERNEL | __GFP_HARDWALL))
				continue;

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
			if (IS_ENABLED(CONFIG_COMPACTION) &&
			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
			    compaction_ready(zone, sc)) {
				sc->compaction_ready = true;
				continue;
			}

			/*
			 * Shrink each node in the zonelist once. If the
			 * zonelist is ordered by zone (not the default) then a
			 * node may be shrunk multiple times but in that case
			 * the user prefers lower zones being preserved.
			 */
			if (zone->zone_pgdat == last_pgdat)
				continue;

			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat,
								      sc->order, sc->gfp_mask,
								      &nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			/* need some check for avoid more shrink_zone() */
		}

		/* remember the first node visited for the throttle decision */
		if (!first_pgdat)
			first_pgdat = zone->zone_pgdat;

		/* See comment about same check for global reclaim above */
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;
		shrink_node(zone->zone_pgdat, sc);
	}

	if (first_pgdat)
		consider_reclaim_throttle(first_pgdat, sc);

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM. Only gfp_mask is restored here;
	 * sc->reclaim_idx keeps whatever value it was given above.
	 */
	sc->gfp_mask = orig_mask;
}
/* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick the writeback threads and take explicit * naps in the hope that some of these pages can be written. But if the * allocating task holds filesystem locks which prevent writeout this might not * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed
*/ staticunsignedlong do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc)
{ int initial_priority = sc->priority;
pg_data_t *last_pgdat; struct zoneref *z; struct zone *zone;
retry:
delayacct_freepages_start();
if (!cgroup_reclaim(sc))
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do { if (!sc->proactive)
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);
if (sc->nr_reclaimed >= sc->nr_to_reclaim) break;
if (sc->compaction_ready) break;
/* * If we're getting trouble reclaiming, start doing * writepage even in laptop mode.
*/ if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
} while (--sc->priority >= 0);
/* Aborted reclaim to try compaction? don't OOM, then */ if (sc->compaction_ready) return 1;
/* * In most cases, direct reclaimers can do partial walks * through the cgroup tree to meet the reclaim goal while * keeping latency low. Since the iterator state is shared * among all direct reclaim invocations (to retain fairness * among cgroups), though, high concurrency can result in * individual threads not seeing enough cgroups to make * meaningful forward progress. Avoid false OOMs in this case.
*/ if (!sc->memcg_full_walk) {
sc->priority = initial_priority;
sc->memcg_full_walk = 1; goto retry;
}
/* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails.
*/ if (sc->skipped_deactivate) {
sc->priority = initial_priority;
sc->force_deactivate = 1;
sc->skipped_deactivate = 0; goto retry;
}
/* If there are no reserves (unexpected config) then do not throttle */ if (!pfmemalloc_reserve) returntrue;
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
return wmark_ok;
}
/* * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes * when the low watermark is reached. * * Returns true if a fatal signal was delivered during throttling. If this * happens, the page allocator should not consider triggering the OOM killer.
*/ staticbool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{ struct zoneref *z; struct zone *zone;
pg_data_t *pgdat = NULL;
/* * Kernel threads should not be throttled as they may be indirectly * responsible for cleaning pages necessary for reclaim to make forward * progress. kjournald for example may enter direct reclaim while * committing a transaction where throttling it could force other * processes to block on log_wait_commit().
*/ if (current->flags & PF_KTHREAD) goto out;
/* * If a fatal signal is pending, this process should not throttle. * It should return quickly so it can exit and free its memory
*/ if (fatal_signal_pending(current)) goto out;
/* * Check if the pfmemalloc reserves are ok by finding the first node * with a usable ZONE_NORMAL or lower zone. The expectation is that * GFP_KERNEL will be required for allocating network buffers when * swapping over the network so ZONE_HIGHMEM is unusable. * * Throttling is based on the first usable node and throttled processes * wait on a queue until kswapd makes progress and wakes them. There * is an affinity then between processes waking up and where reclaim * progress has been made assuming the process wakes on the same node. * More importantly, processes running on remote nodes will not compete * for remote pfmemalloc reserves and processes on different nodes * should make reasonable progress.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(gfp_mask), nodemask) { if (zone_idx(zone) > ZONE_NORMAL) continue;
/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat; if (allow_direct_reclaim(pgdat)) goto out; break;
}
/* If no zone was usable by the allocation flags then do not throttle */ if (!pgdat) goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
/* * If the caller cannot enter the filesystem, it's possible that it * is due to the caller holding an FS lock or performing a journal * transaction in the case of a filesystem like ext[3|4]. In this case, * it is not safe to block on pfmemalloc_wait as kswapd could be * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing.
*/ if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ); else /* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat));
/* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values.
*/
BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
/* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point.
*/ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1;
/* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. * If we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroups as well. We hack * the priority and make it zero.
*/
shrink_lruvec(lruvec, &sc);
/*
 * Report whether any managed zone of @pgdat with an index between
 * @highest_zoneidx and 0 currently has a non-zero watermark_boost.
 *
 * Returns true as soon as one boosted zone is found, false otherwise.
 */
static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
	int i;
	struct zone *zone;

	/*
	 * Check for watermark boosts top-down as the higher zones
	 * are more likely to be boosted. Both watermarks and boosts
	 * should not be checked at the same time as reclaim would
	 * start prematurely when there is no boosting and a lower
	 * zone is balanced.
	 */
	for (i = highest_zoneidx; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!managed_zone(zone))
			continue;

		if (zone->watermark_boost)
			return true;
	}

	return false;
}
/*
 * Returns true if there is an eligible zone balanced for the request order
 * and highest_zoneidx.
 *
 * @pgdat: node whose zones are checked
 * @order: order passed to the watermark check
 * @highest_zoneidx: zone index bound handed to for_each_managed_zone_pgdat()
 *
 * A node for which the zone walk visits no managed zone at all is also
 * reported as balanced (see the check on the "mark" sentinel below).
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	int i;
	unsigned long mark = -1;	/* sentinel: no zone has been visited yet */
	struct zone *zone;

	/*
	 * Check watermarks bottom-up as lower zones are more likely to
	 * meet watermarks.
	 */
	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
		enum zone_stat_item item;
		unsigned long free_pages;

		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
			mark = promo_wmark_pages(zone);
		else
			mark = high_wmark_pages(zone);

		/*
		 * In defrag_mode, watermarks must be met in whole
		 * blocks to avoid polluting allocator fallbacks.
		 *
		 * However, kswapd usually cannot accomplish this on
		 * its own and needs kcompactd support. Once it's
		 * reclaimed a compaction gap, and kswapd_shrink_node
		 * has dropped order, simply ensure there are enough
		 * base pages for compaction, wake kcompactd & sleep.
		 */
		if (defrag_mode && order)
			item = NR_FREE_PAGES_BLOCKS;
		else
			item = NR_FREE_PAGES;

		/*
		 * When there is a high number of CPUs in the system,
		 * the cumulative error from the vmstat per-cpu cache
		 * can blur the line between the watermarks. In that
		 * case, be safe and get an accurate snapshot.
		 *
		 * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
		 * pageblock_nr_pages, while the vmstat pcp threshold
		 * is limited to 125. On many configurations that
		 * counter won't actually be per-cpu cached. But keep
		 * things simple for now; revisit when somebody cares.
		 */
		free_pages = zone_page_state(zone, item);
		if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
			free_pages = zone_page_state_snapshot(zone, item);

		if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
					0, free_pages))
			return true;
	}

	/*
	 * If a node has no managed zone within highest_zoneidx, it does not
	 * need balancing by definition. This can happen if a zone-restricted
	 * allocation tries to wake a remote kswapd.
	 */
	if (mark == -1)
		return true;

	return false;
}
/* Clear pgdat state for congested, dirty or under writeback. */ staticvoid clear_pgdat_congested(pg_data_t *pgdat)
{ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * @pgdat: node the calling kswapd is responsible for
 * @order: order forwarded to pgdat_balanced()
 * @highest_zoneidx: zone index bound forwarded to pgdat_balanced()
 *
 * Returns true if kswapd is ready to sleep: either the node has reached
 * MAX_RECLAIM_RETRIES kswapd failures, or pgdat_balanced() reports it as
 * balanced (in which case the congestion state is cleared first).
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
				 int highest_zoneidx)
{
	/*
	 * The throttled processes are normally woken up in balance_pgdat() as
	 * soon as allow_direct_reclaim() is true. But there is a potential
	 * race between when kswapd checks the watermarks and a process gets
	 * throttled. There is also a potential race if processes get
	 * throttled, kswapd wakes, a large process exits thereby balancing the
	 * zones, which causes kswapd to exit balance_pgdat() before reaching
	 * the wake up checks. If kswapd is going to sleep, no process should
	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
	 * the wake up is premature, processes will wake kswapd and get
	 * throttled again. The difference from wake ups in balance_pgdat() is
	 * that here we are under prepare_to_wait().
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait))
		wake_up_all(&pgdat->pfmemalloc_wait);

	/* Hopeless node, leave it to direct reclaim */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
		clear_pgdat_congested(pgdat);
		return true;
	}

	return false;
}
/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * @pgdat: node to shrink
 * @sc: scan control; nr_to_reclaim is recomputed here and order may be
 *      dropped to 0
 *
 * Returns true if the larger of sc->nr_scanned and the number of pages
 * added to sc->nr_reclaimed during this call is at least the computed
 * sc->nr_to_reclaim. This is used to determine if the scanning priority
 * needs to be raised.
 */
static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
	struct zone *zone;
	int z;
	/* Snapshot so that only progress made by this call is counted below */
	unsigned long nr_reclaimed = sc->nr_reclaimed;

	/* Reclaim a number of pages proportional to the number of zones */
	sc->nr_to_reclaim = 0;
	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
	}

	/*
	 * Historically care was taken to put equal pressure on all zones but
	 * now pressure is applied based on node LRU order.
	 */
	shrink_node(pgdat, sc);

	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process that requested a
	 * high-order allocation can direct reclaim/compact.
	 */
	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
		sc->order = 0;

	/* account for progress from mm_account_reclaimed_pages() */
	return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
}
/*
 * Set or clear ZONE_RECLAIM_ACTIVE on every zone visited by
 * for_each_managed_zone_pgdat() for @pgdat and @highest_zoneidx.
 *
 * Page allocator PCP high watermark is lowered if reclaim is active.
 *
 * @active: true to set the flag, false to clear it
 */
static inline void
update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
{
	int i;
	struct zone *zone;

	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
		if (active)
			set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
		else
			clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
	}
}
/* Mark the zones of @pgdat selected by @highest_zoneidx as under reclaim. */
static inline void
set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
	update_reclaim_active(pgdat, highest_zoneidx, true);
}
/* Clear the reclaim-active mark on the zones of @pgdat selected by @highest_zoneidx. */
static inline void
clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
	update_reclaim_active(pgdat, highest_zoneidx, false);
}
/* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is * balanced. * * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced.
*/ staticint balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{ int i; unsignedlong nr_soft_reclaimed; unsignedlong nr_soft_scanned; unsignedlong pflags; unsignedlong nr_boost_reclaim; unsignedlong zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; struct zone *zone; struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,
};
/* * Account for the reclaim boost. Note that the zone boost is left in * place so that parallel allocations that are near the watermark will * stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
boosted = nr_boost_reclaim;
/* * If the number of buffer_heads exceeds the maximum allowed * then consider reclaiming from all zones. This has a dual * purpose -- on 64-bit systems it is expected that * buffer_heads are stripped during active rotation. On 32-bit * systems, highmem pages can pin lowmem memory and shrinking * buffers can relieve lowmem pressure. Reclaim may still not * go ahead if all eligible zones for the original allocation * request are balanced to avoid excessive reclaim from kswapd.
*/ if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue;
sc.reclaim_idx = i; break;
}
}
/* * If the pgdat is imbalanced then ignore boosting and preserve * the watermarks for a later time and restart. Note that the * zone watermarks will be still reset at the end of balancing * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes.
*/
balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0; goto restart;
}
/* * If boosting is not active then only reclaim if there are no * eligible zones. Note that sc.reclaim_idx is not used as * buffer_heads_over_limit may have adjusted it.
*/ if (!nr_boost_reclaim && balanced) goto out;
/* Limit the priority of boosting to avoid reclaim writeback */ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
raise_priority = false;
/* * Do not writeback or swap pages for boosted reclaim. The * intent is to relieve pressure not issue sub-optimal IO * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted.
*/
sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;
/* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated * regardless of classzone as this is about consistent aging.
*/
kswapd_age_node(pgdat, &sc);
/* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode.
*/ if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/* * There should be no need to raise the scanning priority if * enough pages are already being scanned that that high * watermark would be met at 100% efficiency.
*/ if (kswapd_shrink_node(pgdat, &sc))
raise_priority = false;
/* * If the low watermark is met there is no need for processes * to be throttled on pfmemalloc_wait as they should now be * able to safely make forward progress. Wake them.
*/ if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
__fs_reclaim_release(_THIS_IP_);
ret = kthread_freezable_should_stop(&was_frozen);
__fs_reclaim_acquire(_THIS_IP_); if (was_frozen || ret) break;
/* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
/* * If reclaim made no progress for a boost, stop reclaim as * IO cannot be queued and it could be an infinite loop in * extreme circumstances.
*/ if (nr_boost_reclaim && !nr_reclaimed) break;
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
/* * Restart only if it went through the priority loop all the way, * but cache_trim_mode didn't work.
*/ if (!sc.nr_reclaimed && sc.priority < 1 &&
!sc.no_cache_trim_mode && sc.cache_trim_mode_failed) {
sc.no_cache_trim_mode = 1; goto restart;
}
/* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsignedlong flags;
for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue;
/* Increments are under the zone lock */
zone = pgdat->node_zones + i;
spin_lock_irqsave(&zone->lock, flags);
zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
spin_unlock_irqrestore(&zone->lock, flags);
}
/* * As there is now likely space, wake up kcompactd to defragment * pageblocks.
*/
wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}
/* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller * entered the allocator slow path while kswapd was awake, order will * remain at the higher level.
*/ return sc.order;
}
/* * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is * not a valid index then either kswapd runs for first time or kswapd couldn't * sleep after previous reclaim attempt (node is still unbalanced). In that * case return the zone index of the previous kswapd reclaim cycle.
*/ staticenum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, enum zone_type prev_highest_zoneidx)
{ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
staticvoid kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsignedint highest_zoneidx)
{ long remaining = 0;
DEFINE_WAIT(wait);
if (freezing(current) || kthread_should_stop()) return;
/* * Try to sleep for a short interval. Note that kcompactd will only be * woken if it is possible to sleep for a short interval. This is * deliberate on the assumption that if reclaim cannot keep an * eligible zone balanced that it's also unlikely that compaction will * succeed.
*/ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. * When kswapd is going to sleep, it is reasonable to assume * that pages and compaction may succeed so reset the cache.
*/
reset_isolation_suitable(pgdat);
/* * We have freed the memory, now we should compact it to make * allocation of the requested order possible.
*/
wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
remaining = schedule_timeout(HZ/10);
/* * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely.
*/ if (remaining) {
WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
kswapd_highest_zoneidx(pgdat,
highest_zoneidx));
if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
}
/* * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up.
*/ if (!remaining &&
prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/* * vmstat counters are not perfectly accurate and the estimated * value for counters such as NR_FREE_PAGES can deviate from the * true value by nr_online_cpus * threshold. To avoid the zone * watermarks being breached while under pressure, we reduce the * per-cpu vmstat threshold while kswapd is awake and restore * them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
/* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter.
*/ staticint kswapd(void *p)
{ unsignedint alloc_order, reclaim_order; unsignedint highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current;
/* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "__alloc_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes * you need a small amount of memory in order to be able to * page out something else, and this flag essentially protects * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
set_freezable();
/* Read the new order and highest_zoneidx */
alloc_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
if (kthread_freezable_should_stop(&was_frozen)) break;
/* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator
*/ if (was_frozen) continue;
/* * Reclaim begins at the requested order but if a high-order * reclaim fails then kswapd falls back to reclaiming for * order-0. If that happens, kswapd will consider sleeping * for the order it finished reclaiming at (reclaim_order) * but kcompactd is woken to compact for the original * request (alloc_order).
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
alloc_order);
reclaim_order = balance_pgdat(pgdat, alloc_order,
highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep;
}
tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
return 0;
}
/* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim * has failed or is not needed, still wake up kcompactd if only compaction is * needed.
*/ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx)
{
pg_data_t *pgdat; enum zone_type curr_idx;
if (!managed_zone(zone)) return;
if (!cpuset_zone_allowed(zone, gfp_flags)) return;
if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
if (READ_ONCE(pgdat->kswapd_order) < order)
WRITE_ONCE(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait)) return;
/* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd * and rely on compaction_suitable() to determine if it's * needed. If it fails, it will defer subsequent attempts to * ratelimit its work.
*/ if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
wakeup_kcompactd(pgdat, order, highest_zoneidx); return;
}
/*
 * Start the kswapd thread for node @nid unless one is already recorded in
 * the node's pgdat. Called by init and node-hot-add.
 *
 * The whole operation runs between pgdat_kswapd_lock() and
 * pgdat_kswapd_unlock(). If thread creation fails, the error is logged,
 * the kernel BUGs when system_state is still below SYSTEM_RUNNING, and
 * pgdat->kswapd is reset to NULL.
 */
void __meminit kswapd_run(int nid)
{
	pg_data_t *node = NODE_DATA(nid);

	pgdat_kswapd_lock(node);

	/* Nothing to do when this node already has a kswapd. */
	if (node->kswapd)
		goto unlock;

	node->kswapd = kthread_create_on_node(kswapd, node, nid, "kswapd%d", nid);
	if (!IS_ERR(node->kswapd)) {
		wake_up_process(node->kswapd);
		goto unlock;
	}

	/* failure at boot is fatal */
	pr_err("Failed to start kswapd on node %d,ret=%ld\n",
	       nid, PTR_ERR(node->kswapd));
	BUG_ON(system_state < SYSTEM_RUNNING);
	node->kswapd = NULL;

unlock:
	pgdat_kswapd_unlock(node);
}
/* * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done().
*/ void __meminit kswapd_stop(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd;
#ifdef CONFIG_NUMA /* * Node reclaim mode * * If non-zero, node_reclaim is called when the number of free pages falls * below the watermarks. Elsewhere in this file the value is tested as a * bit mask (RECLAIM_UNMAP, RECLAIM_WRITE).
*/ int node_reclaim_mode __read_mostly;
/* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone.
*/ #define NODE_RECLAIM_PRIORITY 4
/* * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. Initialised to 1.
*/ int sysctl_min_unmapped_ratio = 1;
/* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. Initialised to 5.
*/ int sysctl_min_slab_ratio = 5;
/* * It's possible for there to be more file mapped pages than * accounted for by the pages on the file LRU lists because * tmpfs pages accounted for as ANON can also be FILE_MAPPED
*/ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
/*
 * Work out how many page cache pages we can reclaim in this reclaim_mode.
 *
 * @pgdat: node to estimate for
 *
 * Returns the estimate in pages. The value never underflows: the amount
 * subtracted for dirty pages is capped at the reclaimable count.
 */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and node_unmapped_file_pages() provides
	 * a better estimate
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}
/* * Try to free up some pages from this node through reclaim.
*/ staticunsignedlong __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsignedlong nr_pages, struct scan_control *sc)
{ struct task_struct *p = current; unsignedint noreclaim_flag; unsignedlong pflags;
cond_resched();
psi_memstall_enter(&pflags);
delayacct_freepages_start();
fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP
*/
noreclaim_flag = memalloc_noreclaim_save();
set_task_reclaim_state(p, &sc->reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed.
*/ do {
shrink_node(pgdat, sc);
} while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0);
}
/* * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately * thrown out if the node is overallocated. So we do not reclaim * if less than a specified percentage of the node is used by * unmapped file backed pages.
*/ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
pgdat->min_slab_pages) return NODE_RECLAIM_FULL;
/* * Do not scan if the allocation should not be delayed.
*/ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return NODE_RECLAIM_NOSCAN;
/* * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible.
*/ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN;
if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); else
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
/** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list * @fbatch: Batch of lru folios to check. * * Checks folios for evictability, if an evictable folio is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function * should be only used for lru folios.
*/ void check_move_unevictable_folios(struct folio_batch *fbatch)
{ struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i;
for (i = 0; i < fbatch->nr; i++) { struct folio *folio = fbatch->folios[i]; int nr_pages = folio_nr_pages(folio);
pgscanned += nr_pages;
/* block memcg migration while the folio moves between lrus */ if (!folio_test_clear_lru(folio)) continue;
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert. 0.224 Bemerkung:
(vorverarbeitet am 2026-04-25)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.