/* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any * collector of these hiwater stats must therefore get total_vm * and rss too, which will usually be the higher. Barriers? not * worth the effort, such snapshots can always be inconsistent.
*/
hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
/* split executable areas between text and lib */
text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
text = min(text, mm->exec_vm << PAGE_SHIFT);
lib = (mm->exec_vm << PAGE_SHIFT) - text;
staticinlinebool lock_vma_range(struct seq_file *m, struct proc_maps_private *priv)
{ /* * smaps and numa_maps perform page table walk, therefore require * mmap_lock but maps can be read with locking just the vma and * walking the vma tree under rcu read protection.
*/ if (m->op != &proc_pid_maps_op) { if (mmap_read_lock_killable(priv->mm)) returnfalse;
/*
 * Look up the VMA at *ppos, remember the old position in priv->last_pos,
 * then advance *ppos: to the end of the VMA that was found, or to
 * SENTINEL_VMA_GATE (with vma set to the gate VMA) when none was found.
 */
retry:
vma = get_next_vma(priv, *ppos); /* EINTR or EAGAIN is possible */ if (IS_ERR(vma)) { /* retry only on -EAGAIN, and only when fallback_to_mmap_lock() returns nonzero */ if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) goto retry;
/* otherwise the error pointer is handed back unchanged */
return vma;
}
/* Store previous position to be able to restart if needed */
priv->last_pos = *ppos; if (vma) { /* Track the end of the reported vma to ensure position changes even if previous vma was merged with the next vma and we found the extended vma with the same vm_start.
*/
*ppos = vma->vm_end;
} else {
/* no VMA found: park the position on the gate sentinel and use the gate VMA */
*ppos = SENTINEL_VMA_GATE;
vma = get_gate_vma(priv->mm);
}
/*
 * Reset current position if last_addr was set before
 * and it's not a sentinel.
 */
if (last_addr > 0)
	*ppos = last_addr = priv->last_pos;

/* Start the VMA iterator at the (possibly reset) address. */
vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
hold_task_mempolicy(priv);

/* A position equal to the gate sentinel maps straight to the gate VMA. */
if (last_addr == SENTINEL_VMA_GATE)
	return get_gate_vma(mm);
/* * Print the dentry name for named mappings, and a * special [heap] marker for the heap:
*/ if (vma->vm_file) { /* * If user named this anon shared memory via * prctl(PR_SET_VMA ..., use the provided name.
*/ if (anon_name) {
*name_fmt = "[anon_shmem:%s]";
*name = anon_name->name;
} else {
*path = file_user_path(vma->vm_file);
} return;
}
if (vma->vm_ops && vma->vm_ops->name) {
*name = vma->vm_ops->name(vma); if (*name) return;
}
*name = arch_vma_name(vma); if (*name) return;
if (!vma->vm_mm) {
*name = "[vdso]"; return;
}
if (vma_is_initial_heap(vma)) {
*name = "[heap]"; return;
}
if (vma_is_initial_stack(vma)) {
*name = "[stack]"; return;
}
/*
 * Search loop: starting at addr, look for a VMA that passes the filters
 * encoded in flags. Jumps to no_vma when the lookup returns nothing.
 */
next_vma:
vma = query_vma_find_by_addr(mm, addr); if (!vma) goto no_vma;
/* user requested only file-backed VMA, keep iterating */ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) goto skip_vma;
/* VMA permissions should satisfy query flags */ if (flags & PROCMAP_QUERY_VMA_FLAGS) {
/* collect the vm_flags bit matching each requested permission */
u32 perm = 0;
if (flags & PROCMAP_QUERY_VMA_READABLE)
perm |= VM_READ; if (flags & PROCMAP_QUERY_VMA_WRITABLE)
perm |= VM_WRITE; if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
perm |= VM_EXEC; if (flags & PROCMAP_QUERY_VMA_SHARED)
perm |= VM_MAYSHARE;
/* every requested bit must be set on the VMA, not just some of them */
if ((vma->vm_flags & perm) != perm) goto skip_vma;
}
/* found covering VMA or user is OK with the matching next VMA */ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) return vma;
skip_vma: /* If the user needs closest matching VMA, keep iterating from the end of the VMA just examined.
*/
addr = vma->vm_end; if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) goto next_vma;
/* Fetch usize from the start of the user-supplied argument. */
if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)) != 0)
	return -EFAULT;

/* argument struct can never be that large, reject abuse */
if (usize > PAGE_SIZE)
	return -E2BIG;

/* argument struct should have at least query_flags and query_addr fields */
if (usize < offsetofend(struct procmap_query, query_addr))
	return -EINVAL;

err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
if (err != 0)
	return err;

/*
 * Reject unknown flags, and reject a buffer described by only one half
 * of its address/size pair: either both are set, or both are zero.
 */
if ((karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) ||
    (!karg.vma_name_size != !karg.vma_name_addr) ||
    (!karg.build_id_size != !karg.build_id_addr))
	return -EINVAL;

/* Take a reference on the mm; it is dropped again if setup fails. */
mm = priv->mm;
if (!mm || !mmget_not_zero(mm))
	return -ESRCH;

err = query_vma_setup(mm);
if (err != 0) {
	mmput(mm);
	return err;
}
/*
 * Proportional Set Size(PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64bit
 * fixed-point pss counter.  So (pss >> PSS_SHIFT) would be the real
 * byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 *	- 1M 3-user-pages add up to 8KB errors;
 *	- supports mapcount up to 2^24, or 16M;
 *	- supports PSS up to 2^52 bytes, or 4PB.
*/ #define PSS_SHIFT 12
staticvoid smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool present)
{ struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsignedlong size = nr * PAGE_SIZE; bool exclusive; int mapcount;
/* * First accumulate quantities that depend only on |size| and the type * of the compound page.
*/ if (folio_test_anon(folio)) {
mss->anonymous += size; if (!folio_test_swapbacked(folio) && !dirty &&
!folio_test_dirty(folio))
mss->lazyfree += size;
}
if (folio_test_ksm(folio))
mss->ksm += size;
mss->resident += size; /* Accumulate the size in pages that have been accessed. */ if (young || folio_test_young(folio) || folio_test_referenced(folio))
mss->referenced += size;
/* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * * refcount == 1 for present entries guarantees that the folio is mapped * exactly once. For large folios this implies that exactly one * PTE/PMD/... maps (a part of) this folio. * * Treat all non-present entries (where relying on the mapcount and * refcount doesn't make sense) as "maybe shared, but not sure how * often". We treat device private entries as being fake-present. * * Note that it would not be safe to read the mapcount especially for * pages referenced by migration entries, even with the PTL held.
*/ if (folio_ref_count(folio) == 1 || !present) {
smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
dirty, locked, present); return;
}
if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
mapcount = folio_average_page_mapcount(folio);
exclusive = !folio_maybe_mapped_shared(folio);
}
/* * We obtain a snapshot of the mapcount. Without holding the folio lock * this snapshot can be slightly wrong as we cannot always read the * mapcount atomically.
*/ for (i = 0; i < nr; i++, page++) { unsignedlong pss = PAGE_SIZE << PSS_SHIFT;
staticvoid show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{ /* * Don't forget to update Documentation/ on changes. * * The length of the second argument of mnemonics[] * needs to be 3 instead of previously set 2 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) * to avoid spurious * -Werror=unterminated-string-initialization warning * with GCC 15
*/ staticconstchar mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about.
*/
[0 ... (BITS_PER_LONG-1)] = "??",
/* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. * * Use vm_start of @vma as the beginning address if @start is 0.
*/ staticvoid smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss, unsignedlong start)
{ conststruct mm_walk_ops *ops = &smaps_walk_ops;
/* Invalid start */ if (start >= vma->vm_end) return;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all * swapped out pages belong to the shmem object, and we can * obtain the swap value much more efficiently. For private * writable mappings, we might have COW pages that are * not affected by the parent swapped out pages of the shmem * object, so we have to distinguish them during the page walk. * Unless we know that the shmem object (or the part mapped by * our VMA) has no swapped out pages at all.
*/ unsignedlong shmem_swapped = shmem_swap_usage(vma);
/* * Release mmap_lock temporarily if someone wants to * access it for write request.
*/ if (mmap_lock_is_contended(mm)) {
vma_iter_invalidate(&vmi);
mmap_read_unlock(mm);
ret = mmap_read_lock_killable(mm); if (ret) {
release_task_mempolicy(priv); goto out_put_mm;
}
/*
 * After dropping the lock, there are four cases to
 * consider. See the following example for explanation.
 *
 *   +------+------+-----------+
 *   | VMA1 | VMA2 | VMA3      |
 *   +------+------+-----------+
 *   |      |      |           |
 *  4k     8k     16k         400k
 *
 * Suppose we drop the lock after reading VMA2 due to
 * contention, then we get:
 *
 *	last_vma_end = 16k
 *
 * 1) VMA2 is freed, but VMA3 exists:
 *
 *    vma_next(vmi) will return VMA3.
 *    In this case, just continue from VMA3.
 *
 * 2) VMA2 still exists:
 *
 *    vma_next(vmi) will return VMA3.
 *    In this case, just continue from VMA3.
 *
 * 3) No more VMAs can be found:
 *
 *    vma_next(vmi) will return NULL.
 *    No more things to do, just break.
 *
 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
 *
 *    vma_next(vmi) will return VMA' whose range
 *    contains last_vma_end.
 *    Iterate VMA' from last_vma_end.
*/
vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break;
/* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) {
smap_gather_stats(vma, &mss, 0);
last_vma_end = vma->vm_end; continue;
}
/*
 * Report false unless the pte is writable, the mapping is a COW
 * mapping and the mm has MMF_HAS_PINNED set.
 */
if (!pte_write(pte))
	return false;
if (!is_cow_mapping(vma->vm_flags))
	return false;
if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
	return false;

/* Only a pte with a normal folio behind it can give a positive answer. */
folio = vm_normal_folio(vma, addr, pte);
if (!folio)
	return false;
return folio_maybe_dma_pinned(folio);
}
staticinlinevoid clear_soft_dirty(struct vm_area_struct *vma, unsignedlong addr, pte_t *pte)
{ /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works.
*/
pte_t ptent = ptep_get(pte);
if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */
old = pmdp_invalidate(vma, addr, pmdp); if (pmd_dirty(old))
pmd = pmd_mkdirty(pmd); if (pmd_young(old))
pmd = pmd_mkyoung(pmd);
/* Never copy more than the buffer holds minus its final byte. */
if (count >= sizeof(buffer))
	count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count) != 0)
	return -EFAULT;

/* Parse the stripped text as a base-10 integer. */
rv = kstrtoint(strstrip(buffer), 10, &itype);
if (rv < 0)
	return rv;

/* Accept only values in [CLEAR_REFS_ALL, CLEAR_REFS_LAST). */
type = (enum clear_refs_types)itype;
if (!(type >= CLEAR_REFS_ALL && type < CLEAR_REFS_LAST))
	return -EINVAL;

task = get_proc_task(file_inode(file));
if (task == NULL)
	return -ESRCH;
mm = get_task_mm(task); if (mm) {
VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = {
.type = type,
};
if (mmap_write_lock_killable(mm)) {
count = -EINTR; goto out_mm;
} if (type == CLEAR_REFS_MM_HIWATER_RSS) { /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value.
*/
reset_mm_hiwater_rss(mm); goto out_unlock;
}
if (type == CLEAR_REFS_SOFT_DIRTY) {
for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue;
vm_flags_clear(vma, VM_SOFTDIRTY);
vma_set_page_prot(vma);
}
/*
 * Walk [addr, end) and emit one pagemap entry per page: an empty
 * (non-present) entry for every page of a hole between VMAs, and for
 * pages inside a VMA an entry that carries PM_SOFT_DIRTY when the VMA
 * has VM_SOFTDIRTY set. Any add_to_pagemap() error ends the walk.
 */
while (addr < end) {
	struct vm_area_struct *vma = find_vma(walk->mm, addr);
	pagemap_entry_t pme = make_pme(0, 0);
	/* End of address space hole, which we mark as non-present. */
	unsigned long hole_end;

	/* The hole runs up to the next VMA, or to end if none is left. */
	if (vma)
		hole_end = min(end, vma->vm_start);
	else
		hole_end = end;

	for (; addr < hole_end; addr += PAGE_SIZE) {
		err = add_to_pagemap(&pme, pm);
		if (err)
			goto out;
	}

	if (!vma)
		break;

	/* Addresses in the VMA. */
	if (vma->vm_flags & VM_SOFTDIRTY)
		pme = make_pme(0, PM_SOFT_DIRTY);
	for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
		err = add_to_pagemap(&pme, pm);
		if (err)
			goto out;
	}
}
out: return err;
}
/* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end.
*/
orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); if (!pte) {
walk->action = ACTION_AGAIN; return err;
} for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
/*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
 *
 * For each page in the address space, this file contains one 64-bit entry
 * consisting of the following:
 *
 * Bits 0-54  page frame number (PFN) if present
 * Bits 0-4   swap type if swapped
 * Bits 5-54  swap offset if swapped
 * Bit  55    pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
 * Bit  56    page exclusively mapped
 * Bit  57    pte is uffd-wp write-protected
 * Bit  58    pte is a guard region
 * Bits 59-60 zero
 * Bit  61    page is file-page or shared-anon
 * Bit  62    page swapped
 * Bit  63    page present
 *
 * If the page is not present but in swap, then the PFN contains an
 * encoding of the swap file number and the page's offset into the
 * swap. Unmapped pages return a null PFN. This allows determining
 * precisely which pages are mapped (or in swap) and comparing mapped
 * pages between processes.
 *
 * Efficient users of this interface will use /proc/pid/maps to
 * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
*/ static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{ struct mm_struct *mm = file->private_data; struct pagemapread pm; unsignedlong src; unsignedlong svpfn; unsignedlong start_vaddr; unsignedlong end_vaddr; int ret = 0, copied = 0;
if (!mm || !mmget_not_zero(mm)) goto out;
ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_mm;
ret = 0; if (!count) goto out_mm;
/* do not disclose physical addresses: attack vector */
pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
ret = -ENOMEM; if (!pm.buffer) goto out_mm;
/* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for * swapped pages.
*/ if (pte_present(pte)) {
categories |= PAGE_IS_PRESENT; if (!huge_pte_uffd_wp(pte))
categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte)))
categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte)))
categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
} elseif (is_swap_pte(pte)) {
categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
}
/*
 * Decide whether this VMA takes part in the scan. A negative return is an
 * error; per the comments below, returning 1 skips the VMA.
 */
if (!wp_allowed) { /* User requested explicit failure over wp-async capability */ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) return -EPERM; /* User requires wr-protect, and allows silently skipping unsupported vmas.
*/ if (p->arg.flags & PM_SCAN_WP_MATCHING) return 1; /* Then the request doesn't involve wr-protects at all, fall through to the rest checks, and allow vma walk.
*/
}
/* VM_PFNMAP mappings get the same return value as a skipped VMA */
if (vma->vm_flags & VM_PFNMAP) return 1;
/* collect the per-VMA category bits used by the interest check below */
if (wp_allowed)
vma_category |= PAGE_IS_WPALLOWED;
if (vma->vm_flags & VM_SOFTDIRTY)
vma_category |= PAGE_IS_SOFT_DIRTY;
/* a VMA whose categories are not interesting to this request is skipped too */
if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1;
/*
 * Extend the current output range instead of starting a new one when
 * the new range begins exactly where it ends and has the same categories.
 *
 * When there is no output buffer provided at all, the sentinel values
 * won't match here. There is no other way for `cur_buf->end` to be
 * non-zero other than it being non-empty.
 */
if (addr == cur_buf->end && categories == cur_buf->categories) {
	cur_buf->end = end;
	return true;
}
if (cur_buf->end) { if (p->vec_buf_index >= p->vec_buf_len - 1) returnfalse;
/* Nothing to report or write-protect for a page the request is not interested in. */
if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock;
/* end is passed by pointer, so the call may change it; start == end leaves nothing to process */
ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock;
/* "~x & FLAG" is nonzero when FLAG is clear: stop unless WP_MATCHING was requested and the page is written */
if (~p->arg.flags & PM_SCAN_WP_MATCHING) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock;
/* Break huge page into small pages if the WP operation needs to be performed on a portion of the huge page.
*/ if (end != start + HPAGE_SIZE) {
/* drop the page-table lock before splitting, then back out the range output above */
spin_unlock(ptl);
split_huge_pmd(vma, pmd, start);
pagemap_scan_backout_range(p, start, end); /* Report as if there was no THP */ return -ENOENT;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.