/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *	Yaniv Kamay  <yaniv@qumranet.com>
 *	Avi Kivity   <avi@qumranet.com>
 */
/*
 * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
 * as well as guest EPT tables, so the code in this file is compiled thrice,
 * once per guest PTE type.  The per-type defines are #undef'd at the end.
 */
/* Common logic, but per-type values.  These also need to be undefined. */
#define PT_BASE_ADDR_MASK	((pt_element_t)__PT_BASE_ADDR_MASK)
#define PT_LVL_ADDR_MASK(lvl)	__PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_LVL_OFFSET_MASK(lvl)	__PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_INDEX(addr, lvl)	__PT_INDEX(addr, lvl, PT_LEVEL_BITS)
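/*
 * Illustrative note (editor's sketch, not from the original file): for the
 * 64-bit instantiation, PT_LEVEL_BITS is 9, so each macro reduces to the
 * familiar 4-level paging arithmetic, e.g.:
 *
 *	PT_INDEX(addr, lvl) == (addr >> (12 + 9 * (lvl - 1))) & 511
 *
 * i.e. level 1 indexes bits 20:12 of the address, level 2 bits 29:21, and
 * so on, with 512 eight-byte entries per table.
 */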
/*
 * For PTTYPE_EPT, a page table can be executable but not readable on
 * supported processors.  Therefore, set_spte does not automatically set
 * bit 0 if execute-only is supported.  Here, we repurpose ACC_USER_MASK
 * to signify readability, since it isn't used in the EPT case.
 */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
	access ^= (gpte >> PT64_NX_SHIFT);
#endif

	return access;
}
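/*
 * Worked example (editor's note, not from the original file): with
 * PT64_NX_SHIFT == 63 and ACC_EXEC_MASK == PT_PRESENT_MASK == 1, a
 * present gpte with NX set yields:
 *
 *	access = gpte & (W | U | P);	// bit 0 == P == 1
 *	access ^= gpte >> 63;		// bit 0 ^= NX == 1  ->  0
 *
 * i.e. bit 0 of the result (ACC_EXEC_MASK) ends up set iff the page is
 * present and NX is clear, which is exactly "executable" under x86 paging.
 */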
		/*
		 * If the slot is read-only, simply do not process the
		 * accessed and dirty bits.  This is the correct thing to do
		 * if the slot is ROM, and page tables in read-as-ROM/
		 * write-as-MMIO slots are only supported if the accessed and
		 * dirty bits are already set in the ROM (so that MMIO writes
		 * are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and
		 * dirty bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;
		ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
		if (ret)
			return ret;
static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
				       unsigned int level, unsigned int gpte)
{
	/*
	 * For EPT and PAE paging (both variants), bit 7 is either reserved at
	 * all levels or indicates a huge page (ignoring CR3/EPTP).  In either
	 * case, bit 7 being set terminates the walk.
	 */
#if PTTYPE == 32
	/*
	 * 32-bit paging requires special handling because bit 7 is ignored if
	 * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
	 * greater than the last level for which bit 7 is the PAGE_SIZE bit.
	 *
	 * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
	 * is not reserved and does not indicate a large page at this level,
	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */
	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
#endif
	/*
	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
	 * iff level <= PG_LEVEL_4K, which for our purpose means
	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
	 */
	gpte |= level - PG_LEVEL_4K - 1;

	return gpte & PT_PAGE_SIZE_MASK;
}
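/*
 * Worked example (editor's note, not from the original file): level and
 * PG_LEVEL_4K (== 1) are unsigned, so the RHS underflows for the last level:
 *
 *	gpte |= 1u - 1 - 1;	// level 1: 0xffffffff, bit 7 set -> last gpte
 *	gpte |= 2u - 1 - 1;	// level 2: 0, gpte unchanged
 *
 * The 32-bit PSE mask above relies on the same unsigned-underflow trick:
 * with CR4.PSE=0 at level 2, "gpte &= 2 - 2" zeroes the gpte, so a stale
 * PAGE_SIZE bit can never terminate the walk early.
 */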
	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
pte_access = ~0;
	/*
	 * Queue a page fault for injection if this assertion fails, as callers
	 * assume that walker.fault contains sane info on a walk failure.  I.e.
	 * avoid making the situation worse by inducing even worse badness
	 * between when the assertion fails and when KVM kicks the vCPU out to
	 * userspace (because the VM is bugged).
	 */
	if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
		goto error;
++walker->level;
	do {
		struct kvm_memory_slot *slot;
		unsigned long host_addr;
pt_access = pte_access;
--walker->level;
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
offset = index * sizeof(pt_element_t);
pte_gpa = gfn_to_gpa(table_gfn) + offset;
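		/*
		 * Illustrative note (editor's sketch): gfn_to_gpa() is just
		 * "gfn << PAGE_SHIFT", so for the 64-bit and EPT flavors
		 * (8-byte entries) this computes:
		 *
		 *	pte_gpa = (table_gfn << 12) + index * 8;
		 *
		 * while 32-bit non-PAE paging uses 4-byte entries.
		 */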
		/*
		 * FIXME: This can happen if emulation (e.g. of an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
		if (unlikely(real_gpa == INVALID_GPA))
			return 0;

		slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
		if (!kvm_is_visible_memslot(slot))
			goto error;
	if (!write_fault)
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bits support, accessed_dirty will
		 * always be clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
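	/*
	 * Worked example (editor's note): for ordinary x86 paging,
	 * PT_GUEST_ACCESSED_SHIFT == 5 and PT_GUEST_DIRTY_SHIFT == 6, so the
	 * fold above is simply:
	 *
	 *	accessed_dirty &= pte >> 1;	// dirty bit -> accessed slot
	 *
	 * leaving PT_GUEST_ACCESSED_MASK set only if the gpte's dirty bit was
	 * also set.  For EPT A/D bits the shifts are 8 and 9, and the same
	 * trick applies.
	 */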
	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							addr, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}
#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
	 * misconfiguration needs to be injected.  The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from the access bits.  The exit_qualification might be
	 *	   out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		walker->fault.exit_qualification = 0;

		if (write_fault)
			walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;

		/*
		 * Note, pte_access holds the raw RWX bits from the EPTE, not
		 * ACC_*_MASK flags!
		 */
		walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
	}
#endif
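	/*
	 * Editor's note, for illustration: per the Intel SDM's EPT violation
	 * exit qualification layout, bits 2:0 report the attempted access
	 * (read/write/fetch, matching the EPT_VIOLATION_ACC_* flags above)
	 * and bits 5:3 report the protections the walked guest EPT entries
	 * grant, which is what EPT_VIOLATION_RWX_TO_PROT() shifts the raw
	 * RWX bits of pte_access into.
	 */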
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
walker->fault.async_page_fault = false;
	/*
	 * If addresses are being invalidated, skip prefetching to avoid
	 * accidentally prefetching those addresses.
	 */
	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);
	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
			break;
}
}
/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation and return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			struct guest_walker *gw)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned int direct_access, access;
	int top_level, ret;
	gfn_t base_gfn = fault->gfn;
	top_level = vcpu->arch.mmu->cpu_role.base.level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		return RET_PF_RETRY;

	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
		return RET_PF_RETRY;
	/*
	 * Load a new root and retry the faulting instruction in the extremely
	 * unlikely scenario that the guest root gfn became visible between
	 * loading a dummy root and handling the resulting page fault, e.g. if
	 * userspace creates a memslot in the interim.
	 */
	if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
		kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
		return RET_PF_RETRY;
	}
		/*
		 * Synchronize the new page before linking it, as the CPU (KVM)
		 * is architecturally disallowed from inserting non-present
		 * entries into the TLB, i.e. the guest isn't required to flush
		 * the TLB when changing the gPTE from non-present to present.
		 *
		 * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
		 * synchronized the page via kvm_sync_page().
		 *
		 * For higher level pages, which cannot themselves be unsync
		 * but can have unsync children, synchronize via the slower
		 * mmu_sync_children().  If KVM needs to drop mmu_lock due to
		 * contention or to reschedule, instruct the caller to retry
		 * the #PF (mmu_sync_children() ensures forward progress will
		 * be made).
		 */
		if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
		    mmu_sync_children(vcpu, sp, false))
			return RET_PF_RETRY;
		/*
		 * Verify that the gpte in the page, which is now either
		 * write-protected or unsync, wasn't modified between the fault
		 * and acquiring mmu_lock.  This needs to be done even when
		 * reusing an existing shadow page to ensure the information
		 * gathered by the walker matches the information stored in the
		 * shadow page (which could have been modified by a different
		 * vCPU even if the page was already linked).  Holding mmu_lock
		 * prevents the shadow page from changing after this point.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			return RET_PF_RETRY;

		if (sp != ERR_PTR(-EEXIST))
			link_shadow_page(vcpu, it.sptep, sp);
if (fault->write && table_gfn == fault->gfn)
fault->write_fault_to_shadow_pgtable = true;
}
	/*
	 * Adjust the hugepage size _after_ resolving indirect shadow pages.
	 * KVM doesn't support mapping hugepages into the guest for gfns that
	 * are being shadowed by KVM, i.e. allocating a new shadow page may
	 * affect the allowed hugepage size.
	 */
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, *it.sptep, it.level);

		base_gfn = gfn_round_for_level(fault->gfn, it.level);
		if (it.level == fault->goal_level)
			break;
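		/*
		 * Illustrative note (editor's sketch): gfn_round_for_level()
		 * aligns the gfn down to the mapping size of the level, e.g.
		 * for a 2MiB mapping (level 2, 512 base pages):
		 *
		 *	base_gfn = fault->gfn & ~(gfn_t)511;
		 */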
/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct guest_walker walker;
	int r;

	WARN_ON_ONCE(fault->is_tdp);
	/*
	 * Look up the guest pte for the faulting address.
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
fault->error_code & ~PFERR_RSVD_MASK);
	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		if (!fault->prefetch)
			kvm_inject_emulated_page_fault(vcpu, &walker.fault);

		return RET_PF_RETRY;
	}
	if (page_fault_handle_page_track(vcpu, fault)) {
		shadow_page_table_clear_flood(vcpu, fault->addr);
		return RET_PF_WRITE_PROTECTED;
	}
	r = mmu_topup_memory_caches(vcpu, true);
	if (r)
		return r;
	r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access);
	if (r != RET_PF_CONTINUE)
		return r;
#if PTTYPE != PTTYPE_EPT
	/*
	 * Treat the guest PTE protections as writable, supervisor-only if this
	 * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
	 * PTE.W if CR0.WP=0).  Don't change the access type for emulated MMIO,
	 * otherwise KVM will cache incorrect access information in the SPTE.
	 */
	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page,
		 * so that the kernel can write to it when cr0.wp=0,
		 * then we should prevent the kernel from executing it
		 * if SMEP is enabled.
		 */
		if (is_cr4_smep(vcpu->arch.mmu))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}
#endif
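	/*
	 * Worked example (editor's note): a read-only, executable user page
	 * (pte_access == ACC_USER_MASK | ACC_EXEC_MASK) hit by a CR0.WP=0
	 * supervisor write is transformed as:
	 *
	 *	U|X  ->  U|X|W  ->  X|W  ->  W	// last step only with SMEP
	 *
	 * so the resulting SPTE is writable but supervisor-only, and user
	 * accesses fault back into KVM rather than use the more permissive
	 * mapping.
	 */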
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
	if (is_page_fault_stale(vcpu, fault))
		goto out_unlock;

	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;
r = FNAME(fetch)(vcpu, fault, &walker);
/*
 * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
 * safe because SPTEs are protected by mmu_notifiers and memslot generations, so
 * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are
 * nuked first.
 *
 * Returns
 * < 0: failed to sync spte
 *   0: the spte is synced and no tlb flushing is required
 * > 0: the spte is synced and tlb flushing is required
 */
static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
	bool host_writable;
	gpa_t first_pte_gpa;
	u64 *sptep, spte;
	struct kvm_memory_slot *slot;
	unsigned pte_access;
	pt_element_t gpte;
	gpa_t pte_gpa;
	gfn_t gfn;
	if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
			 !sp->shadowed_translation))
		return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
	if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t)))
		return -1;
	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
		return 1;
	if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
		return 0;
	/*
	 * Drop the SPTE if the new protections result in no effective
	 * "present" bit or if the gfn is changing.  The former case
	 * only affects EPT with execute-only support with pte_access==0;
	 * all other paging modes will create a read-only SPTE if
	 * pte_access is zero.
	 */
	if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
	    gfn != kvm_mmu_page_get_gfn(sp, i)) {
		drop_spte(vcpu->kvm, &sp->spt[i]);
		return 1;
	}

	/*
	 * Do nothing if the permissions are unchanged.  The existing SPTE is
	 * still valid, and prefetch_invalid_gpte() has verified that the A/D
	 * bits are set in the "new" gPTE, i.e. there is no danger of missing
	 * an A/D update due to A/D bits being set in the SPTE but not the
	 * gPTE.
	 */
	if (kvm_mmu_page_get_access(sp, i) == pte_access)
		return 0;
/* Update the shadowed access bits in case they changed. */
kvm_mmu_page_set_access(sp, i, pte_access);
	/*
	 * There is no need to mark the pfn dirty, as the new protections must
	 * be a subset of the old protections, i.e. synchronizing a SPTE cannot
	 * change the SPTE from read-only to writable.
	 */
	return mmu_spte_update(sptep, spte);
}