// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com>
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions.
*/ staticbool __read_mostly nested = 1;
module_param(nested, bool, 0444);
/* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3.
*/ staticunsignedint ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */ staticunsignedint ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */ staticunsignedint ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; #ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO); #endif
/* If set to auto use the default l1tf mitigation method */ if (l1tf == VMENTER_L1D_FLUSH_AUTO) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER; break; case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT:
l1tf = VMENTER_L1D_FLUSH_COND; break; case L1TF_MITIGATION_FULL: case L1TF_MITIGATION_FULL_FORCE:
l1tf = VMENTER_L1D_FLUSH_ALWAYS; break;
}
} elseif (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { /* * This allocation for vmx_l1d_flush_pages is not tied to a VM * lifetime and so should not be charged to a memcg.
*/
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM;
vmx_l1d_flush_pages = page_address(page);
/* * Initialize each page with a different pattern in * order to protect against KSM in the nested * virtualization case.
*/ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
}
l1tf_vmx_mitigation = l1tf;
if (l1tf != VMENTER_L1D_FLUSH_NEVER)
static_branch_enable(&vmx_l1d_should_flush); else
static_branch_disable(&vmx_l1d_should_flush);
if (l1tf == VMENTER_L1D_FLUSH_COND)
static_branch_enable(&vmx_l1d_flush_cond); else
static_branch_disable(&vmx_l1d_flush_cond); return 0;
}
if (s) { for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { if (vmentry_l1d_param[i].for_parse &&
sysfs_streq(s, vmentry_l1d_param[i].option)) return i;
}
} return -EINVAL;
}
staticint vmentry_l1d_flush_set(constchar *s, conststruct kernel_param *kp)
{ int l1tf, ret;
l1tf = vmentry_l1d_flush_parse(s); if (l1tf < 0) return l1tf;
if (!boot_cpu_has(X86_BUG_L1TF)) return 0;
/* * Has vmx_init() run already? If not then this is the pre init * parameter parsing. In that case just store the value and let * vmx_init() do the proper setup after enable_ept has been * established.
*/ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
vmentry_l1d_flush_param = l1tf; return 0;
}
mutex_lock(&vmx_l1d_flush_mutex);
ret = vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex); return ret;
}
msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
msr |= FB_CLEAR_DIS;
native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */
vmx->msr_ia32_mcu_opt_ctrl = msr;
}
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{ if (!vmx->disable_fb_clear) return;
staticvoid vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{ /* * Disable VERW's behavior of clearing CPU buffers for the guest if the * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled * the mitigation. Disabling the clearing behavior provides a * performance boost for guests that aren't aware that manually clearing * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry * and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
/* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS * at VMEntry. Skip the MSR read/write when a guest has no use case to * execute VERW.
*/ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
(vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
(vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
(vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
(vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
vmx->disable_fb_clear = false;
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
/* * Enlightened VMCS usage should be recommended and the host needs * to support eVMCS v1 or above.
*/ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
KVM_EVMCS_VERSION) {
/* Check that we have assist pages on all online CPUs */
for_each_online_cpu(cpu) { if (!hv_get_vp_assist_page(cpu)) {
enlightened_vmcs = false; break;
}
}
if (enlightened_vmcs) {
pr_info("Using Hyper-V Enlightened VMCS\n");
static_branch_enable(&__kvm_is_using_evmcs);
}
/* * KVM should enable eVMCS if and only if all CPUs have a VP assist * page, and should reject CPU onlining if eVMCS is enabled the CPU * doesn't have a VP assist page allocated.
*/
vp_ap = hv_get_vp_assist_page(smp_processor_id()); if (WARN_ON_ONCE(!vp_ap)) return;
/* * Reset everything to support using non-enlightened VMCS access later * (e.g. when we reload the module with enlightened_vmcs=0)
*/
vp_ap->nested_control.features.directhypercall = 0;
vp_ap->current_nested_vmcs = 0;
vp_ap->enlighten_vmentry = 0;
}
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{ int i;
i = kvm_find_user_return_msr(msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL;
}
staticint vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data)
{ unsignedint slot = msr - vmx->guest_uret_msrs; int ret = 0;
if (msr->load_into_hardware) {
preempt_disable();
ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
} if (!ret)
msr->data = data; return ret;
}
/* * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) * * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to * atomically track post-VMXON state, e.g. this may be called in NMI context. * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. * faults are guaranteed to be due to the !post-VMXON check unless the CPU is * magically in RM, VM86, compat mode, or at CPL>0.
*/ staticint kvm_cpu_vmxoff(void)
{ asmgoto("1: vmxoff\n\t"
_ASM_EXTABLE(1b, %l[fault])
::: "cc", "memory" : fault);
void vmx_emergency_disable_virtualization_cpu(void)
{ int cpu = raw_smp_processor_id(); struct loaded_vmcs *v;
kvm_rebooting = true;
/* * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be * set in task context. If this races with VMX is disabled by an NMI, * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to * kvm_rebooting set.
*/ if (!(__read_cr4() & X86_CR4_VMXE)) return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs); if (v->shadow_vmcs)
vmcs_clear(v->shadow_vmcs);
}
kvm_cpu_vmxoff();
}
staticvoid __loaded_vmcs_clear(void *arg)
{ struct loaded_vmcs *loaded_vmcs = arg; int cpu = raw_smp_processor_id();
if (loaded_vmcs->cpu != cpu) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
vmcs_clear(loaded_vmcs->vmcs); if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
vmcs_clear(loaded_vmcs->shadow_vmcs);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/* * Ensure all writes to loaded_vmcs, including deleting it from its * current percpu list, complete before setting loaded_vmcs->cpu to * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first * and add loaded_vmcs to its percpu list before it's deleted from this * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR); /* * #VE isn't used for VMX. To test against unexpected changes * related to #VE for VMX, intercept unexpected #VE and warn on it.
*/ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
eb |= 1u << VE_VECTOR; /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway * as VMware does.
*/ if (enable_vmware_backdoor)
eb |= (1u << GP_VECTOR); if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0; if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass * them to L1. When running L2, we will only handle the exceptions * specified above if L1 did not want them.
*/ if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap; else { int mask = 0, match = 0;
if (enable_ept && (eb & (1u << PF_VECTOR))) { /* * If EPT is enabled, #PF is currently only intercepted * if MAXPHYADDR is smaller on the guest than on the * host. In that case we only care about present, * non-reserved faults. For vmcs02, however, PFEC_MASK * and PFEC_MATCH are set in prepare_vmcs02_rare.
*/
mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
match = PFERR_PRESENT_MASK;
}
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
/* * Disabling xfd interception indicates that dynamic xfeatures * might be used in the guest. Always trap #NM in this case * to save guest xfd_err timely.
*/ if (vcpu->arch.xfd_no_write_intercept)
eb |= (1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/* * Check if MSR is intercepted for currently loaded MSR bitmap.
*/ staticbool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) returntrue;
if (vmx->loaded_vmcs->launched)
flags |= VMX_RUN_VMRESUME;
/* * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl.
*/ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
if (static_branch_unlikely(&cpu_buf_vm_clear) &&
kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_EFER,
VM_EXIT_LOAD_IA32_EFER,
GUEST_IA32_EFER,
HOST_IA32_EFER,
guest_val, host_val); return;
} break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
GUEST_IA32_PERF_GLOBAL_CTRL,
HOST_IA32_PERF_GLOBAL_CTRL,
guest_val, host_val); return;
} break; case MSR_IA32_PEBS_ENABLE: /* PEBS needs a quiescent period after being disabled (to write * a record). Disabling PEBS through VMX MSR swapping doesn't * provide that period, so a CPU could write host's record into * guest's memory.
*/
wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}
i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only)
j = vmx_find_loadstore_msr_slot(&m->host, msr);
/* Shadow paging assumes NX to be available. */ if (!enable_ept)
guest_efer |= EFER_NX;
/* * LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE; #endif
/* * On EPT, we can't emulate NX, so we must switch EFER atomically. * On CPUs that support "load IA32_EFER", always switch EFER * atomically, since it's faster than switching it manually.
*/ if (cpu_has_load_ia32_efer() ||
(enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME; if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, kvm_host.efer, false); else
clear_atomic_switch_msr(vmx, MSR_EFER); returnfalse;
}
i = kvm_find_user_return_msr(MSR_EFER); if (i < 0) returnfalse;
#ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the * VMCS rather than the segment table. KVM uses this helper to figure * out the current bases to poke them into the VMCS before entry.
*/ staticunsignedlong segment_base(u16 selector)
{ struct desc_struct *table; unsignedlong v;
staticinlinebool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{ /* The base must be 128-byte aligned and a legal physical address. */ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) {
rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
staticvoid pt_guest_enter(struct vcpu_vmx *vmx)
{ if (vmx_pt_mode_is_system()) return;
/* * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry.
*/
rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
wrmsrq(MSR_IA32_RTIT_CTL, 0);
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
}
staticvoid pt_guest_exit(struct vcpu_vmx *vmx)
{ if (vmx_pt_mode_is_system()) return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
}
/* * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
*/ if (vmx->pt_desc.host.ctl)
wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
/* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA.
*/ if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true; for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (!vmx->guest_uret_msrs[i].load_into_hardware) continue;
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
if (vt->guest_state_loaded) return;
host_state = &vmx->loaded_vmcs->host_state;
/* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1.
*/
host_state->ldt_sel = kvm_read_ldt();
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
/* * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to * this cpu's percpu list, otherwise it may not yet be deleted * from its previous cpu's percpu list. Pairs with the * smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
if (!already_loaded) { void *gdt = get_current_gdt_ro();
/* * Flush all EPTP/VPID contexts, the new pCPU may have stale * TLB entries from its previous association with the vCPU.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
(unsignedlong)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsignedlong)gdt); /* 22.2.4 */
/* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken.
*/ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
/* * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU * is an unrestricted guest in order to mark L2 as needing emulation * if L1 runs L2 as a restricted guest.
*/ if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
vmcs_writel(GUEST_RFLAGS, rflags); return;
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
ret |= KVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
/* * Any MSR write that attempts to change bits marked reserved will * case a #GP fault.
*/ if (data & vmx->pt_desc.ctl_bitmask) return 1;
/* * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will * result in a #GP unless the same write also clears TraceEn.
*/ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
(data & RTIT_CTL_TRACEEN) &&
data != vmx->pt_desc.guest.ctl) return 1;
/* * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit * and FabricEn would cause #GP, if * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
*/ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
!(data & RTIT_CTL_FABRIC_EN) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)) return 1;
/* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that * utilize encodings marked reserved will cause a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
!test_bit((data & RTIT_CTL_MTC_RANGE) >>
RTIT_CTL_MTC_RANGE_OFFSET, &value)) return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cycle_thresholds); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_CYC_THRESH) >>
RTIT_CTL_CYC_THRESH_OFFSET, &value)) return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_PSB_FREQ) >>
RTIT_CTL_PSB_FREQ_OFFSET, &value)) return 1;
/* * If ADDRx_CFG is reserved or the encodings is >2 will * cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) return 1;
return 0;
}
int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len)
{ /* * Emulation of instructions in SGX enclaves is impossible as RIP does * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only).
*/ if (vmx_get_exit_reason(vcpu).enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT;
}
/* Check that emulation is possible during event vectoring */ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
!kvm_can_emulate_event_vectoring(emul_type)) return X86EMUL_UNHANDLEABLE_VECTORING;
/* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on * undefined behavior: Intel's SDM doesn't mandate the VMCS field be * set when EPT misconfig occurs. In practice, real hardware updates * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors * (namely Hyper-V) don't set it due to it being undefined behavior, * i.e. we end up advancing IP with some random value.
*/ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
/* * Emulating an enclave's instructions isn't supported as KVM * cannot access the enclave's memory or its true RIP, e.g. the * vmcs.GUEST_RIP points at the exit point of the enclave, not * the RIP that actually triggered the VM-Exit. But, because * most instructions that cause VM-Exit will #UD in an enclave, * most instruction-based VM-Exits simply do not occur. * * There are a few exceptions, notably the debug instructions * INT1ICEBRK and INT3, as they are allowed in debug enclaves * and generate #DB/#BP as expected, which KVM might intercept. * But again, the CPU does the dirty work and saves an instr * length of zero so VMMs don't shoot themselves in the foot. * WARN if KVM tries to skip a non-zero length instruction on * a VM-Exit from an enclave.
*/ if (!instr_len) goto rip_updated;
WARN_ONCE(exit_reason.enclave_mode, "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit * mode, but just finding out that we are in 64-bit mode is * quite expensive. Only do it if there was a carry.
*/ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
rip = (u32)rip; #endif
kvm_rip_write(vcpu, rip);
} else { if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0;
}
rip_updated: /* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
return 1;
}
/* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery.
*/ void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!is_guest_mode(vcpu)) return;
/* * Per the SDM, MTF takes priority over debug-trap exceptions besides * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps * or ICEBP (in the emulator proper), and skipping of ICEBP after an * intercepted #DB deliberately avoids single-step #DB and MTF updates * as ICEBP is higher priority than both. As instruction emulation is * completed at this point (i.e. KVM is at the instruction boundary), * any #DB exception pending delivery must be a debug-trap of lower * priority than MTF. Record the pending MTF state to be delivered in * vmx_check_nested_events().
*/ if (nested_cpu_has_mtf(vmcs12) &&
(!vcpu->arch.exception.pending ||
vcpu->arch.exception.vector == DB_VECTOR) &&
(!vcpu->arch.exception_vmexit.pending ||
vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
vmx->nested.mtf_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
vmx->nested.mtf_pending = false;
}
}
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu); return skip_emulated_instruction(vcpu);
}
staticvoid vmx_clear_hlt(struct kvm_vcpu *vcpu)
{ /* * Ensure that we clear the HLT state in the VMCS. We don't need to * explicitly skip the instruction because if the HLT state is set, * then the instruction is already executing and RIP has already been * advanced.
*/ if (kvm_hlt_in_guest(vcpu->kvm) &&
vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
if (ex->has_error_code) { /* * Despite the error code being architecturally defined as 32 * bits, and the VMCS field being 32 bits, Intel CPUs and thus * VMX don't actually supporting setting bits 31:16. Hardware * will (should) never provide a bogus error code, but AMD CPUs * do generate error codes with bits 31:16 set, and so KVM's * ABI lets userspace shove in arbitrary 32-bit values. Drop * the upper bits to avoid VM-Fail, losing information that * doesn't really exist is preferable to killing the VM.
*/
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) { int inc_eip = 0; if (kvm_exception_is_soft(ex->vector))
inc_eip = vcpu->arch.event_exit_inst_len;
kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); return;
}
/* * Configuring user return MSRs to automatically save, load, and restore MSRs * that need to be shoved into hardware when running the guest. Note, omitting * an MSR here does _NOT_ mean it's not emulated, only that it will not be * loaded into hardware when running the guest.
*/ staticvoid vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{ #ifdef CONFIG_X86_64 bool load_syscall_msrs;
/* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set.
*/
load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
(vmx->vcpu.arch.efer & EFER_SCE);
/* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new * kernel and old userspace. If those guests run on a tsx=off host, do * allow guests to use TSX_CTRL, but don't change the value in hardware * so that TSX remains always disabled.
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
/* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter.
*/
vmx->guest_uret_msrs_loaded = false;
}
/* * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain * backwards compatibility even though KVM doesn't support emulating SMX. And * because userspace set "VMX in SMX", the guest must also be allowed to set it, * e.g. if the MSR is left unlocked and the guest does a RMW operation.
*/ #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
FEAT_CTL_SGX_LC_ENABLED | \
FEAT_CTL_SGX_ENABLED | \
FEAT_CTL_LMCE_ENABLED)
/* * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are * exposed to the guest.
*/
WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
~KVM_SUPPORTED_FEATURE_CONTROL);
if (!msr->host_initiated &&
(vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) returnfalse;
if (msr->host_initiated)
valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; else
valid_bits = vmx->msr_ia32_feature_control_valid_bits;
return !(msr->data & ~valid_bits);
}
int vmx_get_feature_msr(u32 msr, u64 *data)
{ switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: return KVM_MSR_RET_UNSUPPORTED;
}
}
/* * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called.
*/ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr;
u32 index;
switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE:
msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE:
msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE:
msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated &&
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1;
msr_info->data = vmx->msr_ia32_umwait_control; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated &&
!guest_has_spec_ctrl_msr(vcpu)) return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl; break; case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
FEAT_CTL_LMCE_ENABLED)) return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data)) return 1; #ifdef CONFIG_KVM_HYPERV /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V * versions are either trying to use them and fail or do some * sanity checking and refuse to boot. Filter all unsupported * features out.
*/ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data); #endif break; case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) return 1;
msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: if (!vmx_pt_mode_is_host_guest()) return 1;
msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: if (!vmx_pt_mode_is_host_guest() ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering)) return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))) return 1;
msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))) return 1;
msr_info->data = vmx->pt_desc.guest.output_mask; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() ||
(index >= 2 * vmx->pt_desc.num_address_ranges)) return 1; if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmx_guest_debugctl_read(); break; default:
find_uret_msr:
msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) {
msr_info->data = msr->data; break;
} return kvm_get_msr_common(vcpu, msr_info);
}
/* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called.
*/ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
u32 index;
switch (msr_index) { case MSR_EFER:
ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data); break; case MSR_IA32_XFD:
ret = kvm_set_msr_common(vcpu, msr_info); /* * Always intercepting WRMSR could incur non-negligible * overhead given xfd might be changed frequently in * guest context switch. Disable write interception * upon the first write with a non-zero value (indicating * potential usage on dynamic xfeatures). Also update * exception bitmap to trap #NM for proper virtualization * of guest xfd_err.
*/ if (!ret && data) {
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
MSR_TYPE_RW);
vcpu->arch.xfd_no_write_intercept = true;
vmx_update_exception_bitmap(vcpu);
} break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu))
get_vmcs12(vcpu)->guest_sysenter_cs = data;
vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: if (is_guest_mode(vcpu)) {
data = nested_vmx_truncate_sysenter_addr(vcpu, data);
get_vmcs12(vcpu)->guest_sysenter_eip = data;
}
vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: if (is_guest_mode(vcpu)) {
data = nested_vmx_truncate_sysenter_addr(vcpu, data);
get_vmcs12(vcpu)->guest_sysenter_esp = data;
}
vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) return 1;
data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
vmx_guest_debugctl_write(vcpu, data);
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu); return 0; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD)) return 1;
vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1;
/* The reserved bit 1 and non-32 bit [63:32] should be zero */ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) return 1;
vmx->msr_ia32_umwait_control = data; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated &&
!guest_has_spec_ctrl_msr(vcpu)) return 1;
if (kvm_spec_ctrl_test_value(data)) return 1;
vmx->spec_ctrl = data; if (!data) break;
/* * For non-nested: * When it's written (to non-zero) for the first time, pass * it through. * * For nested: * The handling of the MSR bitmap for L2 guests is done in * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now.
*/
vmx_disable_intercept_for_msr(vcpu,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW); break; case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated &&
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; goto find_uret_msr; case MSR_IA32_CR_PAT:
ret = kvm_set_msr_common(vcpu, msr_info); if (ret) break;
if (is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
get_vmcs12(vcpu)->guest_ia32_pat = data;
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, data); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
FEAT_CTL_LMCE_ENABLED)) ||
(data & ~MCG_EXT_CTL_LMCE_EN)) return 1;
vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) return 1;
vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
/* SGX may be enabled/disabled by guest's firmware */
vmx_write_encls_bitmap(vcpu, NULL); break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: /* * On real hardware, the LE hash MSRs are writable before * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), * at which point SGX related bits in IA32_FEATURE_CONTROL * become writable. * * KVM does not emulate SGX activation for simplicity, so * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL * is unlocked. This is technically not architectural * behavior, but it's close enough.
*/ if (!msr_info->host_initiated &&
(!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) return 1;
vmx->msr_ia32_sgxlepubkeyhash
[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest() ||
vmx_rtit_ctl_check(vcpu, data) ||
vmx->nested.vmxon) return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) return 1; if (data & MSR_IA32_RTIT_STATUS_MASK) return 1;
vmx->pt_desc.guest.status = data; break; case MSR_IA32_RTIT_CR3_MATCH: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering)) return 1;
vmx->pt_desc.guest.cr3_match = data; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)) return 1; if (!pt_output_base_valid(vcpu, data)) return 1;
vmx->pt_desc.guest.output_base = data; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)) return 1;
vmx->pt_desc.guest.output_mask = data; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: if (!pt_can_write_msr(vmx)) return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; if (is_noncanonical_msr_address(data, vcpu)) return 1; if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data; else
vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) !=
(kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1;
} if (data & PERF_CAP_PEBS_FORMAT) { if ((data & PERF_CAP_PEBS_MASK) !=
(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) return 1; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1;
}
ret = kvm_set_msr_common(vcpu, msr_info); break;
default:
find_uret_msr:
msr = vmx_find_uret_msr(vmx, msr_index); if (msr)
ret = vmx_set_guest_uret_msr(vmx, msr, data); else
ret = kvm_set_msr_common(vcpu, msr_info);
}
/* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
vmx_update_fb_clear_dis(vcpu, vmx);
switch (reg) { case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); break; case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; case VCPU_EXREG_PDPTR: if (enable_ept)
ept_save_pdptrs(vcpu); break; case VCPU_EXREG_CR0:
guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
vcpu->arch.cr0 &= ~guest_owned_bits;
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: /* * When intercepting CR3 loads, e.g. for shadowing paging, KVM's * CR3 is loaded into hardware, not the guest's CR3.
*/ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4:
guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
/* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping * ENCLS whenever it's supported in hardware. It does not matter whether * the host OS supports or has enabled SGX.
*/ staticbool cpu_has_sgx(void)
{ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
/* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always * intercepts writes to PAT and EFER, i.e. never enables those controls.
*/ struct {
u32 entry_control;
u32 exit_control;
} const vmcs_entry_exit_pairs[] = {
{ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
{ VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
{ VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
{ VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
{ VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
};
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control)) return -EIO; if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control)) return -EIO;
} if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
#ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
vmx_cap->ept) {
pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n");
if (error_on_inconsistent_vmcs_config) return -EIO;
vmx_cap->ept = 0;
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
} if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n");
if (error_on_inconsistent_vmcs_config) return -EIO;
vmx_cap->vpid = 0;
}
if (!cpu_has_sgx())
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
_cpu_based_3rd_exec_control =
adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS3);
if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control)) return -EIO;
if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control)) return -EIO;
if (cpu_has_broken_vmx_preemption_timer())
_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control)) return -EIO;
if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
_vmentry_control, _vmexit_control)) return -EIO;
/* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*/ switch (boot_cpu_data.x86_vfm) { case INTEL_NEHALEM_EP: /* AAK155 */ case INTEL_NEHALEM: /* AAP115 */ case INTEL_WESTMERE: /* AAT100 */ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ case INTEL_NEHALEM_EX: /* BA97 */
_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); break; default: break;
}
rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) return -EIO;
#ifdef CONFIG_X86_64 /* * KVM expects to be able to shove all legal physical addresses into * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always * 0 for processors that support Intel 64 architecture".
*/ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EIO; #endif
/* Require Write-Back (WB) memory type for VMCS accesses. */ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO;
#if IS_ENABLED(CONFIG_HYPERV) if (enlightened_vmcs)
evmcs_sanitize_exec_ctrls(vmcs_conf); #endif
return 0;
}
staticbool __kvm_is_vmx_supported(void)
{ int cpu = smp_processor_id();
if (!(cpuid_ecx(1) & feature_bit(VMX))) {
pr_err("VMX not supported by CPU %d\n", cpu); returnfalse;
}
if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
!this_cpu_has(X86_FEATURE_VMX)) {
pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); returnfalse;
}
int vmx_enable_virtualization_cpu(void)
{ int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); int r;
if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY;
/* * This can happen if we hot-added a CPU but failed to allocate * VP assist page for it.
*/ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) return -EFAULT;
intel_pt_handle_vmx(1);
r = kvm_cpu_vmxon(phys_addr); if (r) {
intel_pt_handle_vmx(0); return r;
}
return 0;
}
staticvoid vmclear_local_loaded_vmcss(void)
{ int cpu = raw_smp_processor_id(); struct loaded_vmcs *v, *n;
list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
__loaded_vmcs_clear(v);
}
/* * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
*/ void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{ if (!loaded_vmcs->vmcs) return;
loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL; if (loaded_vmcs->msr_bitmap)
free_page((unsignedlong)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM;
/* * When eVMCS is enabled, alloc_vmcs_cpu() sets * vmcs->revision_id to KVM_EVMCS_VERSION instead of * revision_id reported by MSR_IA32_VMX_BASIC. * * However, even though not explicitly documented by * TLFS, VMXArea passed as VMXON argument should * still be marked with revision_id reported by * physical CPU.
*/ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
} return 0;
}
staticvoid fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
{ if (!emulate_invalid_guest_state) { /* * CS and SS RPL should be equal during guest entry according * to VMX spec, but in reality it is not always so. Since vcpu * is in the middle of the transition from real mode to * protected mode it is safe to assume that RPL 0 is a good * default value.
*/ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
save->selector &= ~SEGMENT_RPL_MASK;
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
__vmx_set_segment(vcpu, save, seg);
}
/* * Update real mode segment cache. It may be not up-to-date if segment * register was written while vcpu was in a guest mode.
*/
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
/* * KVM should never use VM86 to virtualize Real Mode when L2 is active, * as using VM86 is unnecessary if unrestricted guest is enabled, and * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 * should VM-Fail and KVM should reject userspace attempts to stuff * CR0.PG=0 when L2 is active.
*/
WARN_ON_ONCE(is_guest_mode(vcpu));
/* * INVEPT must be issued when EPT is enabled, irrespective of VPID, as * the CPU is not required to invalidate guest-physical mappings on * VM-Entry, even if VPID is disabled. Guest-physical mappings are * associated with the root EPT structure and not any particular VPID * (INVVPID also isn't required to invalidate guest-physical mappings).
*/ if (enable_ept) {
ept_sync_global();
} elseif (enable_vpid) { if (cpu_has_vmx_invvpid_global()) {
vpid_sync_vcpu_global();
} else {
vpid_sync_vcpu_single(vmx->vpid);
vpid_sync_vcpu_single(vmx->nested.vpid02);
}
}
}
/* No flush required if the current context is invalid. */ if (!VALID_PAGE(root_hpa)) return;
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->root_role.level)); else
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{ /* * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{ /* * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary.
*/
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
if (enable_ept && !enable_unrestricted_guest) { /* * Ensure KVM has an up-to-date snapshot of the guest's CR3. If * the below code _enables_ CR3 exiting, vmx_cache_reg() will * (correctly) stop reading vmcs.GUEST_CR3 because it thinks * KVM's CR3 is installed.
*/ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
/* * When running with EPT but not unrestricted guest, KVM must * intercept CR3 accesses when paging is _disabled_. This is * necessary because restricted guests can't actually run with * paging disabled, and so KVM stuffs its own CR3 in order to * run the guest when identity mapped page tables. * * Do _NOT_ check the old CR0.PG, e.g. to optimize away the * update, it may be stale with respect to CR3 interception, * e.g. after nested VM-Enter. * * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or * stores to forward them to L1, even if KVM does not need to * intercept them to preserve its identity mapped page tables.
*/ if (!(cr0 & X86_CR0_PG)) {
exec_controls_setbit(vmx, CR3_EXITING_BITS);
} elseif (!is_guest_mode(vcpu)) {
exec_controls_clearbit(vmx, CR3_EXITING_BITS);
} else {
tmp = exec_controls_get(vmx);
tmp &= ~CR3_EXITING_BITS;
tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
exec_controls_set(vmx, tmp);
}
/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
/* * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
*/ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
}
/* depends on vcpu->arch.cr0 to be set to a new value */
vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
staticint vmx_get_max_ept_level(void)
{ if (cpu_has_vmx_ept_5levels()) return 5; return 4;
}
if (update_guest_cr3)
vmcs_writel(GUEST_CR3, guest_cr3);
}
bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsignedlong cr4)
{ /* * We operate under the default treatment of SMM, so VMX cannot be * enabled under SMM. Note, whether or not VMXE is allowed at all, * i.e. is a reserved bit, is handled by common x86 code.
*/ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) returnfalse;
if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) returnfalse;
/* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0.
*/
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; elseif (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
if (!enable_unrestricted_guest) { if (enable_ept) { if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
} elseif (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
}
/* * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in * hardware. To emulate this behavior, SMEP/SMAP/PKU needs * to be manually disabled when guest switches to non-paging * mode. * * If !enable_unrestricted_guest, the CPU is always running * with CR0.PG=1 and CR4 needs to be modified. * If enable_unrestricted_guest, the CPU automatically * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/ if (!is_paging(vcpu))
hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
}
/* * Fix the "Accessed" bit in AR field of segment registers for older * qemu binaries. * IA32 arch specifies that at the time of processor reset the * "Accessed" bit in the AR field of segment registers is 1. And qemu * is setting it to 0 in the userland code. This causes invalid guest * state vmexit when "unrestricted guest" mode is turned on. * Fix for this setup issue in cpu_reset is being pushed in the qemu * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack.
*/ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
var->type |= 0x1; /* Accessed */
if (cs.unusable) returnfalse; if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) returnfalse; if (!cs.s) returnfalse; if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) returnfalse;
} else { if (cs.dpl != cs_rpl) returnfalse;
} if (!cs.present) returnfalse;
/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ returntrue;
}
if (var.unusable) returntrue; if (!var.s) returnfalse; if (!var.present) returnfalse; if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ returnfalse;
}
/* TODO: Add other members to kvm_segment_field to allow checking for other access * rights flags
*/ returntrue;
}
if (tr.unusable) returnfalse; if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ returnfalse; if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ returnfalse; if (!tr.present) returnfalse;
if (ldtr.unusable) returntrue; if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ returnfalse; if (ldtr.type != 2) returnfalse; if (!ldtr.present) returnfalse;
/* * Check if guest state is valid. Returns true if valid, false if * not. * We assume that registers are always usable
*/ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{ /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) returnfalse; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) returnfalse; if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) returnfalse; if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) returnfalse; if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) returnfalse; if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) returnfalse;
} else { /* protected mode guest state checks */ if (!cs_ss_rpl_check(vcpu)) returnfalse; if (!code_segment_valid(vcpu)) returnfalse; if (!stack_segment_valid(vcpu)) returnfalse; if (!data_segment_valid(vcpu, VCPU_SREG_DS)) returnfalse; if (!data_segment_valid(vcpu, VCPU_SREG_ES)) returnfalse; if (!data_segment_valid(vcpu, VCPU_SREG_FS)) returnfalse; if (!data_segment_valid(vcpu, VCPU_SREG_GS)) returnfalse; if (!tr_valid(vcpu)) returnfalse; if (!ldtr_valid(vcpu)) returnfalse;
} /* TODO: * - Add checks on RIP * - Add checks on RFLAGS
*/
if (likely(kvm_vmx->ept_identity_pagetable_done)) goto out;
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
uaddr = __x86_set_memory_region(kvm,
IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
kvm_vmx->ept_identity_map_addr,
PAGE_SIZE); if (IS_ERR(uaddr)) {
r = PTR_ERR(uaddr); goto out;
}
/* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
r = -EFAULT; goto out;
}
}
kvm_vmx->ept_identity_pagetable_done = true;
staticvoid vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{ /* * When KVM is a nested hypervisor on top of Hyper-V and uses * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed.
*/ if (kvm_is_using_evmcs()) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
if (evmcs->hv_enlightenments_control.msr_bitmap)
evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
}
if (type & MSR_TYPE_R) { if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
vmx_clear_msr_bitmap_read(msr_bitmap, msr); else
vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
if (type & MSR_TYPE_W) { if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
vmx_clear_msr_bitmap_write(msr_bitmap, msr); else
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
}
staticvoid vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{ /* * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
*/ constint read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; constint write_idx = read_idx + (0x800 / sizeof(u64)); struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
u8 mode;
if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) return;
/* * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended * registers (0x840 and above) intercepted, KVM doesn't support them. * Intercept all writes by default and poke holes as needed. Pass * through reads for all valid registers by default in x2APIC+APICv * mode, only the current timer count needs on-demand emulation by KVM.
*/ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); else
msr_bitmap[read_idx] = ~0ull;
msr_bitmap[write_idx] = ~0ull;
/* * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use.
*/
vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
!(mode & MSR_BITMAP_MODE_X2APIC));
/* * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated * and freed, and must not be accessed outside of vcpu->mutex. The * vCPU's cached PI NV is valid if and only if posted interrupts * enabled in its vmcs12, i.e. checking the vector also checks that * L1 has enabled posted interrupts for L2.
*/ if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) { /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry.
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
/* * This pairs with the smp_mb_*() after setting vcpu->mode in * vcpu_enter_guest() to guarantee the vCPU sees the event * request if triggering a posted interrupt "fails" because * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as * the smb_wmb() in kvm_make_request() only ensures everything * done before making the request is visible when the request * is visible, it doesn't ensure ordering between the store to * vcpu->requests and the load from vcpu->mode.
*/
smp_mb__after_atomic();
/* the PIR and ON have been set by L1. */
kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0;
} return -1;
} /* * Send interrupt to vcpu via posted interrupt way. * 1. If target vcpu is running(non-root mode), send posted interrupt * notification to vcpu and hardware will sync PIR to vIRR atomically. * 2. If target vcpu isn't running(root mode), kick it to pick up the * interrupt from PIR in next vmentry.
*/ staticint vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{ struct vcpu_vt *vt = to_vt(vcpu); int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector); if (!r) return 0;
/* Note, this is called iff the local APIC is in-kernel. */ if (!vcpu->arch.apic->apicv_active) return -1;
/* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
*/ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
u32 low32, high32; unsignedlong tmpl; unsignedlong cr0, cr3, cr4;
/* * Save the most likely value for this task's CR3 in the VMCS. * We can't use __get_current_cr3_fast() because we're not atomic.
*/
cr3 = __read_cr3();
vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
vmx->loaded_vmcs->host_state.cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */
cr4 = cr4_read_shadow();
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
vmx->loaded_vmcs->host_state.cr4 = cr4;
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in * vmx_prepare_switch_to_host(), in case userspace uses * the null selectors too (the expected case).
*/
vmcs_write16(HOST_DS_SELECTOR, 0);
vmcs_write16(HOST_ES_SELECTOR, 0); #else
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #endif
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
/* * SYSENTER is used for 32-bit system calls on either 32-bit or * 64-bit kernels. It is always zero If neither is allowed, otherwise * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may * have already done so!).
*/ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
/* * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for * nested virtualization and thus allowed to be set in vmcs12.
*/
vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
if (vmx_pt_mode_is_system())
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
/* * Not used by KVM, but fully supported for nesting, i.e. are allowed in * vmcs12 and propagated to vmcs02 when set in vmcs12.
*/
exec_control &= ~(CPU_BASED_RDTSC_EXITING |
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MONITOR_TRAP_FLAG |
CPU_BASED_PAUSE_EXITING);
/* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
CPU_BASED_NMI_WINDOW_EXITING);
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
if (!cpu_need_tpr_shadow(&vmx->vcpu))
exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64 if (exec_control & CPU_BASED_TPR_SHADOW)
exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING); else
exec_control |= CPU_BASED_CR8_STORE_EXITING |
CPU_BASED_CR8_LOAD_EXITING; #endif /* No need to intercept CR3 access or INVPLG when using EPT. */ if (enable_ept)
exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING); if (kvm_mwait_in_guest(vmx->vcpu.kvm))
exec_control &= ~(CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING); if (kvm_hlt_in_guest(vmx->vcpu.kvm))
exec_control &= ~CPU_BASED_HLT_EXITING; return exec_control;
}
/* * IPI virtualization relies on APICv. Disable IPI virtualization if * APICv is inhibited.
*/ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
return exec_control;
}
/* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a * feature has been exposed to the guest in order to correctly emulate faults.
*/ staticinlinevoid
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
u32 control, bool enabled, bool exiting)
{ /* * If the control is for an opt-in feature, clear the control if the * feature is not exposed to the guest, i.e. not enabled. If the * control is opt-out, i.e. an exiting control, clear the control if * the feature _is_ exposed to the guest, i.e. exiting/interception is * disabled for the associated instruction. Note, the caller is * responsible presetting exec_control to set all supported bits.
*/ if (enabled == exiting)
*exec_control &= ~control;
/* * Update the nested MSR settings so that a nested VMM can/can't set * controls for features that are/aren't exposed to the guest.
*/ if (nested &&
kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { /* * All features that can be added or removed to VMX MSRs must * be supported in the first place for nested virtualization.
*/ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
enabled = false;
/* * Wrapper macro for the common case of adjusting a secondary execution control * based on a single guest CPUID bit, with a dedicated feature bit. This also * verifies that the control is actually supported by KVM and hardware.
*/ #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({ \ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ bool __enabled; \
\ if (cpu_has_vmx_##name()) { \
__enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
__enabled, exiting); \
} \
})
/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
if (vmx_pt_mode_is_system())
exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
} if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (kvm_pause_in_guest(vmx->vcpu.kvm))
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
/* * KVM doesn't support VMFUNC for L1, but the control is set in KVM's * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
*/
exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD (handle_vmptrld). We can NOT enable shadow_vmcs here because we don't have yet a current VMCS12
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
/* * PML is enabled/disabled when dirty logging of memsmlots changes, but * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled.
*/ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
/* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either * feature is exposed to the guest. This creates a virtualization hole * if both are supported in hardware but only one is exposed to the * guest, but letting the guest execute RDTSCP or RDPID when either one * is advertised is preferable to emulating the advertised instruction * in KVM on #UD, and obviously better than incorrectly injecting #UD.
*/ if (cpu_has_vmx_rdtscp()) { bool rdpid_or_rdtscp_enabled =
guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
exec_controls_set(vmx, vmx_exec_control(vmx));
if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (vmx->ve_info)
vmcs_write64(VE_INFORMATION_ADDRESS,
__pa(vmx->ve_info));
}
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
if (!enable_vnmi) { /* * Tracking the NMI-blocked state in software is built upon * finding the next open IRQ window. This, in turn, depends on * well-behaving guests: They have to keep IRQs disabled at * least as long as the NMI handler runs. Otherwise we may * cause NMI nesting, maybe breaking the guest. But as this is * highly unlikely, we can live with the residual risk.
*/
vmx->loaded_vmcs->soft_vnmi_blocked = 1;
vmx->loaded_vmcs->vnmi_blocked_time = 0;
}
int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{ if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY;
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) return -EBUSY;
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) returnfalse;
return __vmx_interrupt_blocked(vcpu);
}
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{ if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY;
/* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events.
*/ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return -EBUSY;
return !vmx_interrupt_blocked(vcpu);
}
int vmx_set_tss_addr(struct kvm *kvm, unsignedint addr)
{ void __user *ret;
if (enable_unrestricted_guest) return 0;
mutex_lock(&kvm->slots_lock);
ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
PAGE_SIZE * 3);
mutex_unlock(&kvm->slots_lock);
staticbool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{ switch (vec) { case BP_VECTOR: /* * Update instruction length as we may reinject the exception * from user space while in guest debugging mode.
*/
to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) returnfalse;
fallthrough; case DB_VECTOR: return !(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: case DF_VECTOR: case SS_VECTOR: case GP_VECTOR: case MF_VECTOR: returntrue;
} returnfalse;
}
staticint handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code)
{ /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode.
*/ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0; return kvm_emulate_halt_noskip(vcpu);
} return 1;
} return 0;
}
/* * Forward all other exceptions that are valid in real mode. * FIXME: Breaks guest debugging in real mode, needs to be fixed with * the required debugging infrastructure rework.
*/
kvm_queue_exception(vcpu, vec); return 1;
}
/* * If the host has split lock detection disabled, then #AC is * unconditionally injected into the guest, which is the pre split lock * detection behaviour. * * If the host has split lock detection enabled then #AC is * only injected into the guest when: * - Guest CPL == 3 (user mode) * - Guest has #AC detection enabled in CR0 * - Guest EFLAGS has AC bit set
*/ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) returntrue;
/* * Machine checks are handled by handle_exception_irqoff(), or by * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by * vmx_vcpu_enter_exit().
*/ if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1;
/* * Queue the exception here instead of in handle_nm_fault_irqoff(). * This ensures the nested_vmx check is not skipped so vmexit can * be reflected to L1 (when it intercepts #NM) before reaching this * point.
*/ if (is_nm_fault(intr_info)) {
kvm_queue_exception_p(vcpu, NM_VECTOR,
is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); return 1;
}
if (is_invalid_opcode(intr_info)) return handle_ud(vcpu);
if (WARN_ON_ONCE(is_ve_fault(intr_info))) { struct vmx_ve_information *ve_info = vmx->ve_info;
error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
WARN_ON_ONCE(!enable_vmware_backdoor);
/* * VMware backdoor emulation on #GP interception only handles * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero * error code on #GP.
*/ if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1;
} return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}
/* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing * MMIO, it is better to report an internal error. * See the comments in vmx_handle_exit.
*/ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
vcpu->run->internal.ndata = 4;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
vcpu->run->internal.data[2] = error_code;
vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; return 0;
}
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu); if (enable_ept && !vcpu->arch.apf.host_apf_flags) { /* * EPT will cause page fault only if we need to * detect illegal GPAs.
*/
WARN_ON_ONCE(!allow_smaller_maxphyaddr);
kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); return 1;
} else return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) { case DB_VECTOR:
dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { /* * If the #DB was due to ICEBP, a.k.a. INT1, skip the * instruction. ICEBP generates a trap-like #DB, but * despite its interception control being tied to #DB, * is an instruction intercept, i.e. the VM-Exit occurs * on the ICEBP itself. Use the inner "skip" helper to * avoid single-step #DB and MTF updates, as ICEBP is * higher priority. Note, skipping ICEBP still clears * STI and MOVSS blocking. * * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS * if single-step is enabled in RFLAGS and STI or MOVSS * blocking is active, as the CPU doesn't set the bit * on VM-Exit due to #DB interception. VM-Entry has a * consistency check that a single-step #DB is pending * in this scenario as the previous instruction cannot * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV * don't modify RFLAGS), therefore the one instruction * delay when activating single-step breakpoints must * have already expired. Note, the CPU sets/clears BS * as appropriate for all other VM-Exits types.
*/ if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu)); elseif ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1;
}
kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
fallthrough; case BP_VECTOR: /* * Update instruction length as we may reinject #BP from * user space while in guest debugging mode. Reading it for * #DB as well causes no harm, it is not used in that case.
*/
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: if (vmx_guest_inject_ac(vcpu)) {
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1;
}
/* * Handle split lock. Depending on detection mode this will * either warn and disable split lock detection for this * task or force SIGBUS on it.
*/ if (handle_guest_split_lock(kvm_rip_read(vcpu))) return 1;
fallthrough; default:
kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
kvm_run->ex.exception = ex_no;
kvm_run->ex.error_code = error_code; break;
} return 0;
}
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ staticint handle_set_cr0(struct kvm_vcpu *vcpu, unsignedlong val)
{ if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsignedlong orig_val = val;
/* * We get here when L2 changed cr0 in a way that did not change * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), * but did change L0 shadowed bits. So we first calculate the * effective cr0 value that L1 would like to write into the * hardware. It consists of the L2-owned bits from the new * value combined with the L1-owned bits from L1's guest_cr0.
*/
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
/* analogously to handle_set_cr0 */
val = (val & ~vmcs12->cr4_guest_host_mask) |
(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); if (kvm_set_cr4(vcpu, val)) return 1;
vmcs_writel(CR4_READ_SHADOW, orig_val); return 0;
} else return kvm_set_cr4(vcpu, val);
}
staticint handle_desc(struct kvm_vcpu *vcpu)
{ /* * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this * and other code needs to be updated if UMIP can be guest owned.
*/
BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
staticint handle_dr(struct kvm_vcpu *vcpu)
{ unsignedlong exit_qualification; int dr, dr7, reg; int err = 1;
exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
/* First, if DR does not exist, trigger UD */ if (!kvm_require_dr(vcpu, dr)) return 1;
if (vmx_get_cpl(vcpu) > 0) goto out;
dr7 = vmcs_readl(GUEST_DR7); if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the * guest debugging itself.
*/ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
vcpu->run->debug.arch.dr7 = dr7;
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0;
} else {
kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); return 1;
}
}
if (vcpu->guest_debug == 0) {
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
/* * No more DR vmexits; force a reload of the debug registers * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers.
*/
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; return 1;
}
staticint handle_apic_access(struct kvm_vcpu *vcpu)
{ if (likely(fasteoi)) { unsignedlong exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset;
access_type = exit_qualification & APIC_ACCESS_TYPE;
offset = exit_qualification & APIC_ACCESS_OFFSET; /* * Sane guest uses MOV to write EOI, with written value * not cared. So make a short-circuit here by avoiding * heavy instruction emulation.
*/ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
(offset == APIC_EOI)) {
kvm_lapic_set_eoi(vcpu); return kvm_skip_emulated_instruction(vcpu);
}
} return kvm_emulate_instruction(vcpu, 0);
}
/* * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and * hardware has done any necessary aliasing, offset adjustments, etc... * for the access. I.e. the correct value has already been written to * the vAPIC page for the correct 16-byte chunk. KVM needs only to * retrieve the register value and emulate the access.
*/
u32 offset = exit_qualification & 0xff0;
reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { switch (type) { case INTR_TYPE_NMI_INTR:
vcpu->arch.nmi_injected = false;
vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR:
kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: if (vmx->idt_vectoring_info &
VECTORING_INFO_DELIVER_CODE_MASK) {
has_error_code = true;
error_code =
vmcs_read32(IDT_VECTORING_ERROR_CODE);
}
fallthrough; case INTR_TYPE_SOFT_EXCEPTION:
kvm_clear_exception_queue(vcpu); break; default: break;
}
}
tss_selector = exit_qualification;
if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
type != INTR_TYPE_EXT_INTR &&
type != INTR_TYPE_NMI_INTR))
WARN_ON(!skip_emulated_instruction(vcpu));
/* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6?
*/ return kvm_task_switch(vcpu, tss_selector,
type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
reason, has_error_code, error_code);
}
/* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. * There are errata that may cause this bit to not be set: * AAK134, BY25.
*/ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
enable_vnmi &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
/* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because * if the illegal address is that of a paging structure, then * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code.
*/ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0);
if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1;
/* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA.
*/
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!is_guest_mode(vcpu) &&
!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
trace_kvm_fast_mmio(gpa); return kvm_skip_emulated_instruction(vcpu);
}
/* * Returns true if emulation is required (due to the vCPU having invalid state * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the * current vCPU state.
*/ staticbool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{ struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!vmx->vt.emulation_required) returnfalse;
/* * It is architecturally impossible for emulation to be required when a * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if * guest state is invalid and unrestricted guest is disabled, i.e. KVM * should synthesize VM-Fail instead emulation L2 code. This path is * only reachable if userspace modifies L2 guest state after KVM has * performed the nested VM-Enter consistency checks.
*/ if (vmx->nested.nested_run_pending) returntrue;
/* * KVM only supports emulating exceptions if the vCPU is in Real Mode. * If emulation is required, KVM can't perform a successful VM-Enter to * inject the exception.
*/ return !vmx->rmode.vm86_active &&
(kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}
while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu);
if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1;
if (!kvm_emulate_instruction(vcpu, 0)) return 0;
if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu); return 0;
}
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0; return kvm_emulate_halt_noskip(vcpu);
}
/* * Note, return 1 and not 0, vcpu_run() will invoke * xfer_to_guest_mode() which will create a proper return * code.
*/ if (__xfer_to_guest_mode_work_pending()) return 1;
}
return 1;
}
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu); return 0;
}
return 1;
}
/* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
*/ staticint handle_pause(struct kvm_vcpu *vcpu)
{ if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
/* * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" * VM-execution control is ignored if CPL > 0. OTOH, KVM * never set PAUSE_EXITING and just set PLE if supported, * so the vcpu must be CPL=0 if it gets a PAUSE exit.
*/
kvm_vcpu_on_spin(vcpu, true); return kvm_skip_emulated_instruction(vcpu);
}
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
type = kvm_register_read(vcpu, gpr_index);
/* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all)
*/ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva)) return 1;
/* * PML buffer FULL happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry.
*/ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
enable_vnmi &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
/* * PML buffer already flushed at beginning of VMEXIT. Nothing to do * here.., and there's no userspace involvement needed for PML.
*/ return 1;
}
/* * In the *extremely* unlikely scenario that this is a spurious VM-Exit * due to the timer expiring while it was "soft" disabled, just eat the * exit and re-enter the guest.
*/ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) return EXIT_FASTPATH_REENTER_GUEST;
/* * If the timer expired because KVM used it to force an immediate exit, * then mission accomplished.
*/ if (force_immediate_exit) return EXIT_FASTPATH_EXIT_HANDLED;
/* * If L2 is active, go down the slow path as emulating the guest timer * expiration likely requires synthesizing a nested VM-Exit.
*/ if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE;
staticint handle_preemption_timer(struct kvm_vcpu *vcpu)
{ /* * This non-fastpath handler is reached if and only if the preemption * timer was being used to emulate a guest timer while L2 is active. * All other scenarios are supposed to be handled in the fastpath.
*/
WARN_ON_ONCE(!is_guest_mode(vcpu));
kvm_lapic_expired_hv_timer(vcpu); return 1;
}
/* * When nested=0, all VMX instruction VM Exits filter here. The handlers * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/ staticint handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
}
#ifndef CONFIG_X86_SGX_KVM staticint handle_encls(struct kvm_vcpu *vcpu)
{ /* * SGX virtualization is disabled. There is no software enable bit for * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent * the guest from executing ENCLS (when SGX is supported by hardware).
*/
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
} #endif/* CONFIG_X86_SGX_KVM */
staticint handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{ /* * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit().
*/
to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1;
}
/* * Notify VM exit happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry.
*/ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
/* Do nothing if PML buffer is empty */ if (pml_idx == PML_HEAD_INDEX) return; /* * PML index always points to the next available PML buffer entity * unless PML log has just overflowed.
*/
pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
/* * PML log is written backwards: the CPU first writes the entry 511 * then the entry 510, and so on. * * Read the entries in the same order they were written, to ensure that * the dirty ring is filled in the same order the CPU wrote them.
*/
pml_buf = page_address(vmx->pml_pg);
for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
/* * If KVM is dumping the VMCS, then something has gone wrong * already. Derefencing an address from the VMCS, which could * very well be corrupted, is a terrible idea. The virtual * address is known so use it.
*/
pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
ve_info->exit_reason, ve_info->delivery,
ve_info->exit_qualification,
ve_info->guest_linear_address,
ve_info->guest_physical_address, ve_info->eptp_index);
}
}
/* * The guest has exited. See if we can fix it or if we need userspace * assistance.
*/ staticint __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
u16 exit_handler_index;
/* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been * flushed already. Note, PML is never enabled in hardware while * running L2.
*/ if (enable_pml && !is_guest_mode(vcpu))
vmx_flush_pml_buffer(vcpu);
/* * KVM should never reach this point with a pending nested VM-Enter. * More specifically, short-circuiting VM-Entry to emulate L2 due to * invalid guest state should never happen as that means KVM knowingly * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO;
if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a * PML full exit occurs as something is horribly wrong.
*/ if (exit_reason.basic == EXIT_REASON_PML_FULL) goto unexpected_vmexit;
/* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC * Page). The CPU may write to these pages via their host * physical address while L2 is running, bypassing any * address-translation-based dirty tracking (e.g. EPT write * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking.
*/
nested_mark_vmcs12_pages_dirty(vcpu);
/* * Synthesize a triple fault if L2 state is invalid. In normal * operation, nested VM-Enter rejects any attempt to enter L2 * with invalid state. However, those checks are skipped if * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If * L2 state is invalid, it means either L1 modified SMRAM state * or userspace provided bad state. Synthesize TRIPLE_FAULT as * doing so is architecturally allowed in the RSM case, and is * the least awful solution for the userspace case without * risking false positives.
*/ if (vmx->vt.emulation_required) {
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1;
}
if (nested_vmx_reflect_vmexit(vcpu)) return 1;
}
/* If guest state is invalid, start emulating. L2 is handled above. */ if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu);
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked)) { if (!vmx_interrupt_blocked(vcpu)) {
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
} elseif (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
vcpu->arch.nmi_pending) { /* * This CPU don't support us in finding the end of an * NMI-blocked window if the guest runs with IRQs * disabled. So we pull the trigger after 1 s of * futile waiting, but inform the user about this.
*/
printk(KERN_WARNING "%s: Breaking out of NMI-blocked " "state on VCPU %d after 1 s timeout\n",
__func__, vcpu->vcpu_id);
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
}
}
if (exit_fastpath != EXIT_FASTPATH_NONE) return 1;
int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
/* * Exit to user space when bus lock detected to inform that there is * a bus lock in guest.
*/ if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0)
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
/* * Software based L1D cache flush which is used when microcode providing * the cache control MSR is not loaded. * * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to * flush it is required to read in 64 KiB because the replacement algorithm * is not exactly LRU. This could be sized at runtime via topology * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so.
*/ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{ int size = PAGE_SIZE << L1D_CACHE_ORDER;
/* * This code is only executed when the flush mode is 'cond' or * 'always'
*/ if (static_branch_likely(&vmx_l1d_flush_cond)) { bool flush_l1d;
/* * Clear the per-vcpu flush bit, it gets set again if the vCPU * is reloaded, i.e. if the vCPU is scheduled out or if KVM * exits to userspace, or if KVM reaches one of the unsafe * VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
/* * Clear the per-cpu flush bit, it gets set again from * the interrupt handlers.
*/
flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
kvm_clear_cpu_l1tf_flush_l1d();
if (!flush_l1d) return;
}
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return;
}
asmvolatile( /* First ensure the pages are in the TLB */ "xorl %%eax, %%eax\n" ".Lpopulate_tlb:\n\t" "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" "addl $4096, %%eax\n\t" "cmpl %%eax, %[size]\n\t" "jne .Lpopulate_tlb\n\t" "xorl %%eax, %%eax\n\t" "cpuid\n\t" /* Now fill the cache */ "xorl %%eax, %%eax\n" ".Lfill_cache:\n" "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" "addl $64, %%eax\n\t" "cmpl %%eax, %[size]\n\t" "jne .Lfill_cache\n\t" "lfence\n"
:: [flush_pages] "r" (vmx_l1d_flush_pages),
[size] "r" (size)
: "eax", "ebx", "ecx", "edx");
}
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold;
if (is_guest_mode(vcpu) &&
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return;
switch (kvm_get_apic_mode(vcpu)) { case LAPIC_MODE_INVALID:
WARN_ONCE(true, "Invalid local APIC state"); break; case LAPIC_MODE_DISABLED: break; case LAPIC_MODE_XAPIC: if (flexpriority_enabled) {
sec_exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
/* * Flush the TLB, reloading the APIC access page will * only do so if its physical address has changed, but * the guest may have inserted a non-APIC mapping into * the TLB while the APIC access page was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
} break; case LAPIC_MODE_X2APIC: if (cpu_has_vmx_virtualize_x2apic_mode())
sec_exec_control |=
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; break;
}
secondary_exec_controls_set(vmx, sec_exec_control);
/* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) {
to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; return;
}
if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return;
/* * Explicitly grab the memslot using KVM's internal slot ID to ensure * KVM doesn't unintentionally grab a userspace memslot. It _should_ * be impossible for userspace to create a memslot for the APIC when * APICv is enabled, but paranoia won't hurt in this case.
*/
slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return;
/* * Ensure that the mmu_notifier sequence count is read before KVM * retrieves the pfn from the primary MMU. Note, the memslot is * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() * in kvm_mmu_invalidate_end().
*/
mmu_seq = kvm->mmu_invalidate_seq;
smp_rmb();
/* * No need to retry if the memslot does not exist or is invalid. KVM * controls the APIC-access page memslot, and only deletes the memslot * if APICv is permanently inhibited, i.e. the memslot won't reappear.
*/
pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); if (is_error_noslot_pfn(pfn)) return;
read_lock(&vcpu->kvm->mmu_lock); if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); else
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
/* * Do not pin the APIC access page in memory so that it can be freely * migrated, the MMU notifier will call us again if it is migrated or * swapped out. KVM backs the memslot with anonymous memory, the pfn * should always point at a refcounted page (if the pfn is valid).
*/ if (!WARN_ON_ONCE(!refcounted_page))
kvm_release_page_clean(refcounted_page);
/* * No need for a manual TLB flush at this point, KVM has already done a * flush if there were SPTEs pointing at the previous page.
*/
read_unlock(&vcpu->kvm->mmu_lock);
}
/* * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI * is only relevant for if and only if Virtual Interrupt Delivery is * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested * VM-Exit, otherwise L1 with run with a stale SVI.
*/ if (is_guest_mode(vcpu)) { /* * KVM is supposed to forward intercepted L2 EOIs to L1 if VID * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. * Note, userspace can stuff state while L2 is active; assert * that VID is disabled if and only if the vCPU is in KVM_RUN * to avoid false positives if userspace is setting APIC state.
*/
WARN_ON_ONCE(vcpu->wants_to_run &&
nested_cpu_has_vid(get_vmcs12(vcpu)));
to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; return;
}
if (max_isr == -1)
max_isr = 0;
status = vmcs_read16(GUEST_INTR_STATUS);
old = status >> 8; if (max_isr != old) {
status &= 0xff;
status |= max_isr << 8;
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
status = vmcs_read16(GUEST_INTR_STATUS);
old = (u8)status & 0xff; if ((u8)vector != old) {
status &= ~0xff;
status |= (u8)vector;
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{ struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt;
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO;
if (pi_test_on(&vt->pi_desc)) {
pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
got_posted_interrupt =
kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
got_posted_interrupt = false;
}
/* * Newly recognized interrupts are injected via either virtual interrupt * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is * disabled in two cases: * * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected * into L2, but KVM doesn't use virtual interrupt delivery to inject * interrupts into L2, and so KVM_REQ_EVENT is again needed. * * 2) If APICv is disabled for this vCPU, assigned devices may still * attempt to post interrupts. The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
*/ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
vmx_set_rvi(max_irr); elseif (got_posted_interrupt)
kvm_make_request(KVM_REQ_EVENT, vcpu);
return max_irr;
}
void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{ if (!kvm_vcpu_apicv_active(vcpu)) return;
staticvoid handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{ /* * Save xfd_err to guest_fpu before interrupt is enabled, so the * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM * interception may have been caused by L1 interception. Per the SDM, * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. * unlike CR2 and DR6, the value is not a payload that is attached to * the #NM exception.
*/ if (is_xfd_nm_fault(vcpu))
rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
staticvoid handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{ /* if exit due to PF check for async PF */ if (is_page_fault(intr_info))
vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* if exit due to NM, handle before interrupts are enabled */ elseif (is_nm_fault(intr_info))
handle_nm_fault_irqoff(vcpu); /* Handle machine checks before interrupts are enabled */ elseif (is_machine_check(intr_info))
kvm_machine_check();
}
/* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it.
*/ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{ switch (index) { case MSR_IA32_SMBASE: if (!IS_ENABLED(CONFIG_KVM_SMM)) returnfalse; /* * We cannot do SMM unless we can run the guest in big * real mode.
*/ return enable_unrestricted_guest || emulate_invalid_guest_state; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: case MSR_AMD64_TSC_RATIO: /* This is AMD only. */ returnfalse; default: returntrue;
}
}
if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return;
exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* * SDM 3: 27.7.1.2 (September 2008) * Re-set bit "block by NMI" before VM entry if vmexit caused by * a guest IRET fault. * SDM 3: 23.2.2 (September 2008) * Bit 12 is undefined in any of the following cases: * If the VM exit sets the valid bit in the IDT-vectoring * information field. * If the VM exit is due to a double fault.
*/ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
vector != DF_VECTOR && !idtv_info_valid)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI); else
vmx->loaded_vmcs->nmi_known_unmasked =
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
& GUEST_INTR_STATE_NMI);
} elseif (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(),
vmx->loaded_vmcs->entry_time));
}
staticvoid __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
u32 idt_vectoring_info, int instr_len_field, int error_code_field)
{
u8 vector; int type; bool idtv_info_valid;
if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) return;
if (flags & VMX_RUN_SAVE_SPEC_CTRL)
vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
/* * If the guest/host SPEC_CTRL values differ, restore the host value. * * For legacy IBRS, the IBRS bit always needs to be written after * transitioning from a less privileged predictor mode, regardless of * whether the guest/host values differ.
*/ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
vmx->spec_ctrl != hostval)
native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
barrier_nospec();
}
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{ /* * If L2 is active, some VMX preemption timer exits can be handled in * the fastpath even, all other exits must use the slow path.
*/ if (is_guest_mode(vcpu) &&
vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE;
switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: return handle_fastpath_hlt(vcpu); default: return EXIT_FASTPATH_NONE;
}
}
/* * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. * * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, * and is affected by MMIO Stale Data. In such cases mitigation in only * needed against an MMIO capable guest.
*/ if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu); elseif (static_branch_unlikely(&cpu_buf_vm_clear) &&
(flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
/* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->entry_time = ktime_get();
/* * Don't enter VMX if guest state is invalid, let the exit handler * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail.
*/ if (unlikely(vmx->vt.emulation_required)) {
vmx->fail = 0;
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
vmcs_write32(PLE_WINDOW, vmx->ple_window);
}
/* * We did this in prepare_switch_to_guest, because it needs to * be within srcu_read_lock.
*/
WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
set_debugreg(vcpu->arch.dr6, 6);
if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
vmx_reload_guest_debugctl(vcpu);
/* * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time * it switches back to the current->mm, which can occur in KVM context * when switching to a temporary mm to patch kernel code, e.g. if KVM * toggles a static key while handling a VM-Exit.
*/
cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
vmcs_writel(HOST_CR3, cr3);
vmx->loaded_vmcs->host_state.cr3 = cr3;
}
/* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug * exceptions being set, but that's not correct for the guest debugging
* case. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
kvm_load_guest_xsave_state(vcpu);
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx); if (intel_pmu_lbr_is_enabled(vcpu))
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu, force_immediate_exit); elseif (force_immediate_exit)
smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */ if (kvm_is_using_evmcs()) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vcpu->arch.host_debugctl)
update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64 /* * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * * We can't defer this to vmx_prepare_switch_to_host() since that * function may be executed in interrupt context, which saves and * restore segments around it, nullifying its effect.
*/
loadsegment(ds, __USER_DS);
loadsegment(es, __USER_DS); #endif
pt_guest_exit(vmx);
kvm_load_host_xsave_state(vcpu);
if (is_guest_mode(vcpu)) { /* * Track VMLAUNCH/VMRESUME that have made past guest state * checking.
*/ if (vmx->nested.nested_run_pending &&
!vmx_get_exit_reason(vcpu).failed_vmentry)
++vcpu->stat.nested_run;
vmx->nested.nested_run_pending = 0;
}
if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE;
if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE;
/* * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by * avoiding dealing with cases, such as enabling PML partially on vcpus * for the guest), etc.
*/ if (enable_pml) {
vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto free_vpid;
}
for (i = 0; i < kvm_nr_uret_msrs; ++i)
vmx->guest_uret_msrs[i].mask = -1ull; if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. * Keep the host value unchanged to avoid changing CPUID bits * under the host kernel's feet.
*/
tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (tsx_ctrl)
tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_pml;
/* * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the * feature only for vmcs01, KVM currently isn't equipped to realize any * performance benefits from enabling it for vmcs02.
*/ if (kvm_is_using_evmcs() &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
int vmx_vm_init(struct kvm *kvm)
{ if (!ple_gap)
kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: /* * Warn upon starting the first VM in a potentially * insecure environment.
*/ if (sched_smt_active())
pr_warn_once(L1TF_MSG_SMT); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
pr_warn_once(L1TF_MSG_L1D); break; case L1TF_MITIGATION_FULL_FORCE: /* Flush is enforced */ break;
}
}
if (enable_pml)
kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0;
}
staticinlinebool vmx_ignore_guest_pat(struct kvm *kvm)
{ /* * Non-coherent DMA devices need the guest to flush CPU properly. * In that case it is not possible to map all guest RAM as WB, so * always trust guest PAT.
*/ return !kvm_arch_has_noncoherent_dma(kvm) &&
kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
}
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{ /* * Force UC for host MMIO regions, as allowing the guest to access MMIO * with cacheable accesses will result in Machine Checks.
*/ if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
/* Force WB if ignoring guest PAT */ if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
staticvoid vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{ /* * These bits in the secondary execution controls field * are dynamic, the others are mostly based on the hypervisor * architecture and the guest's CPUID. Do not touch the * dynamic bits.
*/
u32 mask =
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC;
/* * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits * (indicating "allowed-1") if they are supported in the guest's CPUID.
*/ staticvoid nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *entry;
for (i = 0; i < PT_CPUID_LEAVES; i++) {
best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return;
vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
}
/* Get the number of configurable Address Ranges for filtering */
vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges);
/* Initialize and clear the no dependency bits */
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
RTIT_CTL_BRANCH_EN);
/* * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise * will inject an #GP
*/ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
/* * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and * PSBFreq can be set
*/ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
/* * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
*/ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
RTIT_CTL_MTC_RANGE);
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
RTIT_CTL_PTW_EN);
/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
/* unmask address range configure area */ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
/* * XSAVES is effectively enabled if and only if XSAVE is also exposed * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported.
*/ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
if (cpu_has_secondary_exec_ctrls())
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
x86_perf_get_lbr(&vmx_lbr_caps);
/* * KVM requires LBR callstack support, as the overhead due to * context switching LBRs without said support is too high. * See intel_pmu_create_guest_lbr_event() for more info.
*/ if (!vmx_lbr_caps.has_callstack)
memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); elseif (vmx_lbr_caps.nr)
perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
}
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
/* * Disallow adaptive PEBS as it is functionally broken, can be * used by the guest to read *host* LBRs, and can be used to * bypass userspace event filters. To correctly and safely * support adaptive PEBS, KVM needs to: * * 1. Account for the ADAPTIVE flag when (re)programming fixed * counters. * * 2. Gain support from perf (or take direct control of counter * programming) to support events without adaptive PEBS * enabled for the hardware counter. * * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. * * 4. Document which PMU events are effectively exposed to the * guest via adaptive PEBS, and make adaptive PEBS mutually * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
*/
perf_cap &= ~PERF_CAP_PEBS_BASELINE;
}
/* CPUID 0x1 */ if (nested)
kvm_cpu_cap_set(X86_FEATURE_VMX);
/* CPUID 0x7 */ if (kvm_mpx_supported())
kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); if (!cpu_has_vmx_invpcid())
kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); if (vmx_pebs_supported()) {
kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
}
if (!enable_pmu)
kvm_cpu_cap_clear(X86_FEATURE_PDCM);
kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
kvm_cpu_cap_set(X86_FEATURE_UMIP);
/* CPUID 0xD.1 */
kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
/* CPUID 0x80000001 and 0x7 (RDPID) */ if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
}
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
/* * If the 'use IO bitmaps' VM-execution control is 0, IO instruction * VM-exits depend on the 'unconditional IO exiting' VM-execution * control. * * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
*/ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
switch (info->intercept) { case x86_intercept_rdpid: /* * RDPID causes #UD if not enabled through secondary execution * controls (ENABLE_RDTSCP). Note, the implicit MSR access to * TSC_AUX is NOT subject to interception, i.e. checking only * the dedicated execution control is architecturally correct.
*/ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT;
} return X86EMUL_CONTINUE;
case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) return X86EMUL_CONTINUE;
case x86_intercept_lgdt: case x86_intercept_lidt: case x86_intercept_lldt: case x86_intercept_ltr: case x86_intercept_sgdt: case x86_intercept_sidt: case x86_intercept_sldt: case x86_intercept_str: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE;
if (info->intercept == x86_intercept_lldt ||
info->intercept == x86_intercept_ltr ||
info->intercept == x86_intercept_sldt ||
info->intercept == x86_intercept_str)
vm_exit_reason = EXIT_REASON_LDTR_TR; else
vm_exit_reason = EXIT_REASON_GDTR_IDTR; /* * FIXME: Decode the ModR/M to generate the correct exit * qualification for memory operands.
*/ break;
case x86_intercept_hlt: if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) return X86EMUL_CONTINUE;
vm_exit_reason = EXIT_REASON_HLT; break;
case x86_intercept_pause: /* * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides * with vanilla NOPs in the emulator. Apply the interception * check only to actual PAUSE instructions. Don't check * PAUSE-loop-exiting, software can't expect a given PAUSE to * exit, i.e. KVM is within its rights to allow L2 to execute * the PAUSE.
*/ if ((info->rep_prefix != REPE_PREFIX) ||
!nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE;
#ifdef CONFIG_X86_64 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ staticinlineint u64_shl_div_u64(u64 a, unsignedint shift,
u64 divisor, u64 *result)
{
u64 low = a << shift, high = a >> (64 - shift);
/* To avoid the overflow on divq */ if (high >= divisor) return 1;
/* Low hold the result, high hold rem which is discarded */ asm("divq %2\n\t" : "=a" (low), "=d" (high) : "rm" (divisor), "0" (low), "1" (high));
*result = low;
/* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
delta_tsc && u64_shl_div_u64(delta_tsc,
kvm_caps.tsc_scaling_ratio_frac_bits,
vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE;
/* * If the delta tsc can't fit in the 32 bit after the multi shift, * we can't use the preemption timer. * It's possible that it fits on later vmentries, but checking * on every vmentry is costly so we just use an hrtimer.
*/ if (delta_tsc >> (cpu_preemption_timer_multi + 32)) return -ERANGE;
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_cpu_dirty_logging = true; return;
}
/* * Note, nr_memslots_dirty_logging can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value.
*/ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
#ifdef CONFIG_KVM_SMM int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{ /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; return !is_smm(vcpu);
}
int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{ struct vcpu_vmx *vmx = to_vmx(vcpu);
/* * TODO: Implement custom flows for forcing the vCPU out/in of L2 on * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong * SMI and RSM only modify state that is saved and restored via SMRAM. * E.g. most MSRs are left untouched, but many are modified by VM-Exit * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
*/
vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode)
nested_vmx_vmexit(vcpu, -1, 0, 0);
/* * Note, the SDM states that the linear address is masked *after* the modified * canonicality check, whereas KVM masks (untags) the address and then performs * a "normal" canonicality check. Functionally, the two methods are identical, * and when the masking occurs relative to the canonicality check isn't visible * to software, i.e. KVM's behavior doesn't violate the SDM.
*/
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsignedint flags)
{ int lam_bit; unsignedlong cr3_bits;
if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) return gva;
if (!is_64_bit_mode(vcpu)) return gva;
/* * Bit 63 determines if the address should be treated as user address * or a supervisor address.
*/ if (!(gva & BIT_ULL(63))) {
cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) return gva;
/* LAM_U48 is ignored if LAM_U57 is set. */
lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
} else { if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) return gva;
/* * Untag the address by sign-extending the lam_bit, but NOT to bit 63. * Bit 63 is retained from the raw virtual address so that untagging * doesn't change a user access to a supervisor access, and vice versa.
*/ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
/* * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, MSR_STAR is included in the list for i386, * but is never loaded into hardware. MSR_CSTAR is also never loaded * into hardware and is here purely for emulation purposes.
*/ const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
MSR_IA32_TSX_CTRL,
}; int i;
/* * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, * boot_cpu_data.x86_phys_bits holds the actual physical address * w/o the KeyID bits, and kvm_host.maxphyaddr equals to * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
kvm_host.maxphyaddr - 1);
/* * Unlike SME, host kernel doesn't support setting up any * MKTME KeyID on Intel platforms. No memory encryption * bits should be included into the SPTE.
*/
kvm_mmu_set_me_spte_mask(0, me_mask);
}
__init int vmx_hardware_setup(void)
{ unsignedlong host_bndcfgs; struct desc_ptr dt; int r;
store_idt(&dt);
host_idt_base = dt.address;
vmx_setup_user_return_msrs();
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO;
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (boot_cpu_has(X86_FEATURE_MPX)) {
rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
XFEATURE_MASK_BNDCSR);
/* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP;
}
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
if (!cpu_has_virtual_nmis())
enable_vnmi = 0;
#ifdef CONFIG_X86_SGX_KVM if (!cpu_has_vmx_encls_vmexit())
enable_sgx = false; #endif
/* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not * using the APIC_ACCESS_ADDR VMCS field.
*/ if (!flexpriority_enabled)
vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
vt_x86_ops.update_cr8_intercept = NULL;
/* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work.
*/ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
use_timer_freq >>= cpu_preemption_timer_multi;
/* * KVM "disables" the preemption timer by setting it to its max * value. Don't use the timer if it might cause spurious exits * at a rate faster than 0.1 Hz (of uninterrupted guest time).
*/ if (use_timer_freq > 0xffffffffu / 10)
enable_preemption_timer = false;
}
if (!enable_preemption_timer) {
vt_x86_ops.set_hv_timer = NULL;
vt_x86_ops.cancel_hv_timer = NULL;
}
/* * On Intel CPUs that lack self-snoop feature, letting the guest control * memory types may result in unexpected behavior. So always ignore guest * PAT on those CPUs and map VM as writeback, not allowing userspace to * disable the quirk. * * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is * supported, UC is slow enough to cause issues with some older guests (e.g. * an old version of bochs driver uses ioremap() instead of ioremap_wc() to * map the video RAM, causing wayland desktop to fail to get started * correctly). To avoid breaking those older guests that rely on KVM to force * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the * safer (for performance) default behavior. * * On top of this, non-coherent DMA devices need the guest to flush CPU * caches properly. This also requires honoring guest PAT, and is forced * independent of the quirk in vmx_ignore_guest_pat().
*/ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r;
}
staticvoid vmx_cleanup_l1d_flush(void)
{ if (vmx_l1d_flush_pages) {
free_pages((unsignedlong)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
vmx_l1d_flush_pages = NULL;
} /* Restore state so sysfs ignores VMX */
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
/* * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing * to unwind if a later step fails.
*/
hv_init_evmcs();
r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r;
/* * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); if (r) goto err_l1d_flush;
/* * Shadow paging doesn't have a (further) performance penalty * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it * by default
*/ if (!enable_ept)
allow_smaller_maxphyaddr = true;
return 0;
err_l1d_flush:
kvm_x86_vendor_exit(); return r;
}
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.164 Sekunden
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.