// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com>
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions.
*/ staticbool __read_mostly nested = 1;
module_param(nested, bool, 0444);
/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicate if ple enabled.
 *             According to test, this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
#endif
/* If set to auto use the default l1tf mitigation method */ if (l1tf == VMENTER_L1D_FLUSH_AUTO) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER; break; case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT:
l1tf = VMENTER_L1D_FLUSH_COND; break; case L1TF_MITIGATION_FULL: case L1TF_MITIGATION_FULL_FORCE:
l1tf = VMENTER_L1D_FLUSH_ALWAYS; break;
}
} elseif (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { /* * This allocation for vmx_l1d_flush_pages is not tied to a VM * lifetime and so should not be charged to a memcg.
*/
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM;
vmx_l1d_flush_pages = page_address(page);
/* * Initialize each page with a different pattern in * order to protect against KSM in the nested * virtualization case.
*/ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
}
l1tf_vmx_mitigation = l1tf;
if (l1tf != VMENTER_L1D_FLUSH_NEVER)
static_branch_enable(&vmx_l1d_should_flush); else
static_branch_disable(&vmx_l1d_should_flush);
if (l1tf == VMENTER_L1D_FLUSH_COND)
static_branch_enable(&vmx_l1d_flush_cond); else
static_branch_disable(&vmx_l1d_flush_cond); return 0;
}
if (s) { for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { if (vmentry_l1d_param[i].for_parse &&
sysfs_streq(s, vmentry_l1d_param[i].option)) return i;
}
} return -EINVAL;
}
staticint vmentry_l1d_flush_set(constchar *s, conststruct kernel_param *kp)
{ int l1tf, ret;
l1tf = vmentry_l1d_flush_parse(s); if (l1tf < 0) return l1tf;
if (!boot_cpu_has(X86_BUG_L1TF)) return 0;
/* * Has vmx_init() run already? If not then this is the pre init * parameter parsing. In that case just store the value and let * vmx_init() do the proper setup after enable_ept has been * established.
*/ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
vmentry_l1d_flush_param = l1tf; return 0;
}
mutex_lock(&vmx_l1d_flush_mutex);
ret = vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex); return ret;
}
msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
msr |= FB_CLEAR_DIS;
native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */
vmx->msr_ia32_mcu_opt_ctrl = msr;
}
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{ if (!vmx->disable_fb_clear) return;
/*
 * Decide whether VERW's CPU-buffer-clearing behavior should be disabled for
 * this vCPU (via FB_CLEAR_DIS) to avoid unnecessary MSR accesses on
 * VM-Entry/VM-Exit.
 */
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	/*
	 * Disable VERW's behavior of clearing CPU buffers for the guest if the
	 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
	 * the mitigation. Disabling the clearing behavior provides a
	 * performance boost for guests that aren't aware that manually clearing
	 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
	 * and VM-Exit.
	 */
	vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
				(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
				!boot_cpu_has_bug(X86_BUG_MDS) &&
				!boot_cpu_has_bug(X86_BUG_TAA);

	/*
	 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
	 * at VMEntry. Skip the MSR read/write when a guest has no use case to
	 * execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);

/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
/* * Enlightened VMCS usage should be recommended and the host needs * to support eVMCS v1 or above.
*/ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
KVM_EVMCS_VERSION) {
/* Check that we have assist pages on all online CPUs */
for_each_online_cpu(cpu) { if (!hv_get_vp_assist_page(cpu)) {
enlightened_vmcs = false; break;
}
}
if (enlightened_vmcs) {
pr_info("Using Hyper-V Enlightened VMCS\n");
static_branch_enable(&__kvm_is_using_evmcs);
}
/* * KVM should enable eVMCS if and only if all CPUs have a VP assist * page, and should reject CPU onlining if eVMCS is enabled the CPU * doesn't have a VP assist page allocated.
*/
vp_ap = hv_get_vp_assist_page(smp_processor_id()); if (WARN_ON_ONCE(!vp_ap)) return;
/* * Reset everything to support using non-enlightened VMCS access later * (e.g. when we reload the module with enlightened_vmcs=0)
*/
vp_ap->nested_control.features.directhypercall = 0;
vp_ap->current_nested_vmcs = 0;
vp_ap->enlighten_vmentry = 0;
}
/*
 * Look up the guest's user-return MSR slot for @msr.  Returns a pointer into
 * vmx->guest_uret_msrs, or NULL if @msr is not a user-return MSR.
 */
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}
/*
 * Set the guest value of a user-return MSR.  If the MSR is loaded into
 * hardware, write it via the user-return framework (with preemption disabled
 * so the slot/CPU pairing can't change underneath us); cache the value only
 * if the hardware write succeeded.
 */
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}
/* * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) * * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to * atomically track post-VMXON state, e.g. this may be called in NMI context. * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. * faults are guaranteed to be due to the !post-VMXON check unless the CPU is * magically in RM, VM86, compat mode, or at CPL>0.
*/ staticint kvm_cpu_vmxoff(void)
{ asmgoto("1: vmxoff\n\t"
_ASM_EXTABLE(1b, %l[fault])
::: "cc", "memory" : fault);
/*
 * Emergency path (reboot/crash, possibly NMI context): VMCLEAR all VMCSs
 * loaded on this CPU and execute VMXOFF so the new kernel can enable VMX.
 */
void vmx_emergency_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	kvm_rebooting = true;

	/*
	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
	 * set in task context. If this races with VMX is disabled by an NMI,
	 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
	 * kvm_rebooting set.
	 */
	if (!(__read_cr4() & X86_CR4_VMXE))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link) {
		vmcs_clear(v->vmcs);
		if (v->shadow_vmcs)
			vmcs_clear(v->shadow_vmcs);
	}

	kvm_cpu_vmxoff();
}
staticvoid __loaded_vmcs_clear(void *arg)
{ struct loaded_vmcs *loaded_vmcs = arg; int cpu = raw_smp_processor_id();
if (loaded_vmcs->cpu != cpu) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
vmcs_clear(loaded_vmcs->vmcs); if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
vmcs_clear(loaded_vmcs->shadow_vmcs);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/* * Ensure all writes to loaded_vmcs, including deleting it from its * current percpu list, complete before setting loaded_vmcs->cpu to * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first * and add loaded_vmcs to its percpu list before it's deleted from this * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR); /* * #VE isn't used for VMX. To test against unexpected changes * related to #VE for VMX, intercept unexpected #VE and warn on it.
*/ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
eb |= 1u << VE_VECTOR; /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway * as VMware does.
*/ if (enable_vmware_backdoor)
eb |= (1u << GP_VECTOR); if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0; if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass * them to L1. When running L2, we will only handle the exceptions * specified above if L1 did not want them.
*/ if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap; else { int mask = 0, match = 0;
if (enable_ept && (eb & (1u << PF_VECTOR))) { /* * If EPT is enabled, #PF is currently only intercepted * if MAXPHYADDR is smaller on the guest than on the * host. In that case we only care about present, * non-reserved faults. For vmcs02, however, PFEC_MASK * and PFEC_MATCH are set in prepare_vmcs02_rare.
*/
mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
match = PFERR_PRESENT_MASK;
}
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
/* * Disabling xfd interception indicates that dynamic xfeatures * might be used in the guest. Always trap #NM in this case * to save guest xfd_err timely.
*/ if (vcpu->arch.xfd_no_write_intercept)
eb |= (1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/* * Check if MSR is intercepted for currently loaded MSR bitmap.
*/ staticbool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) returntrue;
if (vmx->loaded_vmcs->launched)
flags |= VMX_RUN_VMRESUME;
/* * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl.
*/ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
if (static_branch_unlikely(&cpu_buf_vm_clear) &&
kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_EFER,
VM_EXIT_LOAD_IA32_EFER,
GUEST_IA32_EFER,
HOST_IA32_EFER,
guest_val, host_val); return;
} break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
GUEST_IA32_PERF_GLOBAL_CTRL,
HOST_IA32_PERF_GLOBAL_CTRL,
guest_val, host_val); return;
} break; case MSR_IA32_PEBS_ENABLE: /* PEBS needs a quiescent period after being disabled (to write * a record). Disabling PEBS through VMX MSR swapping doesn't * provide that period, so a CPU could write host's record into * guest's memory.
*/
wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}
i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only)
j = vmx_find_loadstore_msr_slot(&m->host, msr);
/* Shadow paging assumes NX to be available. */ if (!enable_ept)
guest_efer |= EFER_NX;
/* * LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE; #endif
/* * On EPT, we can't emulate NX, so we must switch EFER atomically. * On CPUs that support "load IA32_EFER", always switch EFER * atomically, since it's faster than switching it manually.
*/ if (cpu_has_load_ia32_efer() ||
(enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME; if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, kvm_host.efer, false); else
clear_atomic_switch_msr(vmx, MSR_EFER); returnfalse;
}
i = kvm_find_user_return_msr(MSR_EFER); if (i < 0) returnfalse;
#ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the * VMCS rather than the segment table. KVM uses this helper to figure * out the current bases to poke them into the VMCS before entry.
*/ staticunsignedlong segment_base(u16 selector)
{ struct desc_struct *table; unsignedlong v;
staticinlinebool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{ /* The base must be 128-byte aligned and a legal physical address. */ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) {
rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
/*
 * On VM-Entry with PT in host/guest mode: save host PT state and load the
 * guest's PT MSRs if the guest has tracing enabled.
 */
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		/* Stop host tracing before swapping in the guest's PT MSRs. */
		wrmsrq(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}
/*
 * On VM-Exit with PT in host/guest mode: save the guest's PT state and
 * restore the host's, including RTIT_CTL which is cleared on VM-Exit.
 */
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
/* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA.
*/ if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true; for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (!vmx->guest_uret_msrs[i].load_into_hardware) continue;
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
if (vt->guest_state_loaded) return;
host_state = &vmx->loaded_vmcs->host_state;
/* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1.
*/
host_state->ldt_sel = kvm_read_ldt();
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
/* * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to * this cpu's percpu list, otherwise it may not yet be deleted * from its previous cpu's percpu list. Pairs with the * smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
if (!already_loaded) { void *gdt = get_current_gdt_ro();
/* * Flush all EPTP/VPID contexts, the new pCPU may have stale * TLB entries from its previous association with the vCPU.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
(unsignedlong)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsignedlong)gdt); /* 22.2.4 */
/* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken.
*/ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
/* * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU * is an unrestricted guest in order to mark L2 as needing emulation * if L1 runs L2 as a restricted guest.
*/ if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
vmcs_writel(GUEST_RFLAGS, rflags); return;
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
ret |= KVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
/* * Any MSR write that attempts to change bits marked reserved will * case a #GP fault.
*/ if (data & vmx->pt_desc.ctl_bitmask) return 1;
/* * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will * result in a #GP unless the same write also clears TraceEn.
*/ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
(data & RTIT_CTL_TRACEEN) &&
data != vmx->pt_desc.guest.ctl) return 1;
/* * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit * and FabricEn would cause #GP, if * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
*/ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
!(data & RTIT_CTL_FABRIC_EN) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)) return 1;
/* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that * utilize encodings marked reserved will cause a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
!test_bit((data & RTIT_CTL_MTC_RANGE) >>
RTIT_CTL_MTC_RANGE_OFFSET, &value)) return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cycle_thresholds); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_CYC_THRESH) >>
RTIT_CTL_CYC_THRESH_OFFSET, &value)) return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_PSB_FREQ) >>
RTIT_CTL_PSB_FREQ_OFFSET, &value)) return 1;
/* * If ADDRx_CFG is reserved or the encodings is >2 will * cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) return 1;
return 0;
}
int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len)
{ /* * Emulation of instructions in SGX enclaves is impossible as RIP does * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only).
*/ if (vmx_get_exit_reason(vcpu).enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT;
}
/* Check that emulation is possible during event vectoring */ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
!kvm_can_emulate_event_vectoring(emul_type)) return X86EMUL_UNHANDLEABLE_VECTORING;
/* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on * undefined behavior: Intel's SDM doesn't mandate the VMCS field be * set when EPT misconfig occurs. In practice, real hardware updates * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors * (namely Hyper-V) don't set it due to it being undefined behavior, * i.e. we end up advancing IP with some random value.
*/ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
/* * Emulating an enclave's instructions isn't supported as KVM * cannot access the enclave's memory or its true RIP, e.g. the * vmcs.GUEST_RIP points at the exit point of the enclave, not * the RIP that actually triggered the VM-Exit. But, because * most instructions that cause VM-Exit will #UD in an enclave, * most instruction-based VM-Exits simply do not occur. * * There are a few exceptions, notably the debug instructions * INT1ICEBRK and INT3, as they are allowed in debug enclaves * and generate #DB/#BP as expected, which KVM might intercept. * But again, the CPU does the dirty work and saves an instr * length of zero so VMMs don't shoot themselves in the foot. * WARN if KVM tries to skip a non-zero length instruction on * a VM-Exit from an enclave.
*/ if (!instr_len) goto rip_updated;
WARN_ONCE(exit_reason.enclave_mode, "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit * mode, but just finding out that we are in 64-bit mode is * quite expensive. Only do it if there was a carry.
*/ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
rip = (u32)rip; #endif
kvm_rip_write(vcpu, rip);
} else { if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0;
}
rip_updated: /* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
return 1;
}
/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.
 */
void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!is_guest_mode(vcpu))
		return;

	/*
	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
	 * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
	 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
	 * intercepted #DB deliberately avoids single-step #DB and MTF updates
	 * as ICEBP is higher priority than both.  As instruction emulation is
	 * completed at this point (i.e. KVM is at the instruction boundary),
	 * any #DB exception pending delivery must be a debug-trap of lower
	 * priority than MTF.  Record the pending MTF state to be delivered in
	 * vmx_check_nested_events().
	 */
	if (nested_cpu_has_mtf(vmcs12) &&
	    (!vcpu->arch.exception.pending ||
	     vcpu->arch.exception.vector == DB_VECTOR) &&
	    (!vcpu->arch.exception_vmexit.pending ||
	     vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
		vmx->nested.mtf_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	} else {
		vmx->nested.mtf_pending = false;
	}
}
/*
 * Skip the current instruction, updating nested MTF state first so a pending
 * monitor-trap-flag VM-exit isn't lost across the skip.
 */
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	vmx_update_emulated_instruction(vcpu);
	return skip_emulated_instruction(vcpu);
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/*
	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
	 * explicitly skip the instruction because if the HLT state is set,
	 * then the instruction is already executing and RIP has already been
	 * advanced.
	 */
	if (kvm_hlt_in_guest(vcpu->kvm) &&
	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
if (ex->has_error_code) { /* * Despite the error code being architecturally defined as 32 * bits, and the VMCS field being 32 bits, Intel CPUs and thus * VMX don't actually supporting setting bits 31:16. Hardware * will (should) never provide a bogus error code, but AMD CPUs * do generate error codes with bits 31:16 set, and so KVM's * ABI lets userspace shove in arbitrary 32-bit values. Drop * the upper bits to avoid VM-Fail, losing information that * doesn't really exist is preferable to killing the VM.
*/
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) { int inc_eip = 0; if (kvm_exception_is_soft(ex->vector))
inc_eip = vcpu->arch.event_exit_inst_len;
kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); return;
}
/* * Configuring user return MSRs to automatically save, load, and restore MSRs * that need to be shoved into hardware when running the guest. Note, omitting * an MSR here does _NOT_ mean it's not emulated, only that it will not be * loaded into hardware when running the guest.
*/ staticvoid vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{ #ifdef CONFIG_X86_64 bool load_syscall_msrs;
/* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set.
*/
load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
(vmx->vcpu.arch.efer & EFER_SCE);
/* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new * kernel and old userspace. If those guests run on a tsx=off host, do * allow guests to use TSX_CTRL, but don't change the value in hardware * so that TSX remains always disabled.
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
/* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter.
*/
vmx->guest_uret_msrs_loaded = false;
}
/*
 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
 * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
 * backwards compatibility even though KVM doesn't support emulating SMX.  And
 * because userspace set "VMX in SMX", the guest must also be allowed to set it,
 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
 */
#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED			 | \
					FEAT_CTL_VMX_ENABLED_INSIDE_SMX	 | \
					FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
					FEAT_CTL_SGX_LC_ENABLED		 | \
					FEAT_CTL_SGX_ENABLED		 | \
					FEAT_CTL_LMCE_ENABLED)
/* * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are * exposed to the guest.
*/
WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
~KVM_SUPPORTED_FEATURE_CONTROL);
if (!msr->host_initiated &&
(vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) returnfalse;
if (msr->host_initiated)
valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; else
valid_bits = vmx->msr_ia32_feature_control_valid_bits;
return !(msr->data & ~valid_bits);
}
/*
 * Read a feature MSR (host-initiated, no vCPU context).  Only the emulated
 * VMX capability MSRs are handled here; they are available iff nesting is
 * enabled.
 */
int vmx_get_feature_msr(u32 msr, u64 *data)
{
	switch (msr) {
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!nested)
			return 1;
		return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
	default:
		return KVM_MSR_RET_UNSUPPORTED;
	}
}
/* * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called.
*/ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr;
u32 index;
switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE:
msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE:
msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE:
msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated &&
!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1;
msr_info->data = vmx->msr_ia32_umwait_control; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated &&
!guest_has_spec_ctrl_msr(vcpu)) return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl; break; case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
FEAT_CTL_LMCE_ENABLED)) return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data)) return 1; #ifdef CONFIG_KVM_HYPERV /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V * versions are either trying to use them and fail or do some * sanity checking and refuse to boot. Filter all unsupported * features out.
*/ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data); #endif break; case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) return 1;
msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: if (!vmx_pt_mode_is_host_guest()) return 1;
msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: if (!vmx_pt_mode_is_host_guest() ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering)) return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))) return 1;
msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))) return 1;
msr_info->data = vmx->pt_desc.guest.output_mask; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() ||
(index >= 2 * vmx->pt_desc.num_address_ranges)) return 1; if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmx_guest_debugctl_read(); break; default:
find_uret_msr:
msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) {
msr_info->data = msr->data; break;
} return kvm_get_msr_common(vcpu, msr_info);
}
/* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called.
*/ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
u32 index;
switch (msr_index) { case MSR_EFER:
ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data); break; case MSR_IA32_XFD:
ret = kvm_set_msr_common(vcpu, msr_info); /* * Always intercepting WRMSR could incur non-negligible * overhead given xfd might be changed frequently in * guest context switch. Disable write interception * upon the first write with a non-zero value (indicating * potential usage on dynamic xfeatures). Also update * exception bitmap to trap #NM for proper virtualization * of guest xfd_err.
*/ if (!ret && data) {
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
MSR_TYPE_RW);
vcpu->arch.xfd_no_write_intercept = true;
vmx_update_exception_bitmap(vcpu);
} break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu))
get_vmcs12(vcpu)->guest_sysenter_cs = data;
vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: if (is_guest_mode(vcpu)) {
data = nested_vmx_truncate_sysenter_addr(vcpu, data);
get_vmcs12(vcpu)->guest_sysenter_eip = data;
}
vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: if (is_guest_mode(vcpu)) {
data = nested_vmx_truncate_sysenter_addr(vcpu, data);
get_vmcs12(vcpu)->guest_sysenter_esp = data;
}
vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) return 1;
data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
vmx_guest_debugctl_write(vcpu, data);
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu); return 0; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD)) return 1;
vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1;
/* The reserved bit 1 and non-32 bit [63:32] should be zero */ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) return 1;
vmx->msr_ia32_umwait_control = data; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated &&
!guest_has_spec_ctrl_msr(vcpu)) return 1;
if (kvm_spec_ctrl_test_value(data)) return 1;
vmx->spec_ctrl = data; if (!data) break;
/* * For non-nested: * When it's written (to non-zero) for the first time, pass * it through. * * For nested: * The handling of the MSR bitmap for L2 guests is done in * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.18 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.