From 774911290c589e98e3638e73b24b0a4d4530e97c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 17 Jun 2020 10:36:20 +0200 Subject: [PATCH 01/13] KVM: s390: reduce number of IO pins to 1 The current number of KVM_IRQCHIP_NUM_PINS results in an order 3 allocation (32kb) for each guest start/restart. This can result in OOM killer activity even with free swap when the memory is fragmented enough: kernel: qemu-system-s39 invoked oom-killer: gfp_mask=0x440dc0(GFP_KERNEL_ACCOUNT|__GFP_COMP|__GFP_ZERO), order=3, oom_score_adj=0 kernel: CPU: 1 PID: 357274 Comm: qemu-system-s39 Kdump: loaded Not tainted 5.4.0-29-generic #33-Ubuntu kernel: Hardware name: IBM 8562 T02 Z06 (LPAR) kernel: Call Trace: kernel: ([<00000001f848fe2a>] show_stack+0x7a/0xc0) kernel: [<00000001f8d3437a>] dump_stack+0x8a/0xc0 kernel: [<00000001f8687032>] dump_header+0x62/0x258 kernel: [<00000001f8686122>] oom_kill_process+0x172/0x180 kernel: [<00000001f8686abe>] out_of_memory+0xee/0x580 kernel: [<00000001f86e66b8>] __alloc_pages_slowpath+0xd18/0xe90 kernel: [<00000001f86e6ad4>] __alloc_pages_nodemask+0x2a4/0x320 kernel: [<00000001f86b1ab4>] kmalloc_order+0x34/0xb0 kernel: [<00000001f86b1b62>] kmalloc_order_trace+0x32/0xe0 kernel: [<00000001f84bb806>] kvm_set_irq_routing+0xa6/0x2e0 kernel: [<00000001f84c99a4>] kvm_arch_vm_ioctl+0x544/0x9e0 kernel: [<00000001f84b8936>] kvm_vm_ioctl+0x396/0x760 kernel: [<00000001f875df66>] do_vfs_ioctl+0x376/0x690 kernel: [<00000001f875e304>] ksys_ioctl+0x84/0xb0 kernel: [<00000001f875e39a>] __s390x_sys_ioctl+0x2a/0x40 kernel: [<00000001f8d55424>] system_call+0xd8/0x2c8 As far as I can tell s390x does not use the iopins as we bail our for anything other than KVM_IRQ_ROUTING_S390_ADAPTER and the chip/pin is only used for KVM_IRQ_ROUTING_IRQCHIP. So let us use a small number to reduce the memory footprint. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20200617083620.5409-1-borntraeger@de.ibm.com --- arch/s390/include/asm/kvm_host.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index cee3cb6455a2..6ea0820e7c7f 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -31,12 +31,12 @@ #define KVM_USER_MEM_SLOTS 32 /* - * These seem to be used for allocating ->chip in the routing table, - * which we don't use. 4096 is an out-of-thin-air value. If we need - * to look at ->chip later on, we'll need to revisit this. + * These seem to be used for allocating ->chip in the routing table, which we + * don't use. 1 is as small as we can get to reduce the needed memory. If we + * need to look at ->chip later on, we'll need to revisit this. */ #define KVM_NR_IRQCHIPS 1 -#define KVM_IRQCHIP_NUM_PINS 4096 +#define KVM_IRQCHIP_NUM_PINS 1 #define KVM_HALT_POLL_NS_DEFAULT 50000 /* s390-specific vcpu->requests bit members */ From 7733306bd593c737c63110175da6c35b4b8bb32c Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 18 Jun 2020 18:12:54 +0100 Subject: [PATCH 02/13] KVM: arm64: Annotate hyp NMI-related functions as __always_inline The "inline" keyword is a hint for the compiler to inline a function. The functions system_uses_irq_prio_masking() and gic_write_pmr() are used by the code running at EL2 on a non-VHE system, so mark them as __always_inline to make sure they'll always be part of the .hyp.text section. This fixes the following splat when trying to run a VM: [ 47.625273] Kernel panic - not syncing: HYP panic: [ 47.625273] PS:a00003c9 PC:0000ca0b42049fc4 ESR:86000006 [ 47.625273] FAR:0000ca0b42049fc4 HPFAR:0000000010001000 PAR:0000000000000000 [ 47.625273] VCPU:0000000000000000 [ 47.647261] CPU: 1 PID: 217 Comm: kvm-vcpu-0 Not tainted 5.8.0-rc1-ARCH+ #61 [ 47.654508] Hardware name: Globalscale Marvell ESPRESSOBin Board (DT) [ 47.661139] Call trace: [ 47.663659] dump_backtrace+0x0/0x1cc [ 47.667413] show_stack+0x18/0x24 [ 47.670822] dump_stack+0xb8/0x108 [ 47.674312] panic+0x124/0x2f4 [ 47.677446] panic+0x0/0x2f4 [ 47.680407] SMP: stopping secondary CPUs [ 47.684439] Kernel Offset: disabled [ 47.688018] CPU features: 0x240402,20002008 [ 47.692318] Memory Limit: none [ 47.695465] ---[ end Kernel panic - not syncing: HYP panic: [ 47.695465] PS:a00003c9 PC:0000ca0b42049fc4 ESR:86000006 [ 47.695465] FAR:0000ca0b42049fc4 HPFAR:0000000010001000 PAR:0000000000000000 [ 47.695465] VCPU:0000000000000000 ]--- The instruction abort was caused by the code running at EL2 trying to fetch an instruction which wasn't mapped in the EL2 translation tables. Using objdump showed the two functions as separate symbols in the .text section. Fixes: 85738e05dc38 ("arm64: kvm: Unmask PMR before entering guest") Cc: stable@vger.kernel.org Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Acked-by: James Morse Link: https://lore.kernel.org/r/20200618171254.1596055-1-alexandru.elisei@arm.com --- arch/arm64/include/asm/arch_gicv3.h | 2 +- arch/arm64/include/asm/cpufeature.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index a358e97572c1..6647ae4f0231 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -109,7 +109,7 @@ static inline u32 gic_read_pmr(void) return read_sysreg_s(SYS_ICC_PMR_EL1); } -static inline void gic_write_pmr(u32 val) +static __always_inline void gic_write_pmr(u32 val) { write_sysreg_s(val, SYS_ICC_PMR_EL1); } diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 5d1f4ae42799..f7c3d1ff091d 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -675,7 +675,7 @@ static inline bool system_supports_generic_auth(void) cpus_have_const_cap(ARM64_HAS_GENERIC_AUTH); } -static inline bool system_uses_irq_prio_masking(void) +static __always_inline bool system_uses_irq_prio_masking(void) { return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && cpus_have_const_cap(ARM64_HAS_IRQ_PRIO_MASKING); From 66b7e05dc0239c5817859f261098ba9cc2efbd2b Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 17 Jun 2020 11:54:56 +0100 Subject: [PATCH 03/13] KVM: arm64: Fix kvm_reset_vcpu() return code being incorrect with SVE If SVE is enabled then 'ret' can be assigned the return value of kvm_vcpu_enable_sve() which may be 0 causing future "goto out" sites to erroneously return 0 on failure rather than -EINVAL as expected. Remove the initialisation of 'ret' and make setting the return value explicit to avoid this situation in the future. Fixes: 9a3cdf26e336 ("KVM: arm64/sve: Allow userspace to enable SVE for vcpus") Cc: stable@vger.kernel.org Reported-by: James Morse Signed-off-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200617105456.28245-1-steven.price@arm.com --- arch/arm64/kvm/reset.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index d3b209023727..6ed36be51b4b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -245,7 +245,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { - int ret = -EINVAL; + int ret; bool loaded; u32 pstate; @@ -269,15 +269,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) { - if (kvm_vcpu_enable_ptrauth(vcpu)) + if (kvm_vcpu_enable_ptrauth(vcpu)) { + ret = -EINVAL; goto out; + } } switch (vcpu->arch.target) { default: if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) { + ret = -EINVAL; goto out; + } pstate = VCPU_RESET_PSTATE_SVC; } else { pstate = VCPU_RESET_PSTATE_EL1; From a25e91028ac2f544e0140aff2c9360a0e995dd86 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 22 Jun 2020 16:27:10 +0200 Subject: [PATCH 04/13] KVM: arm64: pvtime: Ensure task delay accounting is enabled Ensure we're actually accounting run_delay before we claim that we'll expose it to the guest. If we're not, then we just pretend like steal time isn't supported in order to avoid any confusion. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200622142710.18677-1-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 1e0f4c284888..f7b52ce1557e 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -73,6 +74,11 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) return base; } +static bool kvm_arm_pvtime_supported(void) +{ + return !!sched_info_on(); +} + int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { @@ -82,7 +88,8 @@ int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, int ret = 0; int idx; - if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + if (!kvm_arm_pvtime_supported() || + attr->attr != KVM_ARM_VCPU_PVTIME_IPA) return -ENXIO; if (get_user(ipa, user)) @@ -110,7 +117,8 @@ int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, u64 __user *user = (u64 __user *)attr->addr; u64 ipa; - if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA) + if (!kvm_arm_pvtime_supported() || + attr->attr != KVM_ARM_VCPU_PVTIME_IPA) return -ENXIO; ipa = vcpu->arch.steal.base; @@ -125,7 +133,8 @@ int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, { switch (attr->attr) { case KVM_ARM_VCPU_PVTIME_IPA: - return 0; + if (kvm_arm_pvtime_supported()) + return 0; } return -ENXIO; } From a3f574cd65487cd993f79ab235d70229d9302c1e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 23 Jun 2020 10:44:08 +0100 Subject: [PATCH 05/13] KVM: arm64: vgic-v4: Plug race between non-residency and v4.1 doorbell When making a vPE non-resident because it has hit a blocking WFI, the doorbell can fire at any time after the write to the RD. Crucially, it can fire right between the write to GICR_VPENDBASER and the write to the pending_last field in the its_vpe structure. This means that we would overwrite pending_last with stale data, and potentially not wakeup until some unrelated event (such as a timer interrupt) puts the vPE back on the CPU. GICv4 isn't affected by this as we actively mask the doorbell on entering the guest, while GICv4.1 automatically manages doorbell delivery without any hypervisor-driven masking. Use the vpe_lock to synchronize such update, which solves the problem altogether. Fixes: ae699ad348cdc ("irqchip/gic-v4.1: Move doorbell management to the GICv4 abstraction layer") Reported-by: Zenghui Yu Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v4.c | 8 ++++++++ drivers/irqchip/irq-gic-v3-its.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 27ac833e5ec7..b5fa73c9fd35 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -90,7 +90,15 @@ static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) !irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) disable_irq_nosync(irq); + /* + * The v4.1 doorbell can fire concurrently with the vPE being + * made non-resident. Ensure we only update pending_last + * *after* the non-residency sequence has completed. + */ + raw_spin_lock(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vpe_lock); vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; + raw_spin_unlock(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vpe_lock); + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); kvm_vcpu_kick(vcpu); diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index cd685f521c77..205f69592471 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4054,16 +4054,24 @@ static void its_vpe_4_1_deschedule(struct its_vpe *vpe, u64 val; if (info->req_db) { + unsigned long flags; + /* * vPE is going to block: make the vPE non-resident with * PendingLast clear and DB set. The GIC guarantees that if * we read-back PendingLast clear, then a doorbell will be * delivered when an interrupt comes. + * + * Note the locking to deal with the concurrent update of + * pending_last from the doorbell interrupt handler that can + * run concurrently. */ + raw_spin_lock_irqsave(&vpe->vpe_lock, flags); val = its_clear_vpend_valid(vlpi_base, GICR_VPENDBASER_PendingLast, GICR_VPENDBASER_4_1_DB); vpe->pending_last = !!(val & GICR_VPENDBASER_PendingLast); + raw_spin_unlock_irqrestore(&vpe->vpe_lock, flags); } else { /* * We're not blocking, so just make the vPE non-resident From 9d3c447c72fb2337ca39f245c6ae89f2369de216 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 29 Jun 2020 18:26:31 +0800 Subject: [PATCH 06/13] KVM: X86: Fix async pf caused null-ptr-deref Syzbot reported that: CPU: 1 PID: 6780 Comm: syz-executor153 Not tainted 5.7.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__apic_accept_irq+0x46/0xb80 Call Trace: kvm_arch_async_page_present+0x7de/0x9e0 kvm_check_async_pf_completion+0x18d/0x400 kvm_arch_vcpu_ioctl_run+0x18bf/0x69f0 kvm_vcpu_ioctl+0x46a/0xe20 ksys_ioctl+0x11a/0x180 __x64_sys_ioctl+0x6f/0xb0 do_syscall_64+0xf6/0x7d0 entry_SYSCALL_64_after_hwframe+0x49/0xb3 The testcase enables APF mechanism in MSR_KVM_ASYNC_PF_EN with ASYNC_PF_INT enabled w/o setting MSR_KVM_ASYNC_PF_INT before, what's worse, interrupt based APF 'page ready' event delivery depends on in kernel lapic, however, we didn't bail out when lapic is not in kernel during guest setting MSR_KVM_ASYNC_PF_EN which causes the null-ptr-deref in host later. This patch fixes it. Reported-by: syzbot+1bf777dfdde86d64b89b@syzkaller.appspotmail.com Fixes: 2635b5c4a0 (KVM: x86: interrupt based APF 'page ready' event delivery) Signed-off-by: Wanpeng Li Message-Id: <1593426391-8231-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b92db412335..a026d926072c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2693,6 +2693,9 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!lapic_in_kernel(vcpu)) + return 1; + vcpu->arch.apf.msr_en_val = data; if (!kvm_pv_async_pf_enabled(vcpu)) { From 5ecad245de2ae23dc4e2dbece92f8ccfbaed2fa7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Jun 2020 07:07:20 -0400 Subject: [PATCH 07/13] KVM: x86: bit 8 of non-leaf PDPEs is not reserved Bit 8 would be the "global" bit, which does not quite make sense for non-leaf page table entries. Intel ignores it; AMD ignores it in PDEs and PDPEs, but reserves it in PML4Es. Probably, earlier versions of the AMD manual documented it as reserved in PDPEs as well, and that behavior made it into KVM as well as kvm-unit-tests; fix it. Cc: stable@vger.kernel.org Reported-by: Nadav Amit Fixes: a0c0feb57992 ("KVM: x86: reserve bit 8 of non-leaf PDPEs and PML4Es in 64-bit mode on AMD", 2014-09-03) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 76817d13c86e..6d6a0ae7800c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4449,7 +4449,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); From 1393b4aaf9e1e803d59726053d542cebd4e2b5b2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Jul 2020 05:39:31 -0400 Subject: [PATCH 08/13] kvm: use more precise cast and do not drop __user Sparse complains on a call to get_compat_sigset, fix it. The "if" right above explains that sigmask_arg->sigset is basically a compat_sigset_t. Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a852af5c3214..0a68c9d3d3ab 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3350,7 +3350,8 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, if (kvm_sigmask.len != sizeof(compat_sigset_t)) goto out; r = -EFAULT; - if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) + if (get_compat_sigset(&sigset, + (compat_sigset_t __user *)sigmask_arg->sigset)) goto out; r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); } else From d74fcfc1f0ff4b6c26ecef1f9e48d8089ab4eaac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:17:14 -0700 Subject: [PATCH 09/13] KVM: x86: Inject #GP if guest attempts to toggle CR4.LA57 in 64-bit mode Inject a #GP on MOV CR4 if CR4.LA57 is toggled in 64-bit mode, which is illegal per Intel's SDM: CR4.LA57 57-bit linear addresses (bit 12 of CR4) ... blah blah blah ... This bit cannot be modified in IA-32e mode. Note, the pseudocode for MOV CR doesn't call out the fault condition, which is likely why the check was missed during initial development. This is arguably an SDM bug and will hopefully be fixed in future release of the SDM. Fixes: fd8cb433734ee ("KVM: MMU: Expose the LA57 feature to VM.") Cc: stable@vger.kernel.org Reported-by: Sebastien Boeuf Signed-off-by: Sean Christopherson Message-Id: <20200703021714.5549-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a026d926072c..88c593f83b28 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,6 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, From 7c83d096aed055a7763a03384f92115363448b71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 21:04:21 -0700 Subject: [PATCH 10/13] KVM: x86: Mark CR4.TSD as being possibly owned by the guest Mark CR4.TSD as being possibly owned by the guest as that is indeed the case on VMX. Without TSD being tagged as possibly owned by the guest, a targeted read of CR4 to get TSD could observe a stale value. This bug is benign in the current code base as the sole consumer of TSD is the emulator (for RDTSC) and the emulator always "reads" the entirety of CR4 when grabbing bits. Add a build-time assertion in to ensure VMX doesn't hand over more CR4 bits without also updating x86. Fixes: 52ce3c21aec3 ("x86,kvm,vmx: Don't trap writes to CR4.TSD") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20200703040422.31536-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 2 +- arch/x86/kvm/vmx/vmx.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index ff2d0e9ca3bc..cfe83d4ae625 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb22f33bf1d8..5c9bfc0b9ab9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4034,6 +4034,8 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { + BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; From fa71e9527f6a0153ae6a880031b902818af1bdaf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 21:04:22 -0700 Subject: [PATCH 11/13] KVM: VMX: Use KVM_POSSIBLE_CR*_GUEST_BITS to initialize guest/host masks Use the "common" KVM_POSSIBLE_CR*_GUEST_BITS defines to initialize the CR0/CR4 guest host masks instead of duplicating most of the CR4 mask and open coding the CR0 mask. SVM doesn't utilize the masks, i.e. the masks are effectively VMX specific even if they're not named as such. This avoids duplicate code, better documents the guest owned CR0 bit, and eliminates the need for a build-time assertion to keep VMX and x86 synchronized. Signed-off-by: Sean Christopherson Message-Id: <20200703040422.31536-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 15 +++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d1af20b050a8..b26655104d4a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4109,7 +4109,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). */ @@ -4259,7 +4259,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c9bfc0b9ab9..13745f2a5ecd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -133,9 +133,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -4034,11 +4031,9 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS); - - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + if (!enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) vmx->vcpu.arch.cr4_guest_owned_bits &= ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; @@ -4335,8 +4330,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); - vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; - vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); From 146f76cc84b787c4eec6ed73ebeec708a06e4ae4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 4 Jul 2020 13:30:55 +0100 Subject: [PATCH 12/13] KVM: arm64: PMU: Fix per-CPU access in preemptible context Commit 07da1ffaa137 ("KVM: arm64: Remove host_cpu_context member from vcpu structure") has, by removing the host CPU context pointer, exposed that kvm_vcpu_pmu_restore_guest is called in preemptible contexts: [ 266.932442] BUG: using smp_processor_id() in preemptible [00000000] code: qemu-system-aar/779 [ 266.939721] caller is debug_smp_processor_id+0x20/0x30 [ 266.944157] CPU: 2 PID: 779 Comm: qemu-system-aar Tainted: G E 5.8.0-rc3-00015-g8d4aa58b2fe3 #1374 [ 266.954268] Hardware name: amlogic w400/w400, BIOS 2020.04 05/22/2020 [ 266.960640] Call trace: [ 266.963064] dump_backtrace+0x0/0x1e0 [ 266.966679] show_stack+0x20/0x30 [ 266.969959] dump_stack+0xe4/0x154 [ 266.973338] check_preemption_disabled+0xf8/0x108 [ 266.977978] debug_smp_processor_id+0x20/0x30 [ 266.982307] kvm_vcpu_pmu_restore_guest+0x2c/0x68 [ 266.986949] access_pmcr+0xf8/0x128 [ 266.990399] perform_access+0x8c/0x250 [ 266.994108] kvm_handle_sys_reg+0x10c/0x2f8 [ 266.998247] handle_exit+0x78/0x200 [ 267.001697] kvm_arch_vcpu_ioctl_run+0x2ac/0xab8 Note that the bug was always there, it is only the switch to using percpu accessors that made it obvious. The fix is to wrap these accesses in a preempt-disabled section, so that we sample a coherent context on trap from the guest. Fixes: 435e53fb5e21 ("arm64: KVM: Enable VHE support for :G/:H perf event modifiers") Cc:: Andrew Murray Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index b5ae3a5d509e..3c224162b3dd 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -159,7 +159,10 @@ static void kvm_vcpu_pmu_disable_el0(unsigned long events) } /* - * On VHE ensure that only guest events have EL0 counting enabled + * On VHE ensure that only guest events have EL0 counting enabled. + * This is called from both vcpu_{load,put} and the sysreg handling. + * Since the latter is preemptible, special care must be taken to + * disable preemption. */ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) { @@ -169,12 +172,14 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) if (!has_vhe()) return; + preempt_disable(); host = this_cpu_ptr(&kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; kvm_vcpu_pmu_enable_el0(events_guest); kvm_vcpu_pmu_disable_el0(events_host); + preempt_enable(); } /* From b9e10d4a6c9f5cbe6369ce2c17ebc67d2e5a4be5 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Mon, 6 Jul 2020 10:52:59 +0100 Subject: [PATCH 13/13] KVM: arm64: Stop clobbering x0 for HVC_SOFT_RESTART HVC_SOFT_RESTART is given values for x0-2 that it should installed before exiting to the new address so should not set x0 to stub HVC success or failure code. Fixes: af42f20480bf1 ("arm64: hyp-stub: Zero x0 on successful stub handling") Cc: stable@vger.kernel.org Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200706095259.1338221-1-ascull@google.com --- arch/arm64/kvm/hyp-init.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6e6ed5581eed..e76c0e89d48e 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -136,11 +136,15 @@ SYM_CODE_START(__kvm_handle_stub_hvc) 1: cmp x0, #HVC_RESET_VECTORS b.ne 1f -reset: + /* - * Reset kvm back to the hyp stub. Do not clobber x0-x4 in - * case we coming via HVC_SOFT_RESTART. + * Set the HVC_RESET_VECTORS return code before entering the common + * path so that we do not clobber x0-x2 in case we are coming via + * HVC_SOFT_RESTART. */ + mov x0, xzr +reset: + /* Reset kvm back to the hyp stub. */ mrs x5, sctlr_el2 mov_q x6, SCTLR_ELx_FLAGS bic x5, x5, x6 // Clear SCTL_M and etc @@ -151,7 +155,6 @@ reset: /* Install stub vectors */ adr_l x5, __hyp_stub_vectors msr vbar_el2, x5 - mov x0, xzr eret 1: /* Bad stub call */