From c78af20374a1c9c230cc535857d2af3de5d4442c Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:26 -0700 Subject: [PATCH 01/23] KVM: SEV: Drop GHCB_VERSION_DEFAULT and open code it Remove the GHCB_VERSION_DEFAULT macro and open code it with '2'. The macro is used conditionally and is not a true default. The KVM ABI does not advertise/enumerate the default GHCB version. Any future change to this macro would silently alter the ABI and potentially break existing deployments that rely on the current behavior. Additionally, move the GHCB version assignment earlier in the code flow and update the comment to clarify that KVM_SEV_INIT2 defaults to version 2, while KVM_SEV_INIT forces version 1. No functional change intended. Cc: Thomas Lendacky Cc: Michael Roth Suggested-by: Sean Christopherson Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 2fbdebf79fbb..212f790eedd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -37,7 +37,6 @@ #include "trace.h" #define GHCB_VERSION_MAX 2ULL -#define GHCB_VERSION_DEFAULT 2ULL #define GHCB_VERSION_MIN 1ULL #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) @@ -421,6 +420,14 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) return -EINVAL; + /* + * KVM supports the full range of mandatory features defined by version + * 2 of the GHCB protocol, so default to that for SEV-ES guests created + * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). + */ + if (es_active && !data->ghcb_version) + data->ghcb_version = 2; + if (unlikely(sev->active)) return -EINVAL; @@ -429,14 +436,6 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, sev->vmsa_features = data->vmsa_features; sev->ghcb_version = data->ghcb_version; - /* - * Currently KVM supports the full range of mandatory features defined - * by version 2 of the GHCB protocol, so default to that for SEV-ES - * guests created via KVM_SEV_INIT2. - */ - if (sev->es_active && !sev->ghcb_version) - sev->ghcb_version = GHCB_VERSION_DEFAULT; - if (vm_type == KVM_X86_SNP_VM) sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; From 00f0b959ffb094ea677ca24a0bd14d300a3013a0 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:27 -0700 Subject: [PATCH 02/23] KVM: SEV: Enforce minimum GHCB version requirement for SEV-SNP guests Require a minimum GHCB version of 2 when starting SEV-SNP guests through KVM_SEV_INIT2. When a VMM attempts to start an SEV-SNP guest with an incompatible GHCB version (less than 2), reject the request early rather than allowing the guest kernel to start with an incorrect protocol version and fail later with GHCB_SNP_UNSUPPORTED guest termination.
Not enforcing the minimum version typically causes the guest to request termination with GHCB_SNP_UNSUPPORTED error code: kvm_amd: SEV-ES guest requested termination: 0x0:0x2 Fixes: 4af663c2f64a ("KVM: SEV: Allow per-guest configuration of GHCB protocol version") Cc: Thomas Lendacky Cc: Sean Christopherson Cc: Michael Roth Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 212f790eedd4..e88dce598785 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -405,6 +405,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; bool es_active = vm_type != KVM_X86_SEV_VM; + bool snp_active = vm_type == KVM_X86_SNP_VM; u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; int ret; @@ -428,6 +429,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (es_active && !data->ghcb_version) data->ghcb_version = 2; + if (snp_active && data->ghcb_version < 2) + return -EINVAL; + if (unlikely(sev->active)) return -EINVAL; @@ -436,7 +440,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, sev->vmsa_features = data->vmsa_features; sev->ghcb_version = data->ghcb_version; - if (vm_type == KVM_X86_SNP_VM) + if (snp_active) sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; ret = sev_asid_new(sev); @@ -454,7 +458,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, } /* This needs to happen after SEV/SNP firmware initialization. */ - if (vm_type == KVM_X86_SNP_VM) { + if (snp_active) { ret = snp_guest_req_init(kvm); if (ret) goto e_free; From 7b59c73fd611eae87e60e8bb0ab83a1475d8a3a7 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:28 -0700 Subject: [PATCH 03/23] x86/cpufeatures: Add SNP Secure TSC The Secure TSC feature for SEV-SNP allows guests to securely use the RDTSC and RDTSCP instructions, ensuring that the parameters used cannot be altered by the hypervisor once the guest is launched. For more details, refer to the AMD64 APM Vol 2, Section "Secure TSC". 
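For orientation, cpufeature word 19 mirrors CPUID leaf 0x8000001F EAX, so the definition below lands on bit 8 of that leaf, alongside the existing SEV, SEV-ES, and SEV-SNP bits. A minimal sketch of how such a bit is typically consumed (the pr_info() message is illustrative, not from this series):

	/* Word 19 entries are populated from CPUID 0x8000001F EAX at boot. */
	if (cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC))
		pr_info("SNP Secure TSC is supported\n");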
Acked-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Vaishali Thakkar Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 06fc0479a23f..f53d4943ea63 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -444,6 +444,7 @@ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_SNP_SECURE_TSC (19*32+ 8) /* SEV-SNP Secure TSC */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ From 34bd82aab15b8b9e8e9d923267283c84aa8c2789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:29 -0700 Subject: [PATCH 04/23] KVM: SVM: Move SEV-ES VMSA allocation to a dedicated sev_vcpu_create() helper Add a dedicated sev_vcpu_create() helper to allocate the VMSA page for SEV-ES+ vCPUs, and to allow for consolidating a variety of related SEV+ code in the near future. No functional change intended. Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 20 ++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 25 +++++++------------------ arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e88dce598785..c17cc4eb0fe1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4561,6 +4561,26 @@ void sev_init_vmcb(struct vcpu_svm *svm) sev_es_init_vmcb(svm); } +int sev_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct page *vmsa_page; + + if (!sev_es_guest(vcpu->kvm)) + return 0; + + /* + * SEV-ES guests require a separate (from the VMCB) VMSA page used to + * contain the encrypted register state of the guest. + */ + vmsa_page = snp_safe_alloc_page(); + if (!vmsa_page) + return -ENOMEM; + + svm->sev_es.vmsa = page_address(vmsa_page); + return 0; +} + void sev_es_vcpu_reset(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..3d4c14e0244f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1275,7 +1275,6 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; - struct page *vmsa_page = NULL; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1286,24 +1285,18 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) if (!vmcb01_page) goto out; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests require a separate VMSA page used to contain - * the encrypted register state of the guest. 
- */ - vmsa_page = snp_safe_alloc_page(); - if (!vmsa_page) - goto error_free_vmcb_page; - } + err = sev_vcpu_create(vcpu); + if (err) + goto error_free_vmcb_page; err = avic_init_vcpu(svm); if (err) - goto error_free_vmsa_page; + goto error_free_sev; svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; - goto error_free_vmsa_page; + goto error_free_sev; } svm->x2avic_msrs_intercepted = true; @@ -1312,16 +1305,12 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); - if (vmsa_page) - svm->sev_es.vmsa = page_address(vmsa_page); - svm->guest_state_loaded = false; return 0; -error_free_vmsa_page: - if (vmsa_page) - __free_page(vmsa_page); +error_free_sev: + sev_free_vcpu(vcpu); error_free_vmcb_page: __free_page(vmcb01_page); out: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 58b9d168e0c8..cf2569b5451a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -854,6 +854,7 @@ static inline struct page *snp_safe_alloc_page(void) return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } +int sev_vcpu_create(struct kvm_vcpu *vcpu); void sev_free_vcpu(struct kvm_vcpu *vcpu); void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); @@ -880,6 +881,7 @@ static inline struct page *snp_safe_alloc_page(void) return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); } +static inline int sev_vcpu_create(struct kvm_vcpu *vcpu) { return 0; } static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} static inline void sev_vm_destroy(struct kvm *kvm) {} static inline void __init sev_set_cpu_caps(void) {} From 3d4e882e3439593b406fa226bcdd48c92d2222a6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:30 -0700 Subject: [PATCH 05/23] KVM: SEV: Move init of SNP guest state into sev_init_vmcb() Move the initialization of SNP guest state from svm_vcpu_reset() into sev_init_vmcb() to reduce the number of paths that deal with INIT/RESET for SEV+ vCPUs from 4+ to 1. Plumb in @init_event as necessary. Opportunistically check for an SNP guest outside of sev_snp_init_protected_guest_state() so that sev_init_vmcb() is consistent with respect to checking for SEV-ES+ and SNP+ guests. No functional change intended. Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 16 +++++++++------- arch/x86/kvm/svm/svm.c | 9 +++------ arch/x86/kvm/svm/svm.h | 4 +--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c17cc4eb0fe1..c5726b091680 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1975,7 +1975,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { dst_svm = to_svm(dst_vcpu); - sev_init_vmcb(dst_svm); + sev_init_vmcb(dst_svm, false); if (!dst->es_active) continue; @@ -3887,7 +3887,7 @@ next_range: /* * Invoked as part of svm_vcpu_reset() processing of an init event. 
*/ -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_memory_slot *slot; @@ -3895,9 +3895,6 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) kvm_pfn_t pfn; gfn_t gfn; - if (!sev_snp_guest(vcpu->kvm)) - return; - guard(mutex)(&svm->sev_es.snp_vmsa_mutex); if (!svm->sev_es.snp_ap_waiting_for_reset) @@ -4546,8 +4543,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_XSETBV); } -void sev_init_vmcb(struct vcpu_svm *svm) +void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) { + struct kvm_vcpu *vcpu = &svm->vcpu; + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); @@ -4557,7 +4556,10 @@ void sev_init_vmcb(struct vcpu_svm *svm) */ clr_exception_intercept(svm, GP_VECTOR); - if (sev_es_guest(svm->vcpu.kvm)) + if (init_event && sev_snp_guest(vcpu->kvm)) + sev_snp_init_protected_guest_state(vcpu); + + if (sev_es_guest(vcpu->kvm)) sev_es_init_vmcb(svm); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3d4c14e0244f..8ed135dbd649 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1083,7 +1083,7 @@ static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) svm_recalc_msr_intercepts(vcpu); } -static void init_vmcb(struct kvm_vcpu *vcpu) +static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -1221,7 +1221,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_BUSLOCK); if (sev_guest(vcpu->kvm)) - sev_init_vmcb(svm); + sev_init_vmcb(svm, init_event); svm_hv_init_vmcb(vmcb); @@ -1256,10 +1256,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (init_event) - sev_snp_init_protected_guest_state(vcpu); - - init_vmcb(vcpu); + init_vmcb(vcpu, init_event); if (!init_event) __svm_vcpu_reset(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index cf2569b5451a..321480ebe62f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -826,7 +826,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ int pre_sev_run(struct vcpu_svm *svm, int cpu); -void sev_init_vmcb(struct vcpu_svm *svm); +void sev_init_vmcb(struct vcpu_svm *svm, bool init_event); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_vcpu_reset(struct vcpu_svm *svm); @@ -864,7 +864,6 @@ int sev_cpu_init(struct svm_cpu_data *sd); int sev_dev_get_attr(u32 group, u64 attr, u64 *val); extern unsigned int max_sev_asid; void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); @@ -891,7 +890,6 @@ static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } #define max_sev_asid 0 static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} -static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} static inline int 
sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { return 0; From baf6ed177290db5873f318626970fa7fd4060579 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:31 -0700 Subject: [PATCH 06/23] KVM: SEV: Set RESET GHCB MSR value during sev_es_init_vmcb() Set the RESET value for the GHCB "MSR" during sev_es_init_vmcb() instead of sev_es_vcpu_reset() to allow for dropping sev_es_vcpu_reset() entirely. Note, the call to sev_init_vmcb() from sev_migrate_from() also kinda sorta emulates a RESET, but sev_migrate_from() immediately overwrites ghcb_gpa with the source's current value, so whether or not stuffing the GHCB version is correct/desirable is moot. No functional change intended. Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c5726b091680..ee7a05843548 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4480,7 +4480,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } -static void sev_es_init_vmcb(struct vcpu_svm *svm) +static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) { struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; @@ -4541,6 +4541,15 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); + + /* + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. + */ + if (!init_event) + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, + GHCB_VERSION_MIN, + sev_enc_bit)); } void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) @@ -4560,7 +4569,7 @@ void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) sev_snp_init_protected_guest_state(vcpu); if (sev_es_guest(vcpu->kvm)) - sev_es_init_vmcb(svm); + sev_es_init_vmcb(svm, init_event); } int sev_vcpu_create(struct kvm_vcpu *vcpu) @@ -4585,17 +4594,6 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) void sev_es_vcpu_reset(struct vcpu_svm *svm) { - struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); - - /* - * Set the GHCB MSR value as per the GHCB specification when emulating - * vCPU RESET for an SEV-ES guest. - */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, - GHCB_VERSION_MIN, - sev_enc_bit)); - mutex_init(&svm->sev_es.snp_vmsa_mutex); } From f7b1f0c1620db689e085569035a659340a3209c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Aug 2025 16:48:32 -0700 Subject: [PATCH 07/23] KVM: SEV: Fold sev_es_vcpu_reset() into sev_vcpu_create() Fold the remaining line of sev_es_vcpu_reset() into sev_vcpu_create() as there's no need for a dedicated RESET hook just to init a mutex, and the mutex should be initialized as early as possible anyways. No functional change intended. 
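For reference, piecing together the hunks from patches 04 and 07, sev_vcpu_create() at this point in the series reads roughly as follows (reconstructed from the diffs, not quoted from the tree):

	int sev_vcpu_create(struct kvm_vcpu *vcpu)
	{
		struct vcpu_svm *svm = to_svm(vcpu);
		struct page *vmsa_page;

		/* Init the mutex for all vCPUs, as early as possible. */
		mutex_init(&svm->sev_es.snp_vmsa_mutex);

		if (!sev_es_guest(vcpu->kvm))
			return 0;

		/*
		 * SEV-ES guests require a separate (from the VMCB) VMSA page
		 * used to contain the encrypted register state of the guest.
		 */
		vmsa_page = snp_safe_alloc_page();
		if (!vmsa_page)
			return -ENOMEM;

		svm->sev_es.vmsa = page_address(vmsa_page);
		return 0;
	}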
Reviewed-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250819234833.3080255-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 7 ++----- arch/x86/kvm/svm/svm.c | 3 --- arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ee7a05843548..7d1d34e45310 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4577,6 +4577,8 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct page *vmsa_page; + mutex_init(&svm->sev_es.snp_vmsa_mutex); + if (!sev_es_guest(vcpu->kvm)) return 0; @@ -4592,11 +4594,6 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) return 0; } -void sev_es_vcpu_reset(struct vcpu_svm *svm) -{ - mutex_init(&svm->sev_es.snp_vmsa_mutex); -} - void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { struct kvm *kvm = svm->vcpu.kvm; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ed135dbd649..b237b4081c91 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1244,9 +1244,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->nmi_masked = false; svm->awaiting_iret_completion = false; - - if (sev_es_guest(vcpu->kvm)) - sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 321480ebe62f..3c7f208b7935 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -829,7 +829,6 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu); void sev_init_vmcb(struct vcpu_svm *svm, bool init_event); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); From a311fce2b694599cede76c5d7a0905336bc09803 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 19 Aug 2025 16:48:33 -0700 Subject: [PATCH 08/23] KVM: SVM: Enable Secure TSC for SNP guests Add support for Secure TSC, allowing userspace to configure the Secure TSC feature for SNP guests. Use the SNP specification's desired TSC frequency parameter during the SNP_LAUNCH_START command to set the mean TSC frequency in KHz for Secure TSC enabled guests. Always use kvm->arch.default_tsc_khz as the TSC frequency that is passed to SNP guests in the SNP_LAUNCH_START command. The default value is the host TSC frequency. Userspace can optionally change the TSC frequency via the KVM_SET_TSC_KHZ ioctl before calling the SNP_LAUNCH_START ioctl. Introduce the read-only MSR GUEST_TSC_FREQ (0xc0010134) that returns the guest's effective frequency in MHz when Secure TSC is enabled for SNP guests. Disable interception of this MSR when Secure TSC is enabled. Note that the GUEST_TSC_FREQ MSR is accessible only to the guest and not from the hypervisor context.
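A hypothetical VMM flow under the resulting ABI, assuming the VM-scoped KVM_SET_TSC_KHZ variant (KVM_CAP_VM_TSC_CONTROL) that backs kvm->arch.default_tsc_khz; the fd name and the 2 GHz value are illustrative:

	/* Optionally override the guest TSC frequency (in kHz) before launch. */
	ioctl(vm_fd, KVM_SET_TSC_KHZ, 2000000);	/* 2 GHz */

	/*
	 * KVM_SEV_SNP_LAUNCH_START then forwards kvm->arch.default_tsc_khz as
	 * desired_tsc_khz when SVM_SEV_FEAT_SECURE_TSC was requested via
	 * KVM_SEV_INIT2's vmsa_features.
	 */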
Co-developed-by: Ketan Chaturvedi Signed-off-by: Ketan Chaturvedi Reviewed-by: Kai Huang Reviewed-by: Tom Lendacky Signed-off-by: Nikunj A Dadhania [sean: contain Secure TSC to sev.c] Link: https://lore.kernel.org/r/20250819234833.3080255-9-seanjc@google.com [sean: return -EINVAL if TSC frequency is '0'] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 1 + arch/x86/kvm/svm/sev.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index ffc27f676243..17f6c3fedeee 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -299,6 +299,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define SVM_SEV_FEAT_SECURE_TSC BIT(9) #define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7d1d34e45310..86fd270a1d6e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -146,6 +146,14 @@ static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } +static bool snp_is_secure_tsc_enabled(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) && + !WARN_ON_ONCE(!sev_snp_guest(kvm)); +} + /* Must be called with the sev_bitmap_lock held */ static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { @@ -415,6 +423,9 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, if (data->flags) return -EINVAL; + if (!snp_active) + valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; + if (data->vmsa_features & ~valid_vmsa_features) return -EINVAL; @@ -2187,6 +2198,13 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) return -EINVAL; + if (snp_is_secure_tsc_enabled(kvm)) { + if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) + return -EINVAL; + + start.desired_tsc_khz = kvm->arch.default_tsc_khz; + } + sev->policy = params.policy; sev->snp_context = snp_context_create(kvm, argp); @@ -2195,6 +2213,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start.gctx_paddr = __psp_pa(sev->snp_context); start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); if (rc) { @@ -3085,6 +3104,9 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; + + if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) + sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; } void sev_hardware_unsetup(void) @@ -4452,6 +4474,9 @@ void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R, + !snp_is_secure_tsc_enabled(vcpu->kvm)); + /* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. 
@@ -4591,6 +4616,9 @@ int sev_vcpu_create(struct kvm_vcpu *vcpu) return -ENOMEM; svm->sev_es.vmsa = page_address(vmsa_page); + + vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm); + return 0; } From 2f5f8fb9de095e9b255a89269827f1761c714690 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 21 Aug 2025 14:38:41 -0700 Subject: [PATCH 09/23] KVM: SEV: Save the SEV policy if and only if LAUNCH_START succeeds Wait until LAUNCH_START fully succeeds to set a VM's SEV/SNP policy so that KVM doesn't keep a potentially stale policy. In practice, the issue is benign as the policy is only used to detect if the VMSA can be decrypted, and the VMSA only needs to be decrypted if LAUNCH_UPDATE and thus LAUNCH_START succeeded. Fixes: 962e2b6152ef ("KVM: SVM: Decrypt SEV VMSA in dump_vmcb() if debugging is enabled") Cc: Tom Lendacky Cc: Kim Phillips Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250821213841.3462339-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 86fd270a1d6e..a95b862afa23 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -583,8 +583,6 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - sev->policy = params.policy; - memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -632,6 +630,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_session; } + sev->policy = params.policy; sev->handle = start.handle; sev->fd = argp->sev_fd; @@ -2205,8 +2204,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start.desired_tsc_khz = kvm->arch.default_tsc_khz; } - sev->policy = params.policy; - sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) return -ENOTTY; @@ -2222,6 +2219,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_context; } + sev->policy = params.policy; sev->fd = argp->sev_fd; rc = snp_bind_asid(kvm, &argp->error); if (rc) { From fc55b4cda00aff08ea6dfe86411efa13bdb728c5 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 3 Sep 2025 02:29:50 +0200 Subject: [PATCH 10/23] KVM: nSVM: Replace kzalloc() + copy_from_user() with memdup_user() Replace kzalloc() followed by copy_from_user() with memdup_user() to improve and simplify svm_set_nested_state(). Return early if an error occurs instead of trying to allocate memory for 'save' when memory allocation for 'ctl' already failed. 
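For readers unfamiliar with the helper: memdup_user() folds the allocation and the copy into one call and reports failure via ERR_PTR() rather than NULL, which is what enables the early returns. A generic sketch of the idiom (names are placeholders):

	void *buf;

	buf = memdup_user(user_ptr, len);	/* kmalloc() + copy_from_user() */
	if (IS_ERR(buf))
		return PTR_ERR(buf);		/* -ENOMEM or -EFAULT */

	/* ... consume buf ... */
	kfree(buf);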
Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20250903002951.118912-1-thorsten.blum@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b7fd2e869998..826473f2d7c7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1798,17 +1798,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) return -EINVAL; - ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); - save = kzalloc(sizeof(*save), GFP_KERNEL); - if (!ctl || !save) - goto out_free; + ctl = memdup_user(&user_vmcb->control, sizeof(*ctl)); + if (IS_ERR(ctl)) + return PTR_ERR(ctl); - ret = -EFAULT; - if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) - goto out_free; - if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) - goto out_free; + save = memdup_user(&user_vmcb->save, sizeof(*save)); + if (IS_ERR(save)) { + kfree(ctl); + return PTR_ERR(save); + } ret = -EINVAL; __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); From e0ff302b79c54567eb6b8f609e76c633978ff06d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:08 -0700 Subject: [PATCH 11/23] KVM: SEV: Rename kvm_ghcb_get_sw_exit_code() to kvm_get_cached_sw_exit_code() Rename kvm_ghcb_get_sw_exit_code() to kvm_get_cached_sw_exit_code() to make it clear that KVM is getting the cached value, not reading directly from the guest-controlled GHCB. More importantly, vacating kvm_ghcb_get_sw_exit_code() will allow adding a KVM-specific macro-built kvm_ghcb_get_##field() helper to read values from the GHCB. No functional change intended. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919223258.1604852-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a95b862afa23..5456a71f9504 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3220,7 +3220,7 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) { return (((u64)control->exit_code_hi) << 32) | control->exit_code; } @@ -3246,7 +3246,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3335,7 +3335,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. 
*/ - exit_code = kvm_ghcb_get_sw_exit_code(control); + exit_code = kvm_get_cached_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { @@ -4340,7 +4340,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_ghcb_get_sw_exit_code(control); + exit_code = kvm_get_cached_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); From bd5f500d23170e5bde59ce97da523048b66a8183 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:09 -0700 Subject: [PATCH 12/23] KVM: SEV: Read save fields from GHCB exactly once Wrap all reads of GHCB save fields with READ_ONCE() via a KVM-specific GHCB get() utility to help guard against TOCTOU bugs. Using READ_ONCE() doesn't completely prevent such bugs, e.g. doesn't prevent KVM from redoing get() after checking the initial value, but at least addresses all potential TOCTOU issues in the current KVM code base. To prevent unintentional use of the generic helpers, take only @svm for the kvm_ghcb_get_xxx() helpers and retrieve the ghcb instead of explicitly passing it in. Opportunistically reduce the indentation of the macro-defined helpers and clean up the alignment. Fixes: 4e15a0ddc3ff ("KVM: SEV: snapshot the GHCB before accessing it") Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919223258.1604852-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 22 +++++++++++----------- arch/x86/kvm/svm/svm.h | 25 +++++++++++++++---------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5456a71f9504..a2076ab3fe71 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3299,26 +3299,26 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb); - vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); + vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); - svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb); + svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); if (kvm_ghcb_xcr0_is_valid(svm)) { - vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); + vcpu->arch.xcr0 = kvm_ghcb_get_xcr0(svm); vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ - exit_code = ghcb_get_sw_exit_code(ghcb); + exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_code = lower_32_bits(exit_code); control->exit_code_hi = upper_32_bits(exit_code); - control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); - control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); - svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb); + control->exit_info_1 = 
kvm_ghcb_get_sw_exit_info_1(svm); control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); /* Clear the valid entries fields */ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3c7f208b7935..10d5cbc259e1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -913,16 +913,21 @@ void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ - static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ - { \ - return test_bit(GHCB_BITMAP_IDX(field), \ - (unsigned long *)&svm->sev_es.valid_bitmap); \ - } \ - \ - static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \ - { \ - return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \ - } \ +static __always_inline u64 kvm_ghcb_get_##field(struct vcpu_svm *svm) \ +{ \ + return READ_ONCE(svm->sev_es.ghcb->save.field); \ +} \ + \ +static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ +{ \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&svm->sev_es.valid_bitmap); \ +} \ + \ +static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm) \ +{ \ + return kvm_ghcb_##field##_is_valid(svm) ? kvm_ghcb_get_##field(svm) : 0; \ +} DEFINE_KVM_GHCB_ACCESSORS(cpl) DEFINE_KVM_GHCB_ACCESSORS(rax) From 4135a9a8ccba2b685f2301429ea765fa0f78eb89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:10 -0700 Subject: [PATCH 13/23] KVM: SEV: Validate XCR0 provided by guest in GHCB Use __kvm_set_xcr() to propagate XCR0 changes from the GHCB to KVM's software model in order to validate the new XCR0 against KVM's view of the supported XCR0. Allowing garbage is thankfully mostly benign, as kvm_load_{guest,host}_xsave_state() bail early for vCPUs with protected state, xstate_required_size() will simply provide garbage back to the guest, and attempting to save/restore the bad value via KVM_{G,S}ET_XCRS will only harm the guest (setting XCR0 will fail). However, allowing the guest to put junk into a field that KVM assumes is valid is a CVE waiting to happen. And as a bonus, using the proper API eliminates the ugly open coding of setting arch.cpuid_dynamic_bits_dirty. Simply ignore bad values, as either the guest managed to get an unsupported value into hardware, or the guest is misbehaving and providing pure garbage. In either case, KVM can't fix the broken guest. Note, using __kvm_set_xcr() also avoids recomputing dynamic CPUID bits if XCR0 isn't actually changing (relative to KVM's previous snapshot).
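The architectural rules that __kvm_set_xcr() enforces are worth spelling out; a condensed sketch (abbreviated, not the verbatim function body):

	/* Checks now applied to GHCB-provided XCR0 as well. */
	if (index != 0)				/* only XCR0 is defined today */
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))		/* x87 (bit 0) must be set */
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;			/* AVX requires SSE */
	if (xcr0 & ~vcpu->arch.guest_supported_xcr0)
		return 1;			/* no bits KVM didn't grant */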
Cc: Tom Lendacky Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919223258.1604852-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/sev.c | 6 ++---- arch/x86/kvm/x86.c | 3 ++- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..822a5596a4a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2187,6 +2187,7 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2076ab3fe71..d2ffacd43234 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3307,10 +3307,8 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); - if (kvm_ghcb_xcr0_is_valid(svm)) { - vcpu->arch.xcr0 = kvm_ghcb_get_xcr0(svm); - vcpu->arch.cpuid_dynamic_bits_dirty = true; - } + if (kvm_ghcb_xcr0_is_valid(svm)) + __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); /* Copy the GHCB exit information into the VMCB fields */ exit_code = kvm_ghcb_get_sw_exit_code(svm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..1d7faf8bc785 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1237,7 +1237,7 @@ static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) } #endif -static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1281,6 +1281,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } +EXPORT_SYMBOL_GPL(__kvm_set_xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { From 5b66e335ead6472f336b4d7d9cbf14488b844f27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:16:49 -0700 Subject: [PATCH 14/23] KVM: SEV: Reject non-positive effective lengths during LAUNCH_UPDATE Check for an invalid length during LAUNCH_UPDATE at the start of snp_launch_update() instead of subtly relying on kvm_gmem_populate() to detect the bad state. Code that directly handles userspace input absolutely should sanitize those inputs; failure to do so is asking for bugs where KVM consumes an invalid "npages". Keep the check in gmem, but wrap it in a WARN to flag any bad usage by the caller. Note, this is technically an ABI change as KVM would previously allow a length of '0'. But allowing a length of '0' is nonsensical and creates pointless conundrums in KVM. E.g. an empty range is arguably neither private nor shared, but LAUNCH_UPDATE will fail if the starting gpa can't be made private. In practice, no known or well-behaved VMM passes a length of '0'. Note #2, the PAGE_ALIGNED(params.len) check ensures that lengths between 1 and 4095 (inclusive) are also rejected, i.e. that KVM won't end up with npages=0 when doing "npages = params.len / PAGE_SIZE". 
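To make the boundary cases concrete (assuming 4 KiB pages), the combined check sorts inputs as follows:

	/* PAGE_SIZE == 4096 assumed for these examples. */
	params.len = 0;		/* rejected by !params.len */
	params.len = 4095;	/* rejected by !PAGE_ALIGNED(params.len) */
	params.len = 4096;	/* accepted, npages = params.len / PAGE_SIZE = 1 */
	params.len = 8192;	/* accepted, npages = 2 */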
Cc: Thomas Lendacky Cc: Michael Roth Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250919211649.1575654-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- virt/kvm/guest_memfd.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d2ffacd43234..019d920fe442 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2353,7 +2353,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, params.gfn_start, params.len, params.type, params.flags); - if (!PAGE_ALIGNED(params.len) || params.flags || + if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 7d85cc33c0bb..79552467add5 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -639,7 +639,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long long i; lockdep_assert_held(&kvm->slots_lock); - if (npages < 0) + + if (WARN_ON_ONCE(npages <= 0)) return -EINVAL; slot = gfn_to_memslot(kvm, start_gfn); From 9bc366350734246301b090802fc71f9924daad39 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 23 Sep 2025 08:37:37 -0700 Subject: [PATCH 15/23] KVM: x86: Add helper to retrieve current value of user return MSR In the user return MSR support, the cached value is always the hardware value of the specific MSR. Therefore, add a helper to retrieve the cached value, which can replace the need for RDMSR, for example, to allow SEV-ES guests to restore the correct host hardware value without using RDMSR. 
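In effect, the helper substitutes a per-CPU cache lookup for a live MSR read; a sketch of the replacement pattern (the slot would come from kvm_find_user_return_msr(), shown here as a placeholder):

	u32 lo, hi;
	u64 val;

	rdmsr(MSR_TSC_AUX, lo, hi);		/* live MSR read */
	val = kvm_get_user_return_msr(slot);	/* cached hardware value, no RDMSR */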
Cc: stable@vger.kernel.org Signed-off-by: Hou Wenlong [sean: drop "cache" from the name, make it a one-liner, tag for stable] Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250923153738.1875174-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 822a5596a4a0..0d6a4af79b78 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2357,6 +2357,7 @@ int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); void kvm_user_return_msr_update_cache(unsigned int index, u64 val); +u64 kvm_get_user_return_msr(unsigned int slot); static inline bool kvm_is_supported_user_return_msr(u32 msr) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d7faf8bc785..5ac2183b9993 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -677,6 +677,12 @@ void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) } EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); +u64 kvm_get_user_return_msr(unsigned int slot) +{ + return this_cpu_ptr(user_return_msrs)->values[slot].curr; +} +EXPORT_SYMBOL_GPL(kvm_get_user_return_msr); + static void drop_user_return_notifiers(void) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); From 29da8c823abffdacb71c7c07ec48fcf9eb38757c Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 23 Sep 2025 08:37:38 -0700 Subject: [PATCH 16/23] KVM: SVM: Re-load current, not host, TSC_AUX on #VMEXIT from SEV-ES guest Prior to running an SEV-ES guest, set TSC_AUX in the host save area to the current value in hardware, as tracked by the user return infrastructure, instead of always loading the host's desired value for the CPU. If the pCPU is also running a non-SEV-ES vCPU, loading the host's value on #VMEXIT could clobber the other vCPU's value, e.g. if the SEV-ES vCPU preempted the non-SEV-ES vCPU, in which case KVM expects the other vCPU's TSC_AUX value to be resident in hardware. Note, unlike TDX, which blindly _zeroes_ TSC_AUX on TD-Exit, SEV-ES CPUs can load an arbitrary value. Stuff the current value in the host save area instead of refreshing the user return cache so that KVM doesn't need to track whether or not the vCPU actually entered the guest and thus loaded TSC_AUX from the host save area. Opportunistically tag tsc_aux_uret_slot as read-only after init to guard against unexpected modifications, and to make it obvious that using the variable in sev_es_prepare_switch_to_guest() is safe.
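A concrete interleaving that the fix addresses, written out as a hypothetical timeline (one pCPU, two vCPUs):

	/*
	 * 1. vCPU-A (non-SEV-ES) runs; KVM's user-return code loads A's
	 *    TSC_AUX into hardware and caches it as the current value.
	 * 2. vCPU-B (SEV-ES) preempts A and enters the guest; on #VMEXIT the
	 *    CPU restores TSC_AUX from B's host save area.
	 * 3. If that save area held the host's default value, A's TSC_AUX is
	 *    now clobbered even though KVM believes it is still resident in
	 *    hardware.
	 *
	 * Writing the cached user-return value (kvm_get_user_return_msr())
	 * into the host save area makes step 2 restore whatever step 1 loaded.
	 */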
Fixes: 916e3e5f26ab ("KVM: SVM: Do not use user return MSR support for virtualized TSC_AUX") Cc: stable@vger.kernel.org Suggested-by: Lai Jiangshan Signed-off-by: Hou Wenlong [sean: handle the SEV-ES case in sev_es_prepare_switch_to_guest()] Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250923153738.1875174-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 10 ++++++++++ arch/x86/kvm/svm/svm.c | 25 ++++++------------------- arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 019d920fe442..5529e4c3362b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4666,6 +4666,16 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); } + + /* + * TSC_AUX is always virtualized for SEV-ES guests when the feature is + * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. + * Set the save area to the current hardware value, i.e. the current + * user return value, so that the correct value is restored on #VMEXIT. + */ + if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) && + !WARN_ON_ONCE(tsc_aux_uret_slot < 0)) + hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot); } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b237b4081c91..6f486fb82144 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -195,7 +195,7 @@ static DEFINE_MUTEX(vmcb_dump_mutex); * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * defer the restoration of TSC_AUX until the CPU returns to userspace. */ -static int tsc_aux_uret_slot __read_mostly = -1; +int tsc_aux_uret_slot __ro_after_init = -1; static int get_npt_level(void) { @@ -577,18 +577,6 @@ static int svm_enable_virtualization_cpu(void) amd_pmu_enable_virt(); - /* - * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type - * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. - * Since Linux does not change the value of TSC_AUX once set, prime the - * TSC_AUX field now to avoid a RDMSR on every vCPU run. - */ - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - u32 __maybe_unused msr_hi; - - rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); - } - return 0; } @@ -1406,10 +1394,10 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); /* - * TSC_AUX is always virtualized for SEV-ES guests when the feature is - * available. The user return MSR support is not required in this case - * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_enable_virtualization_cpu()). + * TSC_AUX is always virtualized (context switched by hardware) for + * SEV-ES guests when the feature is available. For non-SEV-ES guests, + * context switch TSC_AUX via the user_return MSR infrastructure (not + * all CPUs support TSC_AUX virtualization). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) @@ -3004,8 +2992,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * TSC_AUX is always virtualized for SEV-ES guests when the * feature is available. 
The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT - * from the host save area (which has been initialized in - * svm_enable_virtualization_cpu()). + * from the host save area. */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 10d5cbc259e1..ec3fb318ca83 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -52,6 +52,8 @@ extern bool x2avic_enabled; extern bool vnmi; extern int lbrv; +extern int tsc_aux_uret_slot __ro_after_init; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to From 44bfe1f0490d5620c7962ab7384797672b4c4293 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:28 -0700 Subject: [PATCH 17/23] KVM: SVM: Make svm_x86_ops globally visible, clean up on-HyperV usage Make svm_x86_ops globally visible in anticipation of modifying the struct in avic.c, and clean up the KVM-on-HyperV usage, as declaring _and using_ a local variable in a header that's only defined in one specific .c-file is all kinds of ugly. Opportunistically make svm_hv_enable_l2_tlb_flush() local to svm_onhyperv.c, as the only reason it was visible was due to the aforementioned shenanigans in svm_onhyperv.h. Alternatively, svm_x86_ops could be explicitly passed to svm_hv_hardware_setup() as a parameter. While that approach is slightly safer, e.g. avoids "hidden" updates, for better or worse, the Intel side of KVM has already chosen to expose vt_x86_ops (and vt_init_ops). Given that svm_x86_ops is only truly consumed by kvm_ops_update, the odds of a "hidden" update causing problems are extremely low. So, absent a strong reason to rework the VMX/TDX code, make svm_x86_ops visible, as having all updates use exactly "svm_x86_ops." is advantageous in its own right. No functional change intended. Link: https://lore.kernel.org/r/20250919215934.1590410-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/svm/svm_onhyperv.c | 28 +++++++++++++++++++++++++++- arch/x86/kvm/svm/svm_onhyperv.h | 31 +------------------------------ 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6f486fb82144..bfbd34818412 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5012,7 +5012,7 @@ static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) return page_address(page); } -static struct kvm_x86_ops svm_x86_ops __initdata = { +struct kvm_x86_ops svm_x86_ops __initdata = { .name = KBUILD_MODNAME, .check_processor_compatibility = svm_check_processor_compat, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index ec3fb318ca83..bc46a3539487 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -54,6 +54,8 @@ extern int lbrv; extern int tsc_aux_uret_slot __ro_after_init; +extern struct kvm_x86_ops svm_x86_ops __initdata; + /* * Clean bits in VMCB. 
* VMCB_ALL_CLEAN_MASK might also need to diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 3971b3ea5d04..a8e78c0e5956 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -15,7 +15,7 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) +static int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_vmcb_enlightenments *hve; hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); @@ -35,3 +35,29 @@ int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +__init void svm_hv_hardware_setup(void) +{ + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); + svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { + int cpu; + + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); + for_each_online_cpu(cpu) { + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->nested_control.features.directhypercall = 1; + } + svm_x86_ops.enable_l2_tlb_flush = + svm_hv_enable_l2_tlb_flush; + } +} diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index f85bc617ffe4..08f14e6f195c 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -13,9 +13,7 @@ #include "kvm_onhyperv.h" #include "svm/hyperv.h" -static struct kvm_x86_ops svm_x86_ops; - -int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); +__init void svm_hv_hardware_setup(void); static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) { @@ -40,33 +38,6 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) hve->hv_enlightenments_control.msr_bitmap = 1; } -static inline __init void svm_hv_hardware_setup(void) -{ - if (npt_enabled && - ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); - svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; - svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; - } - - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { - int cpu; - - pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); - for_each_online_cpu(cpu) { - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 1; - } - svm_x86_ops.enable_l2_tlb_flush = - svm_hv_enable_l2_tlb_flush; - } -} - static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { From eb44ea6a7aace13becd8d99905284ad272b3bd98 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 14:59:29 -0700 Subject: [PATCH 18/23] KVM: SVM: Move x2AVIC MSR interception helper to avic.c Move svm_set_x2apic_msr_interception() to avic.c as it's only relevant when x2AVIC is enabled/supported and only called by AVIC code. In addition to scoping AVIC code to avic.c, this will allow burying the global x2avic_enabled variable in avic. Opportunistically rename the helper to explicitly scope it to "avic". No functional change intended. 
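For orientation, the X2APIC_MSR() entries in the moved table map xAPIC MMIO register offsets into the x2APIC MSR space; the macro expands to APIC_BASE_MSR + (offset >> 4), i.e. the 0x800-based MSR range. Two worked examples:

	X2APIC_MSR(APIC_TASKPRI);	/* 0x800 + (0x080 >> 4) == 0x808, the TPR */
	X2APIC_MSR(APIC_ICR);		/* 0x800 + (0x300 >> 4) == 0x830, the ICR */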
Reviewed-by: Naveen N Rao (AMD) Tested-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250919215934.1590410-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 57 ++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/svm/svm.c | 49 ----------------------------------- arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a34c5c3b164e..478a18208a76 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -79,6 +79,57 @@ static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); bool x2avic_enabled; + +static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, + bool intercept) +{ + static const u32 x2avic_passthrough_msrs[] = { + X2APIC_MSR(APIC_ID), + X2APIC_MSR(APIC_LVR), + X2APIC_MSR(APIC_TASKPRI), + X2APIC_MSR(APIC_ARBPRI), + X2APIC_MSR(APIC_PROCPRI), + X2APIC_MSR(APIC_EOI), + X2APIC_MSR(APIC_RRR), + X2APIC_MSR(APIC_LDR), + X2APIC_MSR(APIC_DFR), + X2APIC_MSR(APIC_SPIV), + X2APIC_MSR(APIC_ISR), + X2APIC_MSR(APIC_TMR), + X2APIC_MSR(APIC_IRR), + X2APIC_MSR(APIC_ESR), + X2APIC_MSR(APIC_ICR), + X2APIC_MSR(APIC_ICR2), + + /* + * Note! Always intercept LVTT, as TSC-deadline timer mode + * isn't virtualized by hardware, and the CPU will generate a + * #GP instead of a #VMEXIT. + */ + X2APIC_MSR(APIC_LVTTHMR), + X2APIC_MSR(APIC_LVTPC), + X2APIC_MSR(APIC_LVT0), + X2APIC_MSR(APIC_LVT1), + X2APIC_MSR(APIC_LVTERR), + X2APIC_MSR(APIC_TMICT), + X2APIC_MSR(APIC_TMCCT), + X2APIC_MSR(APIC_TDCR), + }; + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (!x2avic_enabled) + return; + + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], + MSR_TYPE_RW, intercept); + + svm->x2avic_msrs_intercepted = intercept; +} + static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -99,7 +150,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) vmcb->control.int_ctl |= X2APIC_MODE_MASK; vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; /* Disabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, false); + avic_set_x2apic_msr_interception(svm, false); } else { /* * Flush the TLB, the guest may have inserted a non-APIC @@ -110,7 +161,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* For xAVIC and hybrid-xAVIC modes */ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } } @@ -130,7 +181,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) return; /* Enabling MSR intercept for x2APIC registers */ - svm_set_x2apic_msr_interception(svm, true); + avic_set_x2apic_msr_interception(svm, true); } /* Note: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bfbd34818412..44032c78aab0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -724,55 +724,6 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); } -void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) -{ - static const u32 x2avic_passthrough_msrs[] = { - X2APIC_MSR(APIC_ID), - X2APIC_MSR(APIC_LVR), - X2APIC_MSR(APIC_TASKPRI), - X2APIC_MSR(APIC_ARBPRI), - X2APIC_MSR(APIC_PROCPRI), - X2APIC_MSR(APIC_EOI), - 
-		X2APIC_MSR(APIC_RRR),
-		X2APIC_MSR(APIC_LDR),
-		X2APIC_MSR(APIC_DFR),
-		X2APIC_MSR(APIC_SPIV),
-		X2APIC_MSR(APIC_ISR),
-		X2APIC_MSR(APIC_TMR),
-		X2APIC_MSR(APIC_IRR),
-		X2APIC_MSR(APIC_ESR),
-		X2APIC_MSR(APIC_ICR),
-		X2APIC_MSR(APIC_ICR2),
-
-		/*
-		 * Note! Always intercept LVTT, as TSC-deadline timer mode
-		 * isn't virtualized by hardware, and the CPU will generate a
-		 * #GP instead of a #VMEXIT.
-		 */
-		X2APIC_MSR(APIC_LVTTHMR),
-		X2APIC_MSR(APIC_LVTPC),
-		X2APIC_MSR(APIC_LVT0),
-		X2APIC_MSR(APIC_LVT1),
-		X2APIC_MSR(APIC_LVTERR),
-		X2APIC_MSR(APIC_TMICT),
-		X2APIC_MSR(APIC_TMCCT),
-		X2APIC_MSR(APIC_TDCR),
-	};
-	int i;
-
-	if (intercept == svm->x2avic_msrs_intercepted)
-		return;
-
-	if (!x2avic_enabled)
-		return;
-
-	for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
-		svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
-					  MSR_TYPE_RW, intercept);
-
-	svm->x2avic_msrs_intercepted = intercept;
-}
-
 void svm_vcpu_free_msrpm(void *msrpm)
 {
 	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index bc46a3539487..cb1d26cb5113 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -703,7 +703,6 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 			  int read, int write);
-void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
 				     int trig_mode, int vec);
 

From a9095e4fc4368052593c84472a8d374fefd3df71 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 14:59:30 -0700
Subject: [PATCH 19/23] KVM: SVM: Update "APICv in x2APIC without x2AVIC" in avic.c, not svm.c

Set the "allow_apicv_in_x2apic_without_x2apic_virtualization" flag as
part of avic_hardware_setup() instead of handling it in
svm_hardware_setup(), and make x2avic_enabled local to avic.c (setting
the flag was the only use in svm.c).

Tag avic_hardware_setup() with __init as necessary (it should have been
tagged __init long ago).

No functional change intended (aside from the side effects of tagging
avic_hardware_setup() with __init).

Acked-by: Naveen N Rao (AMD)
Tested-by: Naveen N Rao (AMD)
Link: https://lore.kernel.org/r/20250919215934.1590410-4-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/avic.c | 6 ++++--
 arch/x86/kvm/svm/svm.c  | 2 --
 arch/x86/kvm/svm/svm.h  | 3 +--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 478a18208a76..b4577401ce5f 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -77,7 +77,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
 static u32 next_vm_id = 0;
 static bool next_vm_id_wrapped = 0;
 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
-bool x2avic_enabled;
+static bool x2avic_enabled;
 
 static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
@@ -1147,7 +1147,7 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
  * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
  * - The mode can be switched at run-time.
  */
-bool avic_hardware_setup(void)
+bool __init avic_hardware_setup(void)
 {
 	if (!npt_enabled)
 		return false;
@@ -1182,6 +1182,8 @@ bool avic_hardware_setup(void)
 	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
 	if (x2avic_enabled)
 		pr_info("x2AVIC enabled\n");
+	else
+		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
 
 	/*
 	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 44032c78aab0..3fd2f4097a3b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5337,8 +5337,6 @@ static __init int svm_hardware_setup(void)
 		svm_x86_ops.vcpu_blocking = NULL;
 		svm_x86_ops.vcpu_unblocking = NULL;
 		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
-	} else if (!x2avic_enabled) {
-		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
 	}
 
 	if (vls) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index cb1d26cb5113..739f4f52f46d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -48,7 +48,6 @@ extern bool npt_enabled;
 extern int nrips;
 extern int vgif;
 extern bool intercept_smi;
-extern bool x2avic_enabled;
 extern bool vnmi;
 extern int lbrv;
 
@@ -804,7 +803,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 		BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG)	\
 	)
 
-bool avic_hardware_setup(void);
+bool __init avic_hardware_setup(void);
 int avic_ga_log_notifier(u32 ga_tag);
 void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);

From ce4253e21fa8a4468474e26884196770cb560eae Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 14:59:31 -0700
Subject: [PATCH 20/23] KVM: SVM: Always print "AVIC enabled" separately, even when force enabled

Print the customary "AVIC enabled" informational message even when AVIC
is force enabled on a system that doesn't advertise support for AVIC in
CPUID, as not printing the standard message can confuse users and tools.

Opportunistically clean up the scary message when AVIC is force enabled,
but keep it as a separate message so that it is printed at level "warn",
versus the standard message only being printed at level "info".

Suggested-by: Naveen N Rao (AMD)
Reviewed-by: Naveen N Rao (AMD)
Tested-by: Naveen N Rao (AMD)
Link: https://lore.kernel.org/r/20250919215934.1590410-5-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/avic.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index b4577401ce5f..b8b73c4103c6 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -1167,16 +1167,15 @@ bool __init avic_hardware_setup(void)
 		return false;
 	}
 
-	if (boot_cpu_has(X86_FEATURE_AVIC)) {
-		pr_info("AVIC enabled\n");
-	} else if (force_avic) {
-		/*
-		 * Some older systems does not advertise AVIC support.
-		 * See Revision Guide for specific AMD processor for more detail.
-		 */
-		pr_warn("AVIC is not supported in CPUID but force enabled");
-		pr_warn("Your system might crash and burn");
-	}
+	/*
+	 * Print a scary message if AVIC is force enabled to make it abundantly
+	 * clear that ignoring CPUID could have repercussions. See Revision
+	 * Guide for specific AMD processor for more details.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_AVIC))
+		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
+
+	pr_info("AVIC enabled\n");
 
 	/* AVIC is a prerequisite for x2AVIC. */
 	x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);

From ad65dca2ca4cf8c377135362c0c1e031ad92019d Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 14:59:32 -0700
Subject: [PATCH 21/23] KVM: SVM: Don't advise the user to do force_avic=y (when x2AVIC is detected)

Don't advise the end user to try to force enable AVIC when x2AVIC is
reported as supported in CPUID, as forcefully enabling AVIC isn't
something that should be done lightly. E.g. some Zen4 client systems
hide AVIC but leave x2AVIC behind, and while such a configuration is
indeed due to buggy firmware in the sense that reporting x2AVIC without
AVIC is nonsensical, KVM has no idea _why_ firmware disabled AVIC in the
first place.

Suggesting that the user try to run with force_avic=y is sketchy even if
the user explicitly tries to enable AVIC, and will be downright
irresponsible once KVM starts enabling AVIC by default. Alternatively,
KVM could print the message only when the user explicitly asks for AVIC,
but running with force_avic=y isn't something that should be encouraged
for random users. force_avic is a useful knob for developers and perhaps
even advanced users, but isn't something that KVM should advertise
broadly.

Opportunistically append a newline to the pr_warn() so that it prints
out immediately, and tweak the message to say that AVIC is unsupported
instead of disabled (disabled suggests that the kernel/KVM is somehow
responsible).

Suggested-by: Naveen N Rao (AMD)
Reviewed-by: Naveen N Rao (AMD)
Tested-by: Naveen N Rao (AMD)
Link: https://lore.kernel.org/r/20250919215934.1590410-6-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/avic.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index b8b73c4103c6..35dde7d89f56 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -1154,10 +1154,8 @@ bool __init avic_hardware_setup(void)
 
 	/* AVIC is a prerequisite for x2AVIC. */
 	if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
-		if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
-			pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
-			pr_warn(FW_BUG "Try enable AVIC using force_avic option");
-		}
+		if (boot_cpu_has(X86_FEATURE_X2AVIC))
+			pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
 		return false;
 	}
 

From b14665353162db70e445aacdd18b0566edba00c2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Sep 2025 14:59:33 -0700
Subject: [PATCH 22/23] KVM: SVM: Move global "avic" variable to avic.c

Move "avic" to avic.c so that it's colocated with the other AVIC-specific
globals and module params, and so that avic_hardware_setup() is a bit
more self-contained, e.g. similar to sev_hardware_setup().

Deliberately set enable_apicv in svm.c as it's already globally visible
(defined by kvm.ko, not by kvm-amd.ko), and to clearly capture the
dependency on enable_apicv being initialized (svm_hardware_setup()
clears several AVIC-specific hooks when enable_apicv is disabled).

Alternatively, clearing of the hooks (and enable_ipiv) could be moved to
avic_hardware_setup(), but that's not obviously better, e.g. it's
helpful to isolate the setting of enable_apicv when reading code from
the generic x86 side of the world.

No functional change intended.
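As background for the "cannot use module_param_named" comment that the
diff below moves along with the variable: module_param() derives the
parameter's sysfs name from the backing variable, while
module_param_named() decouples the two. A hedged sketch of the
alternative that comment rules out (illustrative only, not code from this
series; enable_apicv is defined by kvm.ko and wants different defaults
under VMX and SVM):

	/*
	 * Hypothetical alternative, NOT what the patch does: expose the
	 * generic enable_apicv directly under the name "avic". Ruled out
	 * because VMX and SVM need different defaults for the shared
	 * variable, whereas a private bool can default however SVM likes.
	 */
	module_param_named(avic, enable_apicv, bool, 0444);

Keeping a private "avic" that is resolved into enable_apicv during
hardware setup lets each vendor module pick its own default.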
Acked-by: Naveen N Rao (AMD)
Tested-by: Naveen N Rao (AMD)
Link: https://lore.kernel.org/r/20250919215934.1590410-7-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/avic.c | 33 +++++++++++++++++++++++++--------
 arch/x86/kvm/svm/svm.c  | 11 +----------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 35dde7d89f56..ec214062d136 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -64,6 +64,14 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
 
+/*
+ * enable / disable AVIC. Because the defaults differ for APICv
+ * support between VMX and SVM we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
+
 static bool force_avic;
 module_param_unsafe(force_avic, bool, 0444);
 
@@ -1141,15 +1149,9 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
 		avic_vcpu_load(vcpu, vcpu->cpu);
 }
 
-/*
- * Note:
- * - The module param avic enable both xAPIC and x2APIC mode.
- * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
- * - The mode can be switched at run-time.
- */
-bool __init avic_hardware_setup(void)
+static bool __init avic_want_avic_enabled(void)
 {
-	if (!npt_enabled)
+	if (!avic || !npt_enabled)
 		return false;
 
 	/* AVIC is a prerequisite for x2AVIC. */
@@ -1173,6 +1175,21 @@ bool __init avic_hardware_setup(void)
 	if (!boot_cpu_has(X86_FEATURE_AVIC))
 		pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
 
+	return true;
+}
+
+/*
+ * Note:
+ * - The module param avic enable both xAPIC and x2APIC mode.
+ * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
+ * - The mode can be switched at run-time.
+ */
+bool __init avic_hardware_setup(void)
+{
+	avic = avic_want_avic_enabled();
+	if (!avic)
+		return false;
+
 	pr_info("AVIC enabled\n");
 
 	/* AVIC is a prerequisite for x2AVIC. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3fd2f4097a3b..748881a3dedb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -158,14 +158,6 @@ module_param(lbrv, int, 0444);
 static int tsc_scaling = true;
 module_param(tsc_scaling, int, 0444);
 
-/*
- * enable / disable AVIC. Because the defaults differ for APICv
- * support between VMX and SVM we cannot use module_param_named.
- */
-static bool avic;
-module_param(avic, bool, 0444);
-module_param(enable_ipiv, bool, 0444);
-
 module_param(enable_device_posted_irqs, bool, 0444);
 
 bool __read_mostly dump_invalid_vmcb;
@@ -5330,8 +5322,7 @@ static __init int svm_hardware_setup(void)
 			goto err;
 	}
 
-	enable_apicv = avic = avic && avic_hardware_setup();
-
+	enable_apicv = avic_hardware_setup();
 	if (!enable_apicv) {
 		enable_ipiv = false;
 		svm_x86_ops.vcpu_blocking = NULL;

From ca2967de5a5b098b43c5ad665672945ce7e7d4f7 Mon Sep 17 00:00:00 2001
From: Naveen N Rao
Date: Fri, 19 Sep 2025 14:59:34 -0700
Subject: [PATCH 23/23] KVM: SVM: Enable AVIC by default for Zen4+ if x2AVIC is supported

AVIC and x2AVIC are fully functional since Zen4, with no known hardware
errata. Enable AVIC and x2AVIC by default on Zen4+ so long as x2AVIC is
supported (to avoid enabling partial support for APIC virtualization by
default).

Internally, convert "avic" to an integer so that KVM can identify if the
user has asked to explicitly enable or disable AVIC, i.e. so that KVM
doesn't override an explicit 'y' from the user.
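For reference on param_set_bint(), which the new setter in the diff below
falls back to: it accepts the standard boolean spellings but stores the
result into an int, which is what leaves room for the -1 sentinel. A
simplified model of its behavior (the real helper lives in
kernel/params.c; this sketch is an approximation for illustration, not
the actual implementation):

	/* Approximate model of param_set_bint(): parse as bool, store as int. */
	static int param_set_bint_model(const char *val, const struct kernel_param *kp)
	{
		bool v;
		int ret;

		/* A NULL @val (flag passed with no argument) reads as true. */
		ret = kstrtobool(val ?: "y", &v);
		if (ret)
			return ret;

		*(int *)kp->arg = v;
		return 0;
	}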
Arbitrarily use -1 to denote auto-mode, and accept the string "auto" for
the module param in addition to standard boolean values, i.e. continue to
allow the user to configure the "avic" module parameter to explicitly
enable/disable AVIC. To again maintain backward compatibility with a
standard boolean param, set KERNEL_PARAM_OPS_FL_NOARG, which tells the
params infrastructure to allow empty values for %true, i.e. to interpret
a bare "avic" as "avic=y". Take care to check for a NULL @val when
looking for "auto"!

Lastly, always print "avic" as a boolean, since auto-mode is resolved
during module initialization, i.e. the user should never see "auto" in
sysfs.

Signed-off-by: Naveen N Rao (AMD)
Tested-by: Naveen N Rao (AMD)
Co-developed-by: Sean Christopherson
Link: https://lore.kernel.org/r/20250919215934.1590410-8-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/avic.c | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index ec214062d136..f286b5706d7c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -64,12 +64,32 @@ static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
 
+#define AVIC_AUTO_MODE	-1
+
+static int avic_param_set(const char *val, const struct kernel_param *kp)
+{
+	if (val && sysfs_streq(val, "auto")) {
+		*(int *)kp->arg = AVIC_AUTO_MODE;
+		return 0;
+	}
+
+	return param_set_bint(val, kp);
+}
+
+static const struct kernel_param_ops avic_ops = {
+	.flags = KERNEL_PARAM_OPS_FL_NOARG,
+	.set = avic_param_set,
+	.get = param_get_bool,
+};
+
 /*
- * enable / disable AVIC. Because the defaults differ for APICv
- * support between VMX and SVM we cannot use module_param_named.
+ * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled
+ * for Zen4+ CPUs with x2AVIC (and all other criteria for enablement are met).
  */
-static bool avic;
-module_param(avic, bool, 0444);
+static int avic = AVIC_AUTO_MODE;
+module_param_cb(avic, &avic_ops, &avic, 0444);
+__MODULE_PARM_TYPE(avic, "bool");
+
 module_param(enable_ipiv, bool, 0444);
 
 static bool force_avic;
 module_param_unsafe(force_avic, bool, 0444);
@@ -1151,6 +1171,18 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
 
 static bool __init avic_want_avic_enabled(void)
 {
+	/*
+	 * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
+	 * supported (to avoid enabling partial support by default, and because
+	 * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for
+	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
+	 * aren't inclusive of previous generations, i.e. the kernel will set
+	 * at most one ZenX feature flag.
+	 */
+	if (avic == AVIC_AUTO_MODE)
+		avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
+		       (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));
+
 	if (!avic || !npt_enabled)
 		return false;
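One small detail worth calling out in avic_param_set() above:
sysfs_streq() is used rather than strcmp() because it also accepts a
newline-terminated string (e.g. sysfs_streq("auto\n", "auto") is true),
which is the conventional choice for kernel_param setters even though
this particular parameter is mode 0444 and thus not writable via sysfs at
runtime.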