From 93c9e107386dbe1243287a5b14ceca894de372b9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 22 Sep 2025 09:29:22 -0700 Subject: [PATCH 01/21] KVM: SVM: Mark VMCB_PERM_MAP as dirty on nested VMRUN Mark the VMCB_PERM_MAP bit as dirty in nested_vmcb02_prepare_control() on every nested VMRUN. If L1 changes MSR interception (INTERCEPT_MSR_PROT) between two VMRUN instructions on the same L1 vCPU, the msrpm_base_pa in the associated vmcb02 will change, and the VMCB_PERM_MAP clean bit should be cleared. Fixes: 4bb170a5430b ("KVM: nSVM: do not mark all VMCB02 fields dirty on nested vmexit") Reported-by: Matteo Rizzo Cc: stable@vger.kernel.org Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250922162935.621409-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a6443feab252..35cea27862c6 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -752,6 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP); /* * Stash vmcb02's counter if the guest hasn't moved past the guilty From 7c8b465a1c91f674655ea9cec5083744ec5f796a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 22 Sep 2025 09:29:23 -0700 Subject: [PATCH 02/21] KVM: SVM: Mark VMCB_NPT as dirty on nested VMRUN Mark the VMCB_NPT bit as dirty in nested_vmcb02_prepare_save() on every nested VMRUN. If L1 changes the PAT MSR between two VMRUN instructions on the same L1 vCPU, the g_pat field in the associated vmcb02 will change, and the VMCB_NPT clean bit should be cleared. Fixes: 4bb170a5430b ("KVM: nSVM: do not mark all VMCB02 fields dirty on nested vmexit") Cc: stable@vger.kernel.org Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250922162935.621409-3-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 35cea27862c6..83de3456df70 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -613,6 +613,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); + vmcb_mark_dirty(vmcb02, VMCB_NPT); /* Load the nested guest state */ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { From 4793f990ea1523309a58d8fb5237b3a815e6f537 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 30 Sep 2025 17:14:07 -0700 Subject: [PATCH 03/21] KVM: x86: Advertise EferLmsleUnsupported to userspace CPUID.80000008H:EBX.EferLmsleUnsupported[bit 20] is a defeature bit. When this bit is clear, EFER.LMSLE is supported. When this bit is set, EFER.LMSLE is unsupported. KVM has never _emulated_ EFER.LMSLE, so KVM cannot truly support a 0-setting of this bit. However, KVM has allowed the guest to enable EFER.LMSLE in hardware since commit eec4b140c924 ("KVM: SVM: Allow EFER.LMSLE to be set with nested svm"), i.e. KVM partially virtualizes long-mode segment limits _if_ they are supported by the underlying hardware. Pass through the bit in KVM_GET_SUPPORTED_CPUID to advertise the unavailability of EFER.LMSLE to userspace based on the raw underlying hardware. 
Attempting to enable EFER.LMSLE on such CPUs simply doesn't work, e.g. immediately crashes on VMRUN. Signed-off-by: Jim Mattson Reviewed-by: Nikunj A Dadhania Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20251001001529.1119031-2-jmattson@google.com [sean: add context about partial virtualization, use PASSTHROUGH_F] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..6bdf868c8f8e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -338,6 +338,7 @@ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ #define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ +#define X86_FEATURE_EFER_LMSLE_MBZ (13*32+20) /* EFER.LMSLE must be zero */ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52524e0ca97f..d563a948318b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1135,6 +1135,7 @@ void kvm_set_cpu_caps(void) F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), F(AMD_IBRS_SAME_MODE), + PASSTHROUGH_F(EFER_LMSLE_MBZ), F(AMD_PSFD), F(AMD_IBPB_RET), ); From c53c632592a427bc01266a8ce7e2f17555a3c247 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 30 Sep 2025 17:14:08 -0700 Subject: [PATCH 04/21] KVM: SVM: Disallow EFER.LMSLE when not supported by hardware Modern AMD CPUs do not support segment limit checks in 64-bit mode (i.e. EFER.LMSLE must be zero). Do not allow a guest to set EFER.LMSLE on a CPU that requires the bit to be zero. For backwards compatibility, allow EFER.LMSLE to be set on CPUs that support segment limit checks in 64-bit mode, even though KVM's implementation of the feature is incomplete (e.g. KVM's emulator does not enforce segment limits in 64-bit mode). 
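For illustration, the defeature bit can be probed directly from userspace. A minimal sketch, assuming only the CPUID.80000008H:EBX[bit 20] encoding described in the previous patch (the helper name is hypothetical and not part of this series):

    #include <cpuid.h>
    #include <stdbool.h>

    /*
     * Hypothetical userspace probe: returns true if EFER.LMSLE is unavailable,
     * i.e. if CPUID.80000008H:EBX.EferLmsleUnsupported[bit 20] is set.
     */
    static bool efer_lmsle_unsupported(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                    return true;    /* Leaf absent; assume LMSLE is not supported. */

            return ebx & (1u << 20);
    }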
Fixes: eec4b140c924 ("KVM: SVM: Allow EFER.LMSLE to be set with nested svm") Signed-off-by: Jim Mattson Reviewed-by: Nikunj A Dadhania Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20251001001529.1119031-3-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..dadb562bd4b9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5320,7 +5320,9 @@ static __init int svm_hardware_setup(void) if (nested) { pr_info("Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + kvm_enable_efer_bits(EFER_SVME); + if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ)) + kvm_enable_efer_bits(EFER_LMSLE); r = nested_svm_init_msrpm_merge_offsets(); if (r) From 574ef752d4aea04134bc121294d717f4422c2755 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:01 +0530 Subject: [PATCH 05/21] KVM: SVM: Limit AVIC physical max index based on configured max_vcpu_ids KVM allows VMMs to specify the maximum possible APIC ID for a virtual machine through the KVM_CAP_MAX_VCPU_ID capability so as to limit data structures related to APIC/x2APIC. Use that limit to set the AVIC physical max index in the VMCB, similar to VMX. This helps hardware limit the number of entries to be scanned in the physical APIC ID table, speeding up IPI broadcasts for virtual machines with a smaller number of vCPUs. Unlike VMX, SVM AVIC requires a single page to be allocated for the Physical APIC ID table and the Logical APIC ID table, so retain the existing approach of allocating those during VM init. Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/adb07ccdb3394cd79cb372ba6bcc69a4e4d4ef54.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f286b5706d7c..1a0573f5ff7c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -161,6 +161,7 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; + struct kvm *kvm = svm->vcpu.kvm; vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; @@ -176,7 +177,8 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; - vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, + X2AVIC_MAX_PHYSICAL_ID); /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { @@ -187,7 +189,8 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); /* For xAVIC and hybrid-xAVIC modes */ - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, + AVIC_MAX_PHYSICAL_ID); /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } From f2f6e67a56dc88fea7e9b10c4e79bb01d97386b7 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:02 +0530 Subject: [PATCH 06/21] KVM: SVM: Add a helper to look up the max physical ID for AVIC To help with a 
future change, add a helper to look up the maximum physical ID depending on the vCPU AVIC mode. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/0ab9bf5e20a3463a4aa3a5ea9bbbac66beedf1d1.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1a0573f5ff7c..96bad58ee3b4 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -158,13 +158,31 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, svm->x2avic_msrs_intercepted = intercept; } +static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +{ + u32 arch_max; + + if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) + arch_max = X2AVIC_MAX_PHYSICAL_ID; + else + arch_max = AVIC_MAX_PHYSICAL_ID; + + /* + * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID + * plus one, so the max possible APIC ID is one less than that. + */ + return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max); +} + static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; - struct kvm *kvm = svm->vcpu.kvm; + struct kvm_vcpu *vcpu = &svm->vcpu; vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; @@ -177,8 +195,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; - vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, - X2AVIC_MAX_PHYSICAL_ID); + /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { @@ -188,9 +205,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* For xAVIC and hybrid-xAVIC modes */ - vmcb->control.avic_physical_id |= min(kvm->arch.max_vcpu_ids - 1, - AVIC_MAX_PHYSICAL_ID); /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } From 83f3cbcd3a9f62adfa52257e3b7ae6bf8af54baa Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:03 +0530 Subject: [PATCH 07/21] KVM: SVM: Replace hard-coded value 0x1FF with the corresponding macro The lower 9-bit field in EXITINFO2 represents an index into the AVIC Physical/Logical APIC ID table for an AVIC_INCOMPLETE_IPI #VMEXIT. Since the index into the Logical APIC ID table is just 8 bits, this field is actually bound by the bit-width of the index into the AVIC Physical ID table, which is represented by AVIC_PHYSICAL_MAX_INDEX_MASK. So, use that macro to mask EXITINFO2.Index instead of hard-coding 0x1FF in avic_incomplete_ipi_interception(). 
Co-developed-by: Suravee Suthikulpanit Signed-off-by: Suravee Suthikulpanit Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/95795f449c68bffcb3e1789ee2b0b7393711d37d.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 96bad58ee3b4..b31be39a118f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -579,7 +579,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; + u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK; struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); From ca11d9d35e958d2b4020d1a360ddc277be3ee86c Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:04 +0530 Subject: [PATCH 08/21] KVM: SVM: Expand AVIC_PHYSICAL_MAX_INDEX_MASK to be a 12-bit field In the latest APM describing AVIC support for 4k vCPUs, VMCB AVIC_PHYSICAL_MAX_INDEX (Offset 0xF8) and EXITINFO2.Index are both updated from 9-bit wide to 12-bit wide fields unconditionally (i.e., regardless of AVIC support for 4k vCPUs). Expand AVIC_PHYSICAL_MAX_INDEX_MASK accordingly. While AVIC_PHYSICAL_MAX_INDEX_MASK is updated to a 12-bit field, KVM will limit the max vCPU/APIC ID based on the maximum supported on a specific processor and enforce that limit during vCPU creation. I.e., KVM doesn't need to rely on the mask to ensure that the max APIC ID being programmed in the VMCB is in range. The additional bits (11:9) were previously marked reserved and were never set/read by older processors. Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/a24ae953cea716bf9c56c136f7ca4bf5e97b1080.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 17f6c3fedeee..d227e710c6b4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -279,7 +279,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; -#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(11, 0) /* * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as From 54ffe74cc4ab2e7c6dd0a37a2298fffb642acba7 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:05 +0530 Subject: [PATCH 09/21] KVM: SVM: Move AVIC Physical ID table allocation to vcpu_precreate() With support for 4k vCPUs in x2AVIC, the size of the AVIC Physical ID table is expanded from a single 4k page to a maximum of 8 contiguous 4k pages. The actual number of pages allocated depends on the maximum possible APIC ID in the guest, which is only known by the time the first vCPU is created. In preparation for supporting a dynamic AVIC Physical ID table size, move its allocation to vcpu_precreate(). 
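The move works because KVM_CAP_MAX_VCPU_ID must be configured before the first vCPU is created, so the maximum APIC ID is known by the time the precreate hook runs. A minimal userspace sketch of that ordering (illustrative only, error handling elided, the value 64 is an arbitrary example):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int create_vm_with_bounded_apic_ids(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

            /* Bound the APIC ID space *before* any vCPU exists. */
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_MAX_VCPU_ID,
                    .args = { 64 },         /* max APIC ID for this VM is 63 */
            };
            ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

            /* The first KVM_CREATE_VCPU reaches the vcpu_precreate() hook. */
            return ioctl(vm_fd, KVM_CREATE_VCPU, 0);
    }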
Suggested-by: Sean Christopherson Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/7dc764e0af7f01440bbac3d9215ed174027c2384.1757009416.git.naveen@kernel.org [sean: drop enable_apicv check from svm_vcpu_precreate()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 21 +++++++++++++++++---- arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b31be39a118f..9866ef73501e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -264,6 +264,23 @@ int avic_ga_log_notifier(u32 ga_tag) return 0; } +int avic_alloc_physical_id_table(struct kvm *kvm) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_apicv) + return 0; + + if (kvm_svm->avic_physical_id_table) + return 0; + + kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_physical_id_table) + return -ENOMEM; + + return 0; +} + void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -291,10 +308,6 @@ int avic_vm_init(struct kvm *kvm) if (!enable_apicv) return 0; - kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!kvm_svm->avic_physical_id_table) - goto free_avic; - kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!kvm_svm->avic_logical_id_table) goto free_avic; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dadb562bd4b9..f14709a511aa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1207,6 +1207,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } +static int svm_vcpu_precreate(struct kvm *kvm) +{ + return avic_alloc_physical_id_table(kvm); +} + static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; @@ -5016,6 +5021,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, + .vcpu_precreate = svm_vcpu_precreate, .vcpu_create = svm_vcpu_create, .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e4b04f435b3d..6765a5e433ce 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -806,6 +806,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; bool __init avic_hardware_setup(void); int avic_ga_log_notifier(u32 ga_tag); +int avic_alloc_physical_id_table(struct kvm *kvm); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); From 5d0316e25defee47c8c7b1b6324244f630b6621f Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:06 +0530 Subject: [PATCH 10/21] x86/cpufeatures: Add X86_FEATURE_X2AVIC_EXT Add the CPUID feature bit for the x2AVIC extension, which enables AMD SVM to support up to 4096 vCPUs in x2AVIC mode. The primary change is in the size of the AVIC Physical ID table, which can now go up to 8 contiguous 4k pages. The number of pages allocated is determined by the maximum APIC ID configured for the guest. AVIC hardware is enhanced to look up Physical ID table entries for vCPUs > 512 in order to locate the target APIC backing page and the host APIC ID of the physical core on which the guest vCPU is running. 
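As a quick sanity check of the 8-page figure (a worked example, using the 8-byte, i.e. sizeof(u64), Physical ID table entry size that the sizing helper later in this series relies on):

    legacy x2AVIC: (511 + 1) entries * 8 bytes = 4096 bytes = 1 page
    X2AVIC_EXT:    (4095 + 1) entries * 8 bytes = 32768 bytes = 8 contiguous 4k pages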
Signed-off-by: Naveen N Rao (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/e5c9c471ab99a130bf9b728b77050ab308cf8624.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6bdf868c8f8e..7129eb44adad 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -500,6 +500,7 @@ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_X2AVIC_EXT (21*32+17) /* AMD SVM x2AVIC support for 4k vCPUs */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index caa4dc885c21..aa7f21f5f46b 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -49,6 +49,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 }, { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, From 940fc47cfb0d78b0f5db1f71dfce07c4711b9457 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 5 Sep 2025 00:03:07 +0530 Subject: [PATCH 11/21] KVM: SVM: Add AVIC support for 4k vCPUs in x2AVIC mode With AVIC support for 4k vCPUs, the maximum supported physical ID in x2AVIC mode is 4095. Since this is no longer fixed, introduce a variable (x2avic_max_physical_id) to capture the maximum supported physical ID on the current platform and use that in place of the existing macro (X2AVIC_MAX_PHYSICAL_ID). With AVIC support for 4k vCPUs, the AVIC Physical ID table is no longer a single page and can occupy up to 8 contiguous 4k pages. Since AVIC hardware accesses of the physical ID table are limited by the physical max index programmed in the VMCB, it is sufficient to allocate only as many pages as are required to have a physical table entry for the max guest APIC ID. Since the guest APIC mode is not available at this point, provision for the maximum possible x2AVIC ID. For this purpose, add a variant of avic_get_max_physical_id() that works with a NULL vCPU pointer and returns the max x2AVIC ID. Wrap this in a new helper for obtaining the allocation order. To make it easy to identify support for 4k vCPUs in x2AVIC mode, update the message printed to the kernel log to print the maximum number of vCPUs supported. Do this on all platforms supporting x2AVIC since it is useful to know what is supported on a specific platform. 
Co-developed-by: Suravee Suthikulpanit Signed-off-by: Suravee Suthikulpanit Signed-off-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/7fc5962f6da028f7dd3c79dbbd5c574fa02c99dd.1757009416.git.naveen@kernel.org Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +++ arch/x86/kvm/svm/avic.c | 50 ++++++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index d227e710c6b4..e69b6d0dedcf 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -289,11 +289,14 @@ enum avic_ipi_failure_cause { /* * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511). + * With X86_FEATURE_X2AVIC_EXT, the max index is increased to 0xfff (4095). */ #define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL +#define X2AVIC_4K_MAX_PHYSICAL_ID 0xFFFUL static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); +static_assert((X2AVIC_4K_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_4K_MAX_PHYSICAL_ID); #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 9866ef73501e..9c17f4269a71 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -106,7 +106,7 @@ static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); static bool x2avic_enabled; - +static u32 x2avic_max_physical_id; static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) @@ -158,12 +158,16 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, svm->x2avic_msrs_intercepted = intercept; } -static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu) { u32 arch_max; - if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic)) - arch_max = X2AVIC_MAX_PHYSICAL_ID; + /* + * Return the largest size (x2APIC) when querying without a vCPU, e.g. + * to allocate the per-VM table. + */ + if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic))) + arch_max = x2avic_max_physical_id; else arch_max = AVIC_MAX_PHYSICAL_ID; @@ -171,7 +175,12 @@ * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID * plus one, so the max possible APIC ID is one less than that. 
*/ - return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max); + return min(kvm->arch.max_vcpu_ids - 1, arch_max); +} + +static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +{ + return __avic_get_max_physical_id(vcpu->kvm, vcpu); } static void avic_activate_vmcb(struct vcpu_svm *svm) @@ -264,6 +273,12 @@ int avic_ga_log_notifier(u32 ga_tag) return 0; } +static int avic_get_physical_id_table_order(struct kvm *kvm) +{ + /* Provision for the maximum physical ID supported in x2avic mode */ + return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64)); +} + int avic_alloc_physical_id_table(struct kvm *kvm) { struct kvm_svm *kvm_svm = to_kvm_svm(kvm); @@ -274,7 +289,8 @@ int avic_alloc_physical_id_table(struct kvm *kvm) if (kvm_svm->avic_physical_id_table) return 0; - kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + avic_get_physical_id_table_order(kvm)); if (!kvm_svm->avic_physical_id_table) return -ENOMEM; @@ -290,7 +306,8 @@ void avic_vm_destroy(struct kvm *kvm) return; free_page((unsigned long)kvm_svm->avic_logical_id_table); - free_page((unsigned long)kvm_svm->avic_physical_id_table); + free_pages((unsigned long)kvm_svm->avic_physical_id_table, + avic_get_physical_id_table_order(kvm)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -372,7 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) * fully initialized AVIC. */ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > X2AVIC_MAX_PHYSICAL_ID)) { + (id > x2avic_max_physical_id)) { kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); vcpu->arch.apic->apicv_active = false; return 0; @@ -992,7 +1009,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; - if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; /* @@ -1054,7 +1072,8 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) lockdep_assert_preemption_disabled(); - if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; /* @@ -1256,10 +1275,15 @@ bool __init avic_hardware_setup(void) /* AVIC is a prerequisite for x2AVIC. */ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); - if (x2avic_enabled) - pr_info("x2AVIC enabled\n"); - else + if (x2avic_enabled) { + if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT)) + x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID; + else + x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID; + pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1); + } else { svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; + } /* * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) From 3d31bdf9cc79a3752bd1b6ba91af4e5ba37c47a8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 Oct 2025 19:29:16 +0000 Subject: [PATCH 12/21] KVM: nSVM: Remove redundant cases in nested_svm_intercept() Both the CRx and DRx cases are doing exactly what the default case is doing, remove them. No functional change intended. 
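For reference, the exit codes handled by the removed cases now fall through to the default case, which per the commit message performs the same check (a paraphrased sketch of that logic, not a verbatim quote of the function):

    default:
            if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
                    vmexit = NESTED_EXIT_DONE;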
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251024192918.3191141-2-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 83de3456df70..71664d54d8b2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1438,16 +1438,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) - vmexit = NESTED_EXIT_DONE; - break; - } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { /* * Host-intercepted exceptions have been checked already in From 5674a76db0213f9db1e4d08e847ff649b46889c0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 Oct 2025 19:29:17 +0000 Subject: [PATCH 13/21] KVM: nSVM: Propagate SVM_EXIT_CR0_SEL_WRITE correctly for LMSW emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When emulating L2 instructions, svm_check_intercept() checks whether a write to CR0 should trigger a synthesized #VMEXIT with SVM_EXIT_CR0_SEL_WRITE. For MOV-to-CR0, SVM_EXIT_CR0_SEL_WRITE is only triggered if any bit other than CR0.MP and CR0.TS is updated. However, according to the APM (24593—Rev. 3.42—March 2024, Table 15-7): The LMSW instruction treats the selective CR0-write intercept as a non-selective intercept (i.e., it intercepts regardless of the value being written). Skip checking the changed bits for x86_intercept_lmsw and always inject SVM_EXIT_CR0_SEL_WRITE. Fixes: cfec82cb7d31 ("KVM: SVM: Add intercept check for emulated cr accesses") Cc: stable@vger.kernel.org Reported-by: Matteo Rizzo Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251024192918.3191141-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f14709a511aa..bd8df212a59d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4546,20 +4546,20 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, INTERCEPT_SELECTIVE_CR0))) break; - cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; - val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; - + /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */ if (info->intercept == x86_intercept_lmsw) { - cr0 &= 0xfUL; - val &= 0xfUL; - /* lmsw can't clear PE - catch this here */ - if (cr0 & X86_CR0_PE) - val |= X86_CR0_PE; + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + break; } + /* + * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit + * other than SVM_CR0_SELECTIVE_MASK is changed. 
+ */ + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; if (cr0 ^ val) icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; - break; } case SVM_EXIT_READ_DR0: From 3d80f4c93d3d26d0f9a0dd2844961a632eeea634 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 Oct 2025 19:29:18 +0000 Subject: [PATCH 14/21] KVM: nSVM: Avoid incorrect injection of SVM_EXIT_CR0_SEL_WRITE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When emulating L2 instructions, svm_check_intercept() checks whether a write to CR0 should trigger a synthesized #VMEXIT with SVM_EXIT_CR0_SEL_WRITE. However, it does not check whether L1 enabled the intercept for SVM_EXIT_WRITE_CR0, which has higher priority according to the APM (24593—Rev. 3.42—March 2024, Table 15-7): When both selective and non-selective CR0-write intercepts are active at the same time, the non-selective intercept takes priority. With respect to exceptions, the priority of this intercept is the same as the generic CR0-write intercept. Make sure L1 does NOT intercept SVM_EXIT_WRITE_CR0 before checking if SVM_EXIT_CR0_SEL_WRITE needs to be injected. Opportunistically tweak the "not CR0" logic to explicitly bail early so that it's more obvious that only CR0 has a selective intercept, and that modifying icpt_info.exit_code is functionally necessary so that the call to nested_svm_exit_handled() checks the correct exit code. Fixes: cfec82cb7d31 ("KVM: SVM: Add intercept check for emulated cr accesses") Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251024192918.3191141-4-yosry.ahmed@linux.dev [sean: isolate non-CR0 write logic, tweak comments accordingly] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd8df212a59d..1ae7b3c5a7c5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4535,15 +4535,29 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - if (info->intercept == x86_intercept_cr_write) + /* + * Adjust the exit code accordingly if a CR other than CR0 is + * being written, and skip straight to the common handling as + * only CR0 has an additional selective intercept. + */ + if (info->intercept == x86_intercept_cr_write && info->modrm_reg) { icpt_info.exit_code += info->modrm_reg; + break; + } - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || - info->intercept == x86_intercept_clts) + /* + * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a + * selective CR0 intercept is triggered (the common logic will + * treat the selective intercept as being enabled). Note, the + * unconditional intercept has higher priority, i.e. this is + * only relevant if *only* the selective intercept is enabled. 
+ */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) || + !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; - if (!(vmcb12_is_intercept(&svm->nested.ctl, - INTERCEPT_SELECTIVE_CR0))) + /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */ + if (info->intercept == x86_intercept_clts) break; /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */ From c331b400e291a510eb9a0dbdc783b38e6f8321f0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 6 Nov 2025 11:12:30 -0800 Subject: [PATCH 15/21] KVM: SVM: Ensure SPEC_CTRL[63:32] is context switched between guest and host SPEC_CTRL is an MSR, i.e. a 64-bit value, but the VMRUN assembly code assumes bits 63:32 are always zero. The bug is _currently_ benign because neither KVM nor the kernel support setting any of bits 63:32, but it's still a bug that needs to be fixed. Signed-off-by: Uros Bizjak Suggested-by: Sean Christopherson Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251106191230.182393-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 47 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 235c4af6b692..98bfa2e00d88 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -52,11 +52,23 @@ * there must not be any returns or indirect branches between this code * and vmentry. */ - movl SVM_spec_ctrl(%_ASM_DI), %eax - cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax +#ifdef CONFIG_X86_64 + mov SVM_spec_ctrl(%rdi), %rdx + cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx je 801b + movl %edx, %eax + shr $32, %rdx +#else + mov SVM_spec_ctrl(%edi), %eax + mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx + xor %eax, %ecx + mov SVM_spec_ctrl + 4(%edi), %edx + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi + xor %edx, %esi + or %esi, %ecx + je 801b +#endif mov $MSR_IA32_SPEC_CTRL, %ecx - xor %edx, %edx wrmsr jmp 801b .endm @@ -81,13 +93,25 @@ jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) + movl %edx, SVM_spec_ctrl + 4(%_ASM_DI) 998: - /* Now restore the host value of the MSR if different from the guest's. */ - movl PER_CPU_VAR(x86_spec_ctrl_current), %eax - cmp SVM_spec_ctrl(%_ASM_DI), %eax +#ifdef CONFIG_X86_64 + mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx + cmp SVM_spec_ctrl(%rdi), %rdx je 901b - xor %edx, %edx + movl %edx, %eax + shr $32, %rdx +#else + mov PER_CPU_VAR(x86_spec_ctrl_current), %eax + mov SVM_spec_ctrl(%edi), %esi + xor %eax, %esi + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx + mov SVM_spec_ctrl + 4(%edi), %edi + xor %edx, %edi + or %edi, %esi + je 901b +#endif wrmsr jmp 901b .endm @@ -134,7 +158,7 @@ SYM_FUNC_START(__svm_vcpu_run) mov %_ASM_ARG1, %_ASM_DI .endif - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL /* @@ -211,7 +235,10 @@ SYM_FUNC_START(__svm_vcpu_run) /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT - /* Clobbers RAX, RCX, RDX. */ + /* + * Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm) + * and RSP (pointer to @spec_ctrl_intercepted). + */ RESTORE_HOST_SPEC_CTRL /* @@ -331,7 +358,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov %rdi, SEV_ES_RDI (%rdx) mov %rsi, SEV_ES_RSI (%rdx) - /* Clobbers RAX, RCX, RDX (@hostsa). */ + /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). 
*/ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ From 4da3768e1820cf15cced390242d8789aed34f54d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 4 Nov 2025 09:55:26 -0800 Subject: [PATCH 16/21] KVM: SVM: Don't skip unrelated instruction if INT3/INTO is replaced When re-injecting a soft interrupt from an INT3, INTO, or (select) INTn instruction, discard the exception and retry the instruction if the code stream is changed (e.g. by a different vCPU) between when the CPU executes the instruction and when KVM decodes the instruction to get the next RIP. As effectively predicted by commit 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction"), failure to verify that the correct INTn instruction was decoded can effectively clobber guest state due to decoding the wrong instruction and thus specifying the wrong next RIP. The bug most often manifests as "Oops: int3" panics on static branch checks in Linux guests. Enabling or disabling a static branch in Linux uses the kernel's "text poke" code patching mechanism. To modify code while other CPUs may be executing that code, Linux (temporarily) replaces the first byte of the original instruction with an int3 (opcode 0xcc), then patches in the new code stream except for the first byte, and finally replaces the int3 with the first byte of the new code stream. If a CPU hits the int3, i.e. executes the code while it's being modified, then the guest kernel must look up the RIP to determine how to handle the #BP, e.g. by emulating the new instruction. If the RIP is incorrect, then this lookup fails and the guest kernel panics. The bug reproduces almost instantly by hacking the guest kernel to repeatedly check a static branch[1] while running a drgn script[2] on the host to constantly swap out the memory containing the guest's TSS. [1]: https://gist.github.com/osandov/44d17c51c28c0ac998ea0334edf90b5a [2]: https://gist.github.com/osandov/10e45e45afa29b11e0c7209247afc00b Fixes: 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction") Cc: stable@vger.kernel.org Co-developed-by: Sean Christopherson Signed-off-by: Omar Sandoval Link: https://patch.msgid.link/1cc6dcdf36e3add7ee7c8d90ad58414eeb6c3d34.1762278762.git.osandov@fb.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 9 +++++++++ arch/x86/kvm/svm/svm.c | 24 +++++++++++++----------- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48598d017d6f..974d64bf0a4d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2143,6 +2143,11 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); * the gfn, i.e. retrying the instruction will hit a * !PRESENT fault, which results in a new shadow page * and sends KVM back to square one. + * + * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip + * an instruction if it could generate a given software + * interrupt, which must be encoded via + * EMULTYPE_SET_SOFT_INT_VECTOR(). 
*/ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -2153,6 +2158,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) #define EMULTYPE_WRITE_PF_TO_SP (1 << 8) +#define EMULTYPE_SKIP_SOFT_INT (1 << 9) + +#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16) +#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff) static inline bool kvm_can_emulate_event_vectoring(int emul_type) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1ae7b3c5a7c5..459db8154929 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + int emul_type, bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); @@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; - if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + if (!kvm_emulate_instruction(vcpu, emul_type)) return 0; if (unlikely(!commit_side_effects)) @@ -311,11 +312,13 @@ done: static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - return __svm_skip_emulated_instruction(vcpu, true); + return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); } -static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector) { + const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT | + EMULTYPE_SET_SOFT_INT_VECTOR(vector); unsigned long rip, old_rip = kvm_rip_read(vcpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) * in use, the skip must not commit any side effects such as clearing * the interrupt shadow or RFLAGS.RF. 
*/ - if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips)) return -EIO; rip = kvm_rip_read(vcpu); @@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu, ex); if (kvm_exception_is_soft(ex->vector) && - svm_update_soft_interrupt_rip(vcpu)) + svm_update_soft_interrupt_rip(vcpu, ex->vector)) return; svm->vmcb->control.event_inj = ex->vector @@ -3642,11 +3645,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { + struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt; struct vcpu_svm *svm = to_svm(vcpu); u32 type; - if (vcpu->arch.interrupt.soft) { - if (svm_update_soft_interrupt_rip(vcpu)) + if (intr->soft) { + if (svm_update_soft_interrupt_rip(vcpu, intr->nr)) return; type = SVM_EVTINJ_TYPE_SOFT; @@ -3654,12 +3658,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) type = SVM_EVTINJ_TYPE_INTR; } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr, - vcpu->arch.interrupt.soft, reinjected); + trace_kvm_inj_virq(intr->nr, intr->soft, reinjected); ++vcpu->stat.irq_injections; - svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | type; + svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42ecd093bb4c..8e14c0292809 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9338,6 +9338,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt, + int emulation_type) +{ + u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type); + + switch (ctxt->b) { + case 0xcc: + return vector == BP_VECTOR; + case 0xcd: + return vector == ctxt->src.val; + case 0xce: + return vector == OF_VECTOR; + default: + return false; + } +} + /* * Decode an instruction for emulation. The caller is responsible for handling * code breakpoints. Note, manually detecting code breakpoints is unnecessary @@ -9448,6 +9465,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * injecting single-step #DBs. */ if (emulation_type & EMULTYPE_SKIP) { + if (emulation_type & EMULTYPE_SKIP_SOFT_INT && + !is_soft_int_instruction(ctxt, emulation_type)) + return 0; + if (ctxt->mode != X86EMUL_MODE_PROT64) ctxt->eip = (u32)ctxt->_eip; else From 9f4ce4878878cb9694c4284f7a483984d52d4d9a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 30 Oct 2025 22:37:57 +0000 Subject: [PATCH 17/21] KVM: x86: Document a virtualization gap for GIF on AMD CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the APM Volume #2, Section 15.17, Table 15-10 (24593—Rev. 3.42—March 2024), when "GIF==0", a "Debug exception or trap, due to breakpoint register match" should be "Ignored and discarded". KVM lacks any handling of this. Even when vGIF is enabled and vGIF==0, the CPU does not ignore #DBs and relies on the VMM to do so. Handling this is possible, but the complexity is unjustified given the rarity of using HW breakpoints when GIF==0 (e.g. near VMRUN). KVM would need to intercept the #DB, temporarily disable the breakpoint, single-step over the instruction (probably reusing NMI single-stepping), and re-enable the breakpoint. Instead, document this as an erratum. 
Signed-off-by: Yosry Ahmed Reviewed-by: Bagas Sanjaya Link: https://patch.msgid.link/20251030223757.2950309-1-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/errata.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst index 37c79362a48f..a9cf0e004651 100644 --- a/Documentation/virt/kvm/x86/errata.rst +++ b/Documentation/virt/kvm/x86/errata.rst @@ -48,7 +48,14 @@ versus "has_error_code", i.e. KVM's ABI follows AMD behavior. Nested virtualization features ------------------------------ -TBD +On AMD CPUs, when GIF is cleared, #DB exceptions or traps due to a breakpoint +register match are ignored and discarded by the CPU. The CPU relies on the VMM +to fully virtualize this behavior, even when vGIF is enabled for the guest +(i.e. vGIF=0 does not cause the CPU to drop #DBs when the guest is running). +KVM does not virtualize this behavior as the complexity is unjustified given +the rarity of the use case. One way to handle this would be for KVM to +intercept the #DB, temporarily disable the breakpoint, single-step over the +instruction, then re-enable the breakpoint. x2APIC ------ From ce62118a2e4838bcef1050fff55001a0bf87f0cb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:49 -0500 Subject: [PATCH 18/21] KVM: SEV: Consolidate the SEV policy bits in a single header file Consolidate SEV policy bit definitions into a single file. Use include/linux/psp-sev.h to hold the definitions and remove the current definitions from the arch/x86/kvm/svm/sev.c and arch/x86/kvm/svm/svm.h files. No functional change intended. Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/d9639f88a0b521a1a67aeac77cc609fdea1f90bd.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 16 ++++------------ arch/x86/kvm/svm/svm.h | 3 --- include/linux/psp-sev.h | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0835c664fbfd..f04589ae76bb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -65,15 +65,7 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 -/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". 
*/ -#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) -#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) -#define SNP_POLICY_MASK_SMT BIT_ULL(16) -#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) -#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) -#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) - -#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ +#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ SNP_POLICY_MASK_API_MAJOR | \ SNP_POLICY_MASK_SMT | \ SNP_POLICY_MASK_RSVD_MBO | \ @@ -2207,7 +2199,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~SNP_POLICY_MASK_VALID) + if (params.policy & ~KVM_SNP_POLICY_MASK_VALID) return -EINVAL; /* Check for policy bits that must be set */ @@ -5085,10 +5077,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) /* Check if the SEV policy allows debugging */ if (sev_snp_guest(vcpu->kvm)) { - if (!(sev->policy & SNP_POLICY_DEBUG)) + if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) return NULL; } else { - if (sev->policy & SEV_POLICY_NODBG) + if (sev->policy & SEV_POLICY_MASK_NODBG) return NULL; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6765a5e433ce..a9f6c1ece63d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -117,9 +117,6 @@ struct kvm_sev_info { cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; -#define SEV_POLICY_NODBG BIT_ULL(0) -#define SNP_POLICY_DEBUG BIT_ULL(19) - struct kvm_svm { struct kvm kvm; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..27c92543bf38 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,25 @@ #include +/* As defined by SEV API, under "Guest Policy". */ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** From c9434e64e8b4d17511f514f7495008f573595e3e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:50 -0500 Subject: [PATCH 19/21] crypto: ccp - Add an API to return the supported SEV-SNP policy bits Supported policy bits are dependent on the level of SEV firmware that is currently running. Create an API to return the supported policy bits for the current level of firmware. 
Signed-off-by: Tom Lendacky Acked-by: Herbert Xu Link: https://patch.msgid.link/e3f711366ddc22e3dd215c987fd2e28dc1c07f54.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- drivers/crypto/ccp/sev-dev.c | 37 ++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 18 ++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 0d13d47c164b..db7c7c50cebc 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void) } EXPORT_SYMBOL_GPL(sev_platform_shutdown); +u64 sev_get_snp_policy_bits(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + u64 policy_bits; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + + policy_bits = SNP_POLICY_MASK_BASE; + + if (sev->snp_plat_status.feature_info) { + if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_RAPL_DIS; + + if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM; + + if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS; + + if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CXL_ALLOW; + + if (sev_version_greater_or_equal(1, 58)) + policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE; + } + + return policy_bits; +} +EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 27c92543bf38..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -32,6 +32,20 @@ #define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) #define SNP_POLICY_MASK_DEBUG BIT_ULL(19) #define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ @@ -868,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -1014,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ From 7a61d61396b97fd6bb9b9bde321c68513034ad11 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:51 -0500 Subject: [PATCH 20/21] KVM: SEV: Publish supported SEV-SNP policy bits Define the set of policy bits that KVM currently knows as not requiring any implementation support within KVM. 
Provide this value to userspace via the KVM_GET_DEVICE_ATTR ioctl. Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/c596f7529518f3f826a57970029451d9385949e5.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/svm/sev.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d420c9c066d4..7ceff6583652 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -502,6 +502,7 @@ struct kvm_sync_regs { /* vendor-specific groups and attributes for system fd */ #define KVM_X86_GRP_SEV 1 # define KVM_X86_SEV_VMSA_FEATURES 0 +# define KVM_X86_SNP_POLICY_BITS 1 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f04589ae76bb..a425674fe993 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -72,6 +72,8 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 SNP_POLICY_MASK_DEBUG | \ SNP_POLICY_MASK_SINGLE_SOCKET) +static u64 snp_supported_policy_bits __ro_after_init; + #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; @@ -2135,6 +2137,10 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) *val = sev_supported_vmsa_features; return 0; + case KVM_X86_SNP_POLICY_BITS: + *val = snp_supported_policy_bits; + return 0; + default: return -ENXIO; } @@ -2199,7 +2205,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~KVM_SNP_POLICY_MASK_VALID) + if (params.policy & ~snp_supported_policy_bits) return -EINVAL; /* Check for policy bits that must be set */ @@ -3092,8 +3098,11 @@ out: else if (sev_snp_supported) sev_snp_supported = is_sev_snp_initialized(); - if (sev_snp_supported) + if (sev_snp_supported) { + snp_supported_policy_bits = sev_get_snp_policy_bits() & + KVM_SNP_POLICY_MASK_VALID; nr_ciphertext_hiding_asids = init_args.max_snp_asid; + } /* * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP From 275d6d1189e6d5f8e7c1da43ffd4b09d7089f174 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:52 -0500 Subject: [PATCH 21/21] KVM: SEV: Add known supported SEV-SNP policy bits Add to the set of known, supported SEV-SNP policy bits those bits that don't require any implementation support from KVM in order to be used successfully. At this time, this includes: - CXL_ALLOW - MEM_AES_256_XTS - RAPL_DIS - CIPHERTEXT_HIDING_DRAM - PAGE_SWAP_DISABLE Arguably, RAPL_DIS and CIPHERTEXT_HIDING_DRAM require KVM and the CCP driver to enable these features in order for the setting of the policy bits to be successfully handled. But a guest owner may not wish their guest to run on a system that doesn't provide support for those features, so allowing the specification of these bits accomplishes that. Whether or not the bits are supported by the SEV firmware, a system that doesn't support these features will fail either during KVM's validation of the supported policy bits before issuing LAUNCH_START, or during LAUNCH_START itself. 
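For completeness, a hedged sketch of how a VMM could read the advertised mask from the system fd, using only the uapi constants added in the previous patch (illustrative only, error handling elided):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static __u64 query_snp_policy_bits(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            __u64 policy_bits = 0;

            /* KVM_X86_GRP_SEV attributes are exposed on the system (/dev/kvm) fd. */
            struct kvm_device_attr attr = {
                    .group = KVM_X86_GRP_SEV,
                    .attr = KVM_X86_SNP_POLICY_BITS,
                    .addr = (__u64)(unsigned long)&policy_bits,
            };

            if (ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr))
                    return 0;       /* Attribute not supported by this kernel. */

            return policy_bits;
    }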
Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/ec040de9864099cf592a97c201dc4cc110b2b0cf.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a425674fe993..f59c65abe3cf 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -65,12 +65,22 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 -#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ - SNP_POLICY_MASK_API_MAJOR | \ - SNP_POLICY_MASK_SMT | \ - SNP_POLICY_MASK_RSVD_MBO | \ - SNP_POLICY_MASK_DEBUG | \ - SNP_POLICY_MASK_SINGLE_SOCKET) +/* + * SEV-SNP policy bits that can be supported by KVM. These include policy bits + * that have implementation support within KVM or policy bits that do not + * require implementation support within KVM to enforce the policy. + */ +#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET | \ + SNP_POLICY_MASK_CXL_ALLOW | \ + SNP_POLICY_MASK_MEM_AES_256_XTS | \ + SNP_POLICY_MASK_RAPL_DIS | \ + SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \ + SNP_POLICY_MASK_PAGE_SWAP_DISABLE) static u64 snp_supported_policy_bits __ro_after_init;