author | 2022-10-07 07:11:59 -0400
---|---
committer | 2022-10-07 07:11:59 -0400
commit | e27f74f7a987777e413fae28ed697b00889a687a (patch)
tree | 1edcddee2db408eed9323ba25d2d814e922287b1
parent | Linux patch 5.4.216 (diff)
Linux patch 5.4.217
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
-rw-r--r-- | 0000_README | 4
-rw-r--r-- | 1216_linux-5.4.217.patch | 3111
2 files changed, 3115 insertions, 0 deletions
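The bulk of 1216_linux-5.4.217.patch below backports the IBRS-based RETBleed/Spectre v2 work, including a new write_spec_ctrl_current() helper that caches the intended SPEC_CTRL MSR value per CPU and skips the immediate WRMSR when KERNEL_IBRS will rewrite the MSR on return-to-user anyway. As a rough illustration only, here is a minimal user-space C sketch of that caching logic; the wrmsrl() stub, the plain globals standing in for per-CPU state, and the kernel_ibrs_enabled flag are stand-ins, not the kernel's actual APIs:

```c
/*
 * Sketch only -- mirrors the caching logic of the patch's
 * write_spec_ctrl_current() in plain user-space C. wrmsrl(), the globals
 * and kernel_ibrs_enabled are illustrative stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_IA32_SPEC_CTRL 0x48
#define SPEC_CTRL_IBRS     (1ULL << 0)

static uint64_t x86_spec_ctrl_current;  /* stand-in for the per-CPU variable */
static bool kernel_ibrs_enabled;        /* stand-in for X86_FEATURE_KERNEL_IBRS */

static void wrmsrl(uint32_t msr, uint64_t val)
{
	/* The real kernel issues a WRMSR here; just log it in this sketch. */
	printf("wrmsr 0x%x <- 0x%llx\n", msr, (unsigned long long)val);
}

/*
 * Cache the intended SPEC_CTRL value and avoid redundant writes. When
 * KERNEL_IBRS is active the MSR is rewritten on return-to-user anyway,
 * so the immediate write can be deferred unless 'force' is set.
 */
static void write_spec_ctrl_current(uint64_t val, bool force)
{
	if (x86_spec_ctrl_current == val)
		return;

	x86_spec_ctrl_current = val;

	if (force || !kernel_ibrs_enabled)
		wrmsrl(MSR_IA32_SPEC_CTRL, val);
}

int main(void)
{
	write_spec_ctrl_current(SPEC_CTRL_IBRS, true);   /* boot-time forced write */
	write_spec_ctrl_current(SPEC_CTRL_IBRS, false);  /* no change, no WRMSR */
	return 0;
}
```

The corresponding hunks in arch/x86/kernel/cpu/bugs.c below add the per-CPU x86_spec_ctrl_current variable and this helper, and the entry-code IBRS_ENTER/IBRS_EXIT macros consume the cached value on the kernel entry/exit paths.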
diff --git a/0000_README b/0000_README index 91501850..c7ac01c3 100644 --- a/0000_README +++ b/0000_README @@ -907,6 +907,10 @@ Patch: 1215_linux-5.4.216.patch From: http://www.kernel.org Desc: Linux 5.4.216 +Patch: 1216_linux-5.4.217.patch +From: http://www.kernel.org +Desc: Linux 5.4.217 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1216_linux-5.4.217.patch b/1216_linux-5.4.217.patch new file mode 100644 index 00000000..342e7a14 --- /dev/null +++ b/1216_linux-5.4.217.patch @@ -0,0 +1,3111 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index db9d53b879f89..8f71a17ad5442 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4298,6 +4298,18 @@ + + retain_initrd [RAM] Keep initrd memory after extraction + ++ retbleed= [X86] Control mitigation of RETBleed (Arbitrary ++ Speculative Code Execution with Return Instructions) ++ vulnerability. ++ ++ off - unconditionally disable ++ auto - automatically select a migitation ++ ++ Selecting 'auto' will choose a mitigation method at run ++ time according to the CPU. ++ ++ Not specifying this option is equivalent to retbleed=auto. ++ + rfkill.default_state= + 0 "airplane mode". All wifi, bluetooth, wimax, gps, fm, + etc. communication is blocked by default. +@@ -4541,6 +4553,7 @@ + eibrs - enhanced IBRS + eibrs,retpoline - enhanced IBRS + Retpolines + eibrs,lfence - enhanced IBRS + LFENCE ++ ibrs - use IBRS to protect kernel + + Not specifying this option is equivalent to + spectre_v2=auto. +diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst +index e899f14a4ba24..4f8a06b00f608 100644 +--- a/Documentation/process/code-of-conduct-interpretation.rst ++++ b/Documentation/process/code-of-conduct-interpretation.rst +@@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're + uncertain how to handle situations that come up. It will not be + considered a violation report unless you want it to be. If you are + uncertain about approaching the TAB or any other maintainers, please +-reach out to our conflict mediator, Mishi Choudhary <mishi@linux.com>. ++reach out to our conflict mediator, Joanna Lee <joanna.lee@gesmer.com>. + + In the end, "be kind to each other" is really what the end goal is for + everybody. 
We know everyone is human and we all fail at times, but the +diff --git a/Makefile b/Makefile +index 3d9d7ef6f8bf1..201ac8e410a94 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 5 + PATCHLEVEL = 4 +-SUBLEVEL = 216 ++SUBLEVEL = 217 + EXTRAVERSION = + NAME = Kleptomaniac Octopus + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index b3f1214787386..29e5675c6d4f2 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -6,6 +6,8 @@ + #include <asm/percpu.h> + #include <asm/asm-offsets.h> + #include <asm/processor-flags.h> ++#include <asm/msr.h> ++#include <asm/nospec-branch.h> + + /* + +@@ -146,27 +148,19 @@ For 32-bit we have the following conventions - kernel is built with + + .endm + +-.macro POP_REGS pop_rdi=1 skip_r11rcx=0 ++.macro POP_REGS pop_rdi=1 + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +- .if \skip_r11rcx +- popq %rsi +- .else + popq %r11 +- .endif + popq %r10 + popq %r9 + popq %r8 + popq %rax +- .if \skip_r11rcx +- popq %rsi +- .else + popq %rcx +- .endif + popq %rdx + popq %rsi + .if \pop_rdi +@@ -316,6 +310,62 @@ For 32-bit we have the following conventions - kernel is built with + + #endif + ++/* ++ * IBRS kernel mitigation for Spectre_v2. ++ * ++ * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers ++ * the regs it uses (AX, CX, DX). Must be called before the first RET ++ * instruction (NOTE! UNTRAIN_RET includes a RET instruction) ++ * ++ * The optional argument is used to save/restore the current value, ++ * which is used on the paranoid paths. ++ * ++ * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. ++ */ ++.macro IBRS_ENTER save_reg ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS ++ movl $MSR_IA32_SPEC_CTRL, %ecx ++ ++.ifnb \save_reg ++ rdmsr ++ shl $32, %rdx ++ or %rdx, %rax ++ mov %rax, \save_reg ++ test $SPEC_CTRL_IBRS, %eax ++ jz .Ldo_wrmsr_\@ ++ lfence ++ jmp .Lend_\@ ++.Ldo_wrmsr_\@: ++.endif ++ ++ movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx ++ movl %edx, %eax ++ shr $32, %rdx ++ wrmsr ++.Lend_\@: ++.endm ++ ++/* ++ * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX) ++ * regs. Must be called after the last RET. ++ */ ++.macro IBRS_EXIT save_reg ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS ++ movl $MSR_IA32_SPEC_CTRL, %ecx ++ ++.ifnb \save_reg ++ mov \save_reg, %rdx ++.else ++ movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx ++ andl $(~SPEC_CTRL_IBRS), %edx ++.endif ++ ++ movl %edx, %eax ++ shr $32, %rdx ++ wrmsr ++.Lend_\@: ++.endm ++ + /* + * Mitigate Spectre v1 for conditional swapgs code paths. + * +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index bde3e0f85425f..2d837fb54c31b 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -750,7 +750,6 @@ ENTRY(__switch_to_asm) + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset + #endif + +-#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated +@@ -759,7 +758,6 @@ ENTRY(__switch_to_asm) + * speculative execution to prevent attack. 
+ */ + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +-#endif + + /* restore callee-saved registers */ + popfl +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 2ba3d53ac5b11..c82136030d58f 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -172,6 +172,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) + /* IRQs are off. */ + movq %rax, %rdi + movq %rsp, %rsi ++ ++ /* clobbers %rax, make sure it is after saving the syscall nr */ ++ IBRS_ENTER ++ + call do_syscall_64 /* returns with IRQs disabled */ + + TRACE_IRQS_IRETQ /* we're about to change IF */ +@@ -248,8 +252,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) + * perf profiles. Nothing jumps here. + */ + syscall_return_via_sysret: +- /* rcx and r11 are already restored (see code above) */ +- POP_REGS pop_rdi=0 skip_r11rcx=1 ++ IBRS_EXIT ++ POP_REGS pop_rdi=0 + + /* + * Now all regs are restored except RSP and RDI. +@@ -301,7 +305,6 @@ ENTRY(__switch_to_asm) + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset + #endif + +-#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated +@@ -310,7 +313,6 @@ ENTRY(__switch_to_asm) + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +-#endif + + /* restore callee-saved registers */ + popq %r15 +@@ -622,6 +624,7 @@ GLOBAL(retint_user) + TRACE_IRQS_IRETQ + + GLOBAL(swapgs_restore_regs_and_return_to_usermode) ++ IBRS_EXIT + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testb $3, CS(%rsp) +@@ -1248,7 +1251,13 @@ ENTRY(paranoid_entry) + */ + FENCE_SWAPGS_KERNEL_ENTRY + +- ret ++ /* ++ * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like ++ * CR3 above, keep the old value in a callee saved register. ++ */ ++ IBRS_ENTER save_reg=%r15 ++ ++ RET + END(paranoid_entry) + + /* +@@ -1276,12 +1285,20 @@ ENTRY(paranoid_exit) + jmp .Lparanoid_exit_restore + .Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG ++ ++ /* ++ * Must restore IBRS state before both CR3 and %GS since we need access ++ * to the per-CPU x86_spec_ctrl_shadow variable. ++ */ ++ IBRS_EXIT save_reg=%r15 ++ + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + .Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel + END(paranoid_exit) + ++ + /* + * Save all registers in pt_regs, and switch GS if needed. + */ +@@ -1301,6 +1318,7 @@ ENTRY(error_entry) + FENCE_SWAPGS_USER_ENTRY + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax ++ IBRS_ENTER + + .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. 
*/ +@@ -1356,6 +1374,7 @@ ENTRY(error_entry) + SWAPGS + FENCE_SWAPGS_USER_ENTRY + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax ++ IBRS_ENTER + + /* + * Pretend that the exception came from user mode: set up pt_regs +@@ -1461,6 +1480,8 @@ ENTRY(nmi) + PUSH_AND_CLEAR_REGS rdx=(%rdx) + ENCODE_FRAME_POINTER + ++ IBRS_ENTER ++ + /* + * At this point we no longer need to worry about stack damage + * due to nesting -- we're on the normal thread stack and we're +@@ -1684,6 +1705,9 @@ end_repeat_nmi: + movq $-1, %rsi + call do_nmi + ++ /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ ++ IBRS_EXIT save_reg=%r15 ++ + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 39913770a44d5..c3c4ea4a6711a 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -4,7 +4,6 @@ + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +-#include "calling.h" + #include <asm/asm-offsets.h> + #include <asm/current.h> + #include <asm/errno.h> +@@ -17,6 +16,8 @@ + #include <linux/linkage.h> + #include <linux/err.h> + ++#include "calling.h" ++ + .section .entry.text, "ax" + + /* +@@ -106,6 +107,8 @@ ENTRY(entry_SYSENTER_compat) + xorl %r15d, %r15d /* nospec r15 */ + cld + ++ IBRS_ENTER ++ + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. To save a few cycles, we can check whether +@@ -253,6 +256,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) + */ + TRACE_IRQS_OFF + ++ IBRS_ENTER ++ + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ +@@ -267,6 +272,9 @@ sysret32_from_system_call: + */ + STACKLEAK_ERASE + TRACE_IRQS_ON /* User mode traces as IRQs on. */ ++ ++ IBRS_EXIT ++ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ +@@ -408,6 +416,7 @@ ENTRY(entry_INT80_compat) + * gate turned them off. + */ + TRACE_IRQS_OFF ++ IBRS_ENTER + + movq %rsp, %rdi + call do_int80_syscall_32 +diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h +index 0c814cd9ea42c..cdf39decf7340 100644 +--- a/arch/x86/include/asm/cpu_device_id.h ++++ b/arch/x86/include/asm/cpu_device_id.h +@@ -5,15 +5,22 @@ + /* + * Declare drivers belonging to specific x86 CPUs + * Similar in spirit to pci_device_id and related PCI functions ++ * ++ * The wildcard initializers are in mod_devicetable.h because ++ * file2alias needs them. Sigh. + */ +- + #include <linux/mod_devicetable.h> ++/* Get the INTEL_FAM* model defines */ ++#include <asm/intel-family.h> ++/* And the X86_VENDOR_* ones */ ++#include <asm/processor.h> + ++/* Centaur FAM6 models */ ++#define X86_CENTAUR_FAM6_C7_A 0xa + #define X86_CENTAUR_FAM6_C7_D 0xd + #define X86_CENTAUR_FAM6_NANO 0xf + + #define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) +- + /** + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY +@@ -26,8 +33,11 @@ + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * +- * Backport version to keep the SRBDS pile consistant. No shorter variants +- * required for this. ++ * Use only if you need all selectors. Otherwise use one of the shorter ++ * macros of the X86_MATCH_* family. If there is no matching shorthand ++ * macro, consider to add one. 
If you really need to wrap one of the macros ++ * into another macro at the usage site for good reasons, then please ++ * start this local macro with X86_MATCH to allow easy grepping. + */ + #define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ +@@ -39,6 +49,120 @@ + .driver_data = (unsigned long) _data \ + } + ++/** ++ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching ++ * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY ++ * The name is expanded to X86_VENDOR_@_vendor ++ * @_family: The family number or X86_FAMILY_ANY ++ * @_model: The model number, model constant or X86_MODEL_ANY ++ * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY ++ * @_data: Driver specific data or NULL. The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is ++ * set to wildcards. ++ */ ++#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ ++ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ ++ X86_STEPPING_ANY, feature, data) ++ ++/** ++ * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature ++ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY ++ * The name is expanded to X86_VENDOR_@vendor ++ * @family: The family number or X86_FAMILY_ANY ++ * @feature: A X86_FEATURE bit ++ * @data: Driver specific data or NULL. The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are ++ * set to wildcards. ++ */ ++#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ ++ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \ ++ X86_MODEL_ANY, feature, data) ++ ++/** ++ * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature ++ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY ++ * The name is expanded to X86_VENDOR_@vendor ++ * @feature: A X86_FEATURE bit ++ * @data: Driver specific data or NULL. The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are ++ * set to wildcards. ++ */ ++#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ ++ X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data) ++ ++/** ++ * X86_MATCH_FEATURE - Macro for matching a CPU feature ++ * @feature: A X86_FEATURE bit ++ * @data: Driver specific data or NULL. The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are ++ * set to wildcards. ++ */ ++#define X86_MATCH_FEATURE(feature, data) \ ++ X86_MATCH_VENDOR_FEATURE(ANY, feature, data) ++ ++/* Transitional to keep the existing code working */ ++#define X86_FEATURE_MATCH(feature) X86_MATCH_FEATURE(feature, NULL) ++ ++/** ++ * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model ++ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY ++ * The name is expanded to X86_VENDOR_@vendor ++ * @family: The family number or X86_FAMILY_ANY ++ * @model: The model number, model constant or X86_MODEL_ANY ++ * @data: Driver specific data or NULL. 
The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are ++ * set to wildcards. ++ */ ++#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ ++ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \ ++ X86_FEATURE_ANY, data) ++ ++/** ++ * X86_MATCH_VENDOR_FAM - Match vendor and family ++ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY ++ * The name is expanded to X86_VENDOR_@vendor ++ * @family: The family number or X86_FAMILY_ANY ++ * @data: Driver specific data or NULL. The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are ++ * set of wildcards. ++ */ ++#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ ++ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) ++ ++/** ++ * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model ++ * @model: The model name without the INTEL_FAM6_ prefix or ANY ++ * The model name is expanded to INTEL_FAM6_@model internally ++ * @data: Driver specific data or NULL. The internal storage ++ * format is unsigned long. The supplied value, pointer ++ * etc. is casted to unsigned long internally. ++ * ++ * The vendor is set to INTEL, the family to 6 and all other missing ++ * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. ++ * ++ * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. ++ */ ++#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ ++ X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) ++ + /* + * Match specific microcode revisions. 
+ * +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 736b0e412344b..2ec85d7bfdff2 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -203,8 +203,8 @@ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ + #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ +-#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +-#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ ++#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ ++#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ + #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ + #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ +@@ -286,7 +286,10 @@ + #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ + #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ + #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +-#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+ 6) /* "" Fill RSB on VM exit when EIBRS is enabled */ ++#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ ++#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ ++#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ ++#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ + + /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ + #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +@@ -303,6 +306,7 @@ + #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ + #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ + #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. 
*/ ++#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +@@ -407,7 +411,8 @@ + #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ + #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ + #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +-#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ ++#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ + #define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ ++#define X86_BUG_MMIO_UNKNOWN X86_BUG(28) /* CPU is too old and its MMIO Stale Data status is unknown */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h +index 5b07573c3bc87..c1d6d8bbb7dad 100644 +--- a/arch/x86/include/asm/intel-family.h ++++ b/arch/x86/include/asm/intel-family.h +@@ -35,6 +35,9 @@ + * The #define line may optionally include a comment including platform names. + */ + ++/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ ++#define INTEL_FAM6_ANY X86_MODEL_ANY ++ + #define INTEL_FAM6_CORE_YONAH 0x0E + + #define INTEL_FAM6_CORE2_MEROM 0x0F +@@ -126,6 +129,9 @@ + #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ + ++/* Family 5 */ ++#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ ++ + /* Useful macros */ + #define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ + { \ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index cef4eba03ff36..713886d5493a8 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -47,6 +47,8 @@ + #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ + #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ + #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ ++#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ ++#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) + + #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ + #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ +@@ -82,6 +84,7 @@ + #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a + #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ + #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ ++#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ + #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ + #define ARCH_CAP_SSB_NO BIT(4) /* + * Not susceptible to Speculative Store Bypass +@@ -129,6 +132,13 @@ + * bit available to control VERW + * behavior. + */ ++#define ARCH_CAP_RRSBA BIT(19) /* ++ * Indicates RET may use predictors ++ * other than the RSB. With eIBRS ++ * enabled predictions in kernel mode ++ * are restricted to targets in ++ * kernel. ++ */ + #define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. 
+diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index a1ee1a760c3eb..8c898eed28941 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -4,11 +4,14 @@ + #define _ASM_X86_NOSPEC_BRANCH_H_ + + #include <linux/static_key.h> ++#include <linux/frame.h> + + #include <asm/alternative.h> + #include <asm/alternative-asm.h> + #include <asm/cpufeatures.h> + #include <asm/msr-index.h> ++#include <asm/unwind_hints.h> ++#include <asm/percpu.h> + + /* + * This should be used immediately before a retpoline alternative. It tells +@@ -60,9 +63,9 @@ + lfence; \ + jmp 775b; \ + 774: \ ++ add $(BITS_PER_LONG/8) * 2, sp; \ + dec reg; \ + jnz 771b; \ +- add $(BITS_PER_LONG/8) * nr, sp; \ + /* barrier for jnz misprediction */ \ + lfence; + #else +@@ -79,13 +82,6 @@ + add $(BITS_PER_LONG/8) * nr, sp; + #endif + +-#define __ISSUE_UNBALANCED_RET_GUARD(sp) \ +- call 881f; \ +- int3; \ +-881: \ +- add $(BITS_PER_LONG/8), sp; \ +- lfence; +- + #ifdef __ASSEMBLY__ + + /* +@@ -155,26 +151,28 @@ + #endif + .endm + +-.macro ISSUE_UNBALANCED_RET_GUARD ftr:req +- ANNOTATE_NOSPEC_ALTERNATIVE +- ALTERNATIVE "jmp .Lskip_pbrsb_\@", \ +- __stringify(__ISSUE_UNBALANCED_RET_GUARD(%_ASM_SP)) \ +- \ftr +-.Lskip_pbrsb_\@: ++.macro ISSUE_UNBALANCED_RET_GUARD ++ call .Lunbalanced_ret_guard_\@ ++ int3 ++.Lunbalanced_ret_guard_\@: ++ add $(BITS_PER_LONG/8), %_ASM_SP ++ lfence + .endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +-#ifdef CONFIG_RETPOLINE +- ANNOTATE_NOSPEC_ALTERNATIVE +- ALTERNATIVE "jmp .Lskip_rsb_\@", \ +- __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ +- \ftr ++.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2 ++.ifb \ftr2 ++ ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr ++.else ++ ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2 ++.endif ++ __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) ++.Lunbalanced_\@: ++ ISSUE_UNBALANCED_RET_GUARD + .Lskip_rsb_\@: +-#endif + .endm + + #else /* __ASSEMBLY__ */ +@@ -249,6 +247,7 @@ enum spectre_v2_mitigation { + SPECTRE_V2_EIBRS, + SPECTRE_V2_EIBRS_RETPOLINE, + SPECTRE_V2_EIBRS_LFENCE, ++ SPECTRE_V2_IBRS, + }; + + /* The indirect branch speculation control variants */ +@@ -312,6 +311,9 @@ static inline void indirect_branch_prediction_barrier(void) + + /* The Intel SPEC CTRL MSR base value cache */ + extern u64 x86_spec_ctrl_base; ++DECLARE_PER_CPU(u64, x86_spec_ctrl_current); ++extern void write_spec_ctrl_current(u64 val, bool force); ++extern u64 spec_ctrl_current(void); + + /* + * With retpoline, we must use IBRS to restrict branch prediction +@@ -321,18 +323,16 @@ extern u64 x86_spec_ctrl_base; + */ + #define firmware_restrict_branch_speculation_start() \ + do { \ +- u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ +- \ + preempt_disable(); \ +- alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ ++ alternative_msr_write(MSR_IA32_SPEC_CTRL, \ ++ spec_ctrl_current() | SPEC_CTRL_IBRS, \ + X86_FEATURE_USE_IBRS_FW); \ + } while (0) + + #define firmware_restrict_branch_speculation_end() \ + do { \ +- u64 val = x86_spec_ctrl_base; \ +- \ +- alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ ++ alternative_msr_write(MSR_IA32_SPEC_CTRL, \ ++ spec_ctrl_current(), \ + X86_FEATURE_USE_IBRS_FW); \ + preempt_enable(); \ + } while (0) +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 88cef978380bf..5571b28d35b60 100644 +--- 
a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -894,12 +894,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c) + node_reclaim_distance = 32; + #endif + +- /* +- * Fix erratum 1076: CPB feature bit not being set in CPUID. +- * Always set it, except when running under a hypervisor. +- */ +- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) +- set_cpu_cap(c, X86_FEATURE_CPB); ++ /* Fix up CPUID bits, but only if not virtualised. */ ++ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { ++ ++ /* Erratum 1076: CPB feature bit not being set in CPUID. */ ++ if (!cpu_has(c, X86_FEATURE_CPB)) ++ set_cpu_cap(c, X86_FEATURE_CPB); ++ ++ /* ++ * Zen3 (Fam19 model < 0x10) parts are not susceptible to ++ * Branch Type Confusion, but predate the allocation of the ++ * BTC_NO bit. ++ */ ++ if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) ++ set_cpu_cap(c, X86_FEATURE_BTC_NO); ++ } + } + + static void init_amd(struct cpuinfo_x86 *c) +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index c90d91cb14341..cf5a18e261e36 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -37,6 +37,8 @@ + + static void __init spectre_v1_select_mitigation(void); + static void __init spectre_v2_select_mitigation(void); ++static void __init retbleed_select_mitigation(void); ++static void __init spectre_v2_user_select_mitigation(void); + static void __init ssb_select_mitigation(void); + static void __init l1tf_select_mitigation(void); + static void __init mds_select_mitigation(void); +@@ -46,16 +48,40 @@ static void __init taa_select_mitigation(void); + static void __init mmio_select_mitigation(void); + static void __init srbds_select_mitigation(void); + +-/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ ++/* The base value of the SPEC_CTRL MSR without task-specific bits set */ + u64 x86_spec_ctrl_base; + EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); ++ ++/* The current value of the SPEC_CTRL MSR with task-specific bits set */ ++DEFINE_PER_CPU(u64, x86_spec_ctrl_current); ++EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); ++ + static DEFINE_MUTEX(spec_ctrl_mutex); + + /* +- * The vendor and possibly platform specific bits which can be modified in +- * x86_spec_ctrl_base. ++ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ ++ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). + */ +-static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; ++void write_spec_ctrl_current(u64 val, bool force) ++{ ++ if (this_cpu_read(x86_spec_ctrl_current) == val) ++ return; ++ ++ this_cpu_write(x86_spec_ctrl_current, val); ++ ++ /* ++ * When KERNEL_IBRS this MSR is written on return-to-user, unless ++ * forced the update can be delayed until that time. ++ */ ++ if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) ++ wrmsrl(MSR_IA32_SPEC_CTRL, val); ++} ++ ++u64 spec_ctrl_current(void) ++{ ++ return this_cpu_read(x86_spec_ctrl_current); ++} ++EXPORT_SYMBOL_GPL(spec_ctrl_current); + + /* + * AMD specific MSR info for Speculative Store Bypass control. 
+@@ -105,13 +131,21 @@ void __init check_bugs(void) + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + +- /* Allow STIBP in MSR_SPEC_CTRL if supported */ +- if (boot_cpu_has(X86_FEATURE_STIBP)) +- x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; +- + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); + spectre_v2_select_mitigation(); ++ /* ++ * retbleed_select_mitigation() relies on the state set by ++ * spectre_v2_select_mitigation(); specifically it wants to know about ++ * spectre_v2=ibrs. ++ */ ++ retbleed_select_mitigation(); ++ /* ++ * spectre_v2_user_select_mitigation() relies on the state set by ++ * retbleed_select_mitigation(); specifically the STIBP selection is ++ * forced for UNRET. ++ */ ++ spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + md_clear_select_mitigation(); +@@ -151,31 +185,17 @@ void __init check_bugs(void) + #endif + } + ++/* ++ * NOTE: For VMX, this function is not called in the vmexit path. ++ * It uses vmx_spec_ctrl_restore_host() instead. ++ */ + void + x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) + { +- u64 msrval, guestval, hostval = x86_spec_ctrl_base; ++ u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); + struct thread_info *ti = current_thread_info(); + +- /* Is MSR_SPEC_CTRL implemented ? */ + if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { +- /* +- * Restrict guest_spec_ctrl to supported values. Clear the +- * modifiable bits in the host base value and or the +- * modifiable bits from the guest value. +- */ +- guestval = hostval & ~x86_spec_ctrl_mask; +- guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; +- +- /* SSBD controlled in MSR_SPEC_CTRL */ +- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || +- static_cpu_has(X86_FEATURE_AMD_SSBD)) +- hostval |= ssbd_tif_to_spec_ctrl(ti->flags); +- +- /* Conditional STIBP enabled? */ +- if (static_branch_unlikely(&switch_to_cond_stibp)) +- hostval |= stibp_tif_to_spec_ctrl(ti->flags); +- + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); +@@ -705,12 +725,103 @@ static int __init nospectre_v1_cmdline(char *str) + } + early_param("nospectre_v1", nospectre_v1_cmdline); + +-#undef pr_fmt +-#define pr_fmt(fmt) "Spectre V2 : " fmt +- + static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = + SPECTRE_V2_NONE; + ++#undef pr_fmt ++#define pr_fmt(fmt) "RETBleed: " fmt ++ ++enum retbleed_mitigation { ++ RETBLEED_MITIGATION_NONE, ++ RETBLEED_MITIGATION_IBRS, ++ RETBLEED_MITIGATION_EIBRS, ++}; ++ ++enum retbleed_mitigation_cmd { ++ RETBLEED_CMD_OFF, ++ RETBLEED_CMD_AUTO, ++}; ++ ++const char * const retbleed_strings[] = { ++ [RETBLEED_MITIGATION_NONE] = "Vulnerable", ++ [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", ++ [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", ++}; ++ ++static enum retbleed_mitigation retbleed_mitigation __ro_after_init = ++ RETBLEED_MITIGATION_NONE; ++static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = ++ RETBLEED_CMD_AUTO; ++ ++static int __init retbleed_parse_cmdline(char *str) ++{ ++ if (!str) ++ return -EINVAL; ++ ++ if (!strcmp(str, "off")) ++ retbleed_cmd = RETBLEED_CMD_OFF; ++ else if (!strcmp(str, "auto")) ++ retbleed_cmd = RETBLEED_CMD_AUTO; ++ else ++ pr_err("Unknown retbleed option (%s). 
Defaulting to 'auto'\n", str); ++ ++ return 0; ++} ++early_param("retbleed", retbleed_parse_cmdline); ++ ++#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" ++#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n" ++#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" ++ ++static void __init retbleed_select_mitigation(void) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) ++ return; ++ ++ switch (retbleed_cmd) { ++ case RETBLEED_CMD_OFF: ++ return; ++ ++ case RETBLEED_CMD_AUTO: ++ default: ++ /* ++ * The Intel mitigation (IBRS) was already selected in ++ * spectre_v2_select_mitigation(). ++ */ ++ ++ break; ++ } ++ ++ switch (retbleed_mitigation) { ++ default: ++ break; ++ } ++ ++ /* ++ * Let IBRS trump all on Intel without affecting the effects of the ++ * retbleed= cmdline option. ++ */ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { ++ switch (spectre_v2_enabled) { ++ case SPECTRE_V2_IBRS: ++ retbleed_mitigation = RETBLEED_MITIGATION_IBRS; ++ break; ++ case SPECTRE_V2_EIBRS: ++ case SPECTRE_V2_EIBRS_RETPOLINE: ++ case SPECTRE_V2_EIBRS_LFENCE: ++ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; ++ break; ++ default: ++ pr_err(RETBLEED_INTEL_MSG); ++ } ++ } ++ ++ pr_info("%s\n", retbleed_strings[retbleed_mitigation]); ++} ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "Spectre V2 : " fmt ++ + static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; + static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = +@@ -740,6 +851,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; } + #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" + #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" ++#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" + + #ifdef CONFIG_BPF_SYSCALL + void unpriv_ebpf_notify(int new_state) +@@ -781,6 +893,7 @@ enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, ++ SPECTRE_V2_CMD_IBRS, + }; + + enum spectre_v2_user_cmd { +@@ -821,13 +934,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) + pr_info("spectre_v2_user=%s forced on command line.\n", reason); + } + ++static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; ++ + static enum spectre_v2_user_cmd __init +-spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) ++spectre_v2_parse_user_cmdline(void) + { + char arg[20]; + int ret, i; + +- switch (v2_cmd) { ++ switch (spectre_v2_cmd) { + case SPECTRE_V2_CMD_NONE: + return SPECTRE_V2_USER_CMD_NONE; + case SPECTRE_V2_CMD_FORCE: +@@ -853,15 +968,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) + return SPECTRE_V2_USER_CMD_AUTO; + } + +-static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) ++static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) + { +- return (mode == 
SPECTRE_V2_EIBRS || +- mode == SPECTRE_V2_EIBRS_RETPOLINE || +- mode == SPECTRE_V2_EIBRS_LFENCE); ++ return mode == SPECTRE_V2_IBRS || ++ mode == SPECTRE_V2_EIBRS || ++ mode == SPECTRE_V2_EIBRS_RETPOLINE || ++ mode == SPECTRE_V2_EIBRS_LFENCE; + } + + static void __init +-spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) ++spectre_v2_user_select_mitigation(void) + { + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; + bool smt_possible = IS_ENABLED(CONFIG_SMP); +@@ -874,7 +990,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + smt_possible = false; + +- cmd = spectre_v2_parse_user_cmdline(v2_cmd); ++ cmd = spectre_v2_parse_user_cmdline(); + switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + goto set_mode; +@@ -922,12 +1038,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) + } + + /* +- * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not +- * required. ++ * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible, ++ * STIBP is not required. + */ + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || +- spectre_v2_in_eibrs_mode(spectre_v2_enabled)) ++ spectre_v2_in_ibrs_mode(spectre_v2_enabled)) + return; + + /* +@@ -952,6 +1068,7 @@ static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", ++ [SPECTRE_V2_IBRS] = "Mitigation: IBRS", + }; + + static const struct { +@@ -969,6 +1086,7 @@ static const struct { + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, ++ { "ibrs", SPECTRE_V2_CMD_IBRS, false }, + }; + + static void __init spec_v2_print_cond(const char *reason, bool secure) +@@ -1031,6 +1149,24 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) + return SPECTRE_V2_CMD_AUTO; + } + ++ if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { ++ pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", ++ mitigation_options[i].option); ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ ++ if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { ++ pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", ++ mitigation_options[i].option); ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ ++ if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { ++ pr_err("%s selected but running as XenPV guest. 
Switching to AUTO select\n", ++ mitigation_options[i].option); ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ + spec_v2_print_cond(mitigation_options[i].option, + mitigation_options[i].secure); + return cmd; +@@ -1046,6 +1182,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) + return SPECTRE_V2_RETPOLINE; + } + ++/* Disable in-kernel use of non-RSB RET predictors */ ++static void __init spec_ctrl_disable_kernel_rrsba(void) ++{ ++ u64 ia32_cap; ++ ++ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) ++ return; ++ ++ ia32_cap = x86_read_arch_cap_msr(); ++ ++ if (ia32_cap & ARCH_CAP_RRSBA) { ++ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; ++ write_spec_ctrl_current(x86_spec_ctrl_base, true); ++ } ++} ++ + static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) + { + /* +@@ -1070,10 +1222,6 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ + */ + switch (mode) { + case SPECTRE_V2_NONE: +- /* These modes already fill RSB at vmexit */ +- case SPECTRE_V2_LFENCE: +- case SPECTRE_V2_RETPOLINE: +- case SPECTRE_V2_EIBRS_RETPOLINE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: +@@ -1083,6 +1231,14 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; ++ ++ case SPECTRE_V2_EIBRS_RETPOLINE: ++ case SPECTRE_V2_RETPOLINE: ++ case SPECTRE_V2_LFENCE: ++ case SPECTRE_V2_IBRS: ++ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); ++ pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); ++ return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); +@@ -1113,6 +1269,14 @@ static void __init spectre_v2_select_mitigation(void) + break; + } + ++ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && ++ retbleed_cmd != RETBLEED_CMD_OFF && ++ boot_cpu_has(X86_FEATURE_IBRS) && ++ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { ++ mode = SPECTRE_V2_IBRS; ++ break; ++ } ++ + mode = spectre_v2_select_retpoline(); + break; + +@@ -1129,6 +1293,10 @@ static void __init spectre_v2_select_mitigation(void) + mode = spectre_v2_select_retpoline(); + break; + ++ case SPECTRE_V2_CMD_IBRS: ++ mode = SPECTRE_V2_IBRS; ++ break; ++ + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; +@@ -1145,10 +1313,9 @@ static void __init spectre_v2_select_mitigation(void) + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + +- if (spectre_v2_in_eibrs_mode(mode)) { +- /* Force it so VMEXIT will restore correctly */ ++ if (spectre_v2_in_ibrs_mode(mode)) { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; +- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); ++ write_spec_ctrl_current(x86_spec_ctrl_base, true); + } + + switch (mode) { +@@ -1156,6 +1323,12 @@ static void __init spectre_v2_select_mitigation(void) + case SPECTRE_V2_EIBRS: + break; + ++ case SPECTRE_V2_IBRS: ++ setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); ++ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) ++ pr_warn(SPECTRE_V2_IBRS_PERF_MSG); ++ break; ++ + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); +@@ -1167,16 +1340,56 @@ static void __init spectre_v2_select_mitigation(void) + break; + } + ++ /* ++ * Disable alternate RSB predictions in kernel when indirect CALLs and ++ * JMPs gets protection against BHI and Intramode-BTI, but RET ++ * prediction from a non-RSB predictor is still a risk. 
++ */ ++ if (mode == SPECTRE_V2_EIBRS_LFENCE || ++ mode == SPECTRE_V2_EIBRS_RETPOLINE || ++ mode == SPECTRE_V2_RETPOLINE) ++ spec_ctrl_disable_kernel_rrsba(); ++ + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); + + /* +- * If spectre v2 protection has been enabled, unconditionally fill +- * RSB during a context switch; this protects against two independent +- * issues: ++ * If Spectre v2 protection has been enabled, fill the RSB during a ++ * context switch. In general there are two types of RSB attacks ++ * across context switches, for which the CALLs/RETs may be unbalanced. ++ * ++ * 1) RSB underflow ++ * ++ * Some Intel parts have "bottomless RSB". When the RSB is empty, ++ * speculated return targets may come from the branch predictor, ++ * which could have a user-poisoned BTB or BHB entry. ++ * ++ * AMD has it even worse: *all* returns are speculated from the BTB, ++ * regardless of the state of the RSB. ++ * ++ * When IBRS or eIBRS is enabled, the "user -> kernel" attack ++ * scenario is mitigated by the IBRS branch prediction isolation ++ * properties, so the RSB buffer filling wouldn't be necessary to ++ * protect against this type of attack. ++ * ++ * The "user -> user" attack scenario is mitigated by RSB filling. + * +- * - RSB underflow (and switch to BTB) on Skylake+ +- * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs ++ * 2) Poisoned RSB entry ++ * ++ * If the 'next' in-kernel return stack is shorter than 'prev', ++ * 'next' could be tricked into speculating with a user-poisoned RSB ++ * entry. ++ * ++ * The "user -> kernel" attack scenario is mitigated by SMEP and ++ * eIBRS. ++ * ++ * The "user -> user" scenario, also known as SpectreBHB, requires ++ * RSB clearing. ++ * ++ * So to mitigate all cases, unconditionally fill RSB on context ++ * switches. ++ * ++ * FIXME: Is this pointless for retbleed-affected AMD? + */ + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); +@@ -1184,28 +1397,29 @@ static void __init spectre_v2_select_mitigation(void) + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + + /* +- * Retpoline means the kernel is safe because it has no indirect +- * branches. Enhanced IBRS protects firmware too, so, enable restricted +- * speculation around firmware calls only when Enhanced IBRS isn't +- * supported. ++ * Retpoline protects the kernel, but doesn't protect firmware. IBRS ++ * and Enhanced IBRS protect firmware too, so enable IBRS around ++ * firmware calls only when IBRS / Enhanced IBRS aren't otherwise ++ * enabled. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. 
+ */ +- if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { ++ if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); + } + + /* Set up IBPB and STIBP depending on the general spectre V2 command */ +- spectre_v2_user_select_mitigation(cmd); ++ spectre_v2_cmd = cmd; + } + + static void update_stibp_msr(void * __unused) + { +- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); ++ u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); ++ write_spec_ctrl_current(val, true); + } + + /* Update x86_spec_ctrl_base in case SMT state changed. */ +@@ -1421,16 +1635,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) + break; + } + +- /* +- * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper +- * bit in the mask to allow guests to use the mitigation even in the +- * case where the host does not enable it. +- */ +- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || +- static_cpu_has(X86_FEATURE_AMD_SSBD)) { +- x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; +- } +- + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. +@@ -1448,7 +1652,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) + x86_amd_ssb_disable(); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; +- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); ++ write_spec_ctrl_current(x86_spec_ctrl_base, true); + } + } + +@@ -1665,7 +1869,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) + void x86_spec_ctrl_setup_ap(void) + { + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) +- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); ++ write_spec_ctrl_current(x86_spec_ctrl_base, true); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +@@ -1900,7 +2104,7 @@ static ssize_t mmio_stale_data_show_state(char *buf) + + static char *stibp_state(void) + { +- if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) ++ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) + return ""; + + switch (spectre_v2_user_stibp) { +@@ -1934,7 +2138,7 @@ static char *pbrsb_eibrs_state(void) + { + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || +- boot_cpu_has(X86_FEATURE_RETPOLINE)) ++ boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; +@@ -1970,6 +2174,11 @@ static ssize_t srbds_show_state(char *buf) + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); + } + ++static ssize_t retbleed_show_state(char *buf) ++{ ++ return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); ++} ++ + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) + { +@@ -2016,6 +2225,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + ++ case X86_BUG_RETBLEED: ++ return retbleed_show_state(buf); ++ + default: + break; + } +@@ -2075,4 +2287,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + } ++ ++ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); ++} + #endif +diff --git 
a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 59413e741ecf1..5e1e32f1086ba 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1102,48 +1102,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + {} + }; + ++#define VULNBL(vendor, family, model, blacklist) \ ++ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) ++ + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + ++#define VULNBL_AMD(family, blacklist) \ ++ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) ++ ++#define VULNBL_HYGON(family, blacklist) \ ++ VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) ++ + #define SRBDS BIT(0) + /* CPU is affected by X86_BUG_MMIO_STALE_DATA */ + #define MMIO BIT(1) + /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ + #define MMIO_SBDS BIT(2) ++/* CPU is affected by RETbleed, speculating where you would not expect it */ ++#define RETBLEED BIT(3) + + static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), +- VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO), +- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO), ++ VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), ++ VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), +- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), +- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), +- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) | +- BIT(7) | BIT(0xB), MMIO), +- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), +- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), +- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO), +- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS), +- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO), +- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS), +- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS), +- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO), +- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO), +- VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS), +- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), +- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO), +- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), +- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO), +- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), ++ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), ++ 
VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), ++ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), ++ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), ++ VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), ++ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), +- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS), ++ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), ++ ++ VULNBL_AMD(0x15, RETBLEED), ++ VULNBL_AMD(0x16, RETBLEED), ++ VULNBL_AMD(0x17, RETBLEED), ++ VULNBL_HYGON(0x18, RETBLEED), + {} + }; + +@@ -1251,6 +1263,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + ++ if (!cpu_has(c, X86_FEATURE_BTC_NO)) { ++ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) ++ setup_force_cpu_bug(X86_BUG_RETBLEED); ++ } ++ + if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) +diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c +index 2f163e6646b6f..ad6776081e60d 100644 +--- a/arch/x86/kernel/cpu/match.c ++++ b/arch/x86/kernel/cpu/match.c +@@ -16,12 +16,17 @@ + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU +- * { X86_VENDOR_INTEL, 6, 0x12 } +- * or to match a specific CPU feature +- * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } ++ * ++ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, ++ * X86_FEATURE_ANY, NULL); + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, +- * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) ++ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) ++ * ++ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts ++ * for various common selections. The above can be shortened to: ++ * ++ * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86cpu, ...) 
+diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index 53004dbd55c47..a03e309a0ac5f 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -26,6 +26,7 @@ struct cpuid_bit { + static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, ++ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 068715a52ac10..87cfd2ee9ca0d 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -449,7 +449,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, + } + + if (updmsr) +- wrmsrl(MSR_IA32_SPEC_CTRL, msr); ++ write_spec_ctrl_current(msr, false); + } + + static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 1efcc7d4bc88e..3db407e3c4166 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -47,6 +47,7 @@ + #include <asm/kvm_para.h> + #include <asm/irq_remapping.h> + #include <asm/spec-ctrl.h> ++#include <asm/cpu_device_id.h> + + #include <asm/virtext.h> + #include "trace.h" +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index 34ee4835b0177..a7b62a00913e5 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -11,6 +11,7 @@ + #include "mmu.h" + #include "nested.h" + #include "trace.h" ++#include "vmx.h" + #include "x86.h" + + static bool __read_mostly enable_shadow_vmcs = 1; +@@ -2863,35 +2864,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + +- asm( +- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ +- "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" +- "je 1f \n\t" +- __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" +- "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" +- "1: \n\t" +- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ +- +- /* Check if vmlaunch or vmresume is needed */ +- "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" +- +- /* +- * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set +- * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail +- * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the +- * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. 
+- */ +- "call vmx_vmenter\n\t" +- +- CC_SET(be) +- : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) +- : [HOST_RSP]"r"((unsigned long)HOST_RSP), +- [loaded_vmcs]"r"(vmx->loaded_vmcs), +- [launched]"i"(offsetof(struct loaded_vmcs, launched)), +- [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), +- [wordsize]"i"(sizeof(ulong)) +- : "memory" +- ); ++ vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, ++ __vmx_vcpu_run_flags(vmx)); + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); +diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h +new file mode 100644 +index 0000000000000..edc3f16cc1896 +--- /dev/null ++++ b/arch/x86/kvm/vmx/run_flags.h +@@ -0,0 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __KVM_X86_VMX_RUN_FLAGS_H ++#define __KVM_X86_VMX_RUN_FLAGS_H ++ ++#define VMX_RUN_VMRESUME (1 << 0) ++#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) ++ ++#endif /* __KVM_X86_VMX_RUN_FLAGS_H */ +diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S +index 946d9205c3b6d..2850670c38bb0 100644 +--- a/arch/x86/kvm/vmx/vmenter.S ++++ b/arch/x86/kvm/vmx/vmenter.S +@@ -4,6 +4,7 @@ + #include <asm/bitsperlong.h> + #include <asm/kvm_vcpu_regs.h> + #include <asm/nospec-branch.h> ++#include "run_flags.h" + + #define WORD_SIZE (BITS_PER_LONG / 8) + +@@ -29,78 +30,12 @@ + + .text + +-/** +- * vmx_vmenter - VM-Enter the current loaded VMCS +- * +- * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME +- * +- * Returns: +- * %RFLAGS.CF is set on VM-Fail Invalid +- * %RFLAGS.ZF is set on VM-Fail Valid +- * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit +- * +- * Note that VMRESUME/VMLAUNCH fall-through and return directly if +- * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump +- * to vmx_vmexit. +- */ +-ENTRY(vmx_vmenter) +- /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ +- je 2f +- +-1: vmresume +- ret +- +-2: vmlaunch +- ret +- +-3: cmpb $0, kvm_rebooting +- je 4f +- ret +-4: ud2 +- +- .pushsection .fixup, "ax" +-5: jmp 3b +- .popsection +- +- _ASM_EXTABLE(1b, 5b) +- _ASM_EXTABLE(2b, 5b) +- +-ENDPROC(vmx_vmenter) +- +-/** +- * vmx_vmexit - Handle a VMX VM-Exit +- * +- * Returns: +- * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit +- * +- * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump +- * here after hardware loads the host's state, i.e. this is the destination +- * referred to by VMCS.HOST_RIP. +- */ +-ENTRY(vmx_vmexit) +-#ifdef CONFIG_RETPOLINE +- ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE +- /* Preserve guest's RAX, it's used to stuff the RSB. */ +- push %_ASM_AX +- +- /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ +- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +- +- /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. 
*/ +- or $1, %_ASM_AX +- +- pop %_ASM_AX +-.Lvmexit_skip_rsb: +-#endif +- ISSUE_UNBALANCED_RET_GUARD X86_FEATURE_RSB_VMEXIT_LITE +- ret +-ENDPROC(vmx_vmexit) +- + /** + * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode +- * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) ++ * @vmx: struct vcpu_vmx * + * @regs: unsigned long * (to guest registers) +- * @launched: %true if the VMCS has been launched ++ * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH ++ * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl + * + * Returns: + * 0 on VM-Exit, 1 on VM-Fail +@@ -119,24 +54,29 @@ ENTRY(__vmx_vcpu_run) + #endif + push %_ASM_BX + ++ /* Save @vmx for SPEC_CTRL handling */ ++ push %_ASM_ARG1 ++ ++ /* Save @flags for SPEC_CTRL handling */ ++ push %_ASM_ARG3 ++ + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + push %_ASM_ARG2 + +- /* Copy @launched to BL, _ASM_ARG3 is volatile. */ ++ /* Copy @flags to BL, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3B, %bl + +- /* Adjust RSP to account for the CALL to vmx_vmenter(). */ +- lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 ++ lea (%_ASM_SP), %_ASM_ARG2 + call vmx_update_host_rsp + + /* Load @regs to RAX. */ + mov (%_ASM_SP), %_ASM_AX + + /* Check if vmlaunch or vmresume is needed */ +- cmpb $0, %bl ++ testb $VMX_RUN_VMRESUME, %bl + + /* Load guest registers. Don't clobber flags. */ + mov VCPU_RBX(%_ASM_AX), %_ASM_BX +@@ -158,11 +98,25 @@ ENTRY(__vmx_vcpu_run) + /* Load guest RAX. This kills the @regs pointer! */ + mov VCPU_RAX(%_ASM_AX), %_ASM_AX + +- /* Enter guest mode */ +- call vmx_vmenter ++ /* Check EFLAGS.ZF from 'testb' above */ ++ jz .Lvmlaunch + +- /* Jump on VM-Fail. */ +- jbe 2f ++/* ++ * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at ++ * the 'vmx_vmexit' label below. ++ */ ++.Lvmresume: ++ vmresume ++ jmp .Lvmfail ++ ++.Lvmlaunch: ++ vmlaunch ++ jmp .Lvmfail ++ ++ _ASM_EXTABLE(.Lvmresume, .Lfixup) ++ _ASM_EXTABLE(.Lvmlaunch, .Lfixup) ++ ++SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) + + /* Temporarily save guest's RAX. */ + push %_ASM_AX +@@ -189,19 +143,21 @@ ENTRY(__vmx_vcpu_run) + mov %r15, VCPU_R15(%_ASM_AX) + #endif + +- /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ +- xor %eax, %eax ++ /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */ ++ xor %ebx, %ebx + ++.Lclear_regs: + /* +- * Clear all general purpose registers except RSP and RAX to prevent ++ * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RAX are exempt as RSP is restored by hardware during +- * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. ++ * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return ++ * value. + */ +-1: xor %ebx, %ebx ++ xor %eax, %eax + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi +@@ -220,8 +176,32 @@ ENTRY(__vmx_vcpu_run) + + /* "POP" @regs. */ + add $WORD_SIZE, %_ASM_SP +- pop %_ASM_BX + ++ /* ++ * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before ++ * the first unbalanced RET after vmexit! 
++ * ++ * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB ++ * entries and (in some cases) RSB underflow. ++ * ++ * eIBRS has its own protection against poisoned RSB, so it doesn't ++ * need the RSB filling sequence. But it does need to be enabled, and a ++ * single call to retire, before the first unbalanced RET. ++ */ ++ ++ FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ ++ X86_FEATURE_RSB_VMEXIT_LITE ++ ++ ++ pop %_ASM_ARG2 /* @flags */ ++ pop %_ASM_ARG1 /* @vmx */ ++ ++ call vmx_spec_ctrl_restore_host ++ ++ /* Put return value in AX */ ++ mov %_ASM_BX, %_ASM_AX ++ ++ pop %_ASM_BX + #ifdef CONFIG_X86_64 + pop %r12 + pop %r13 +@@ -234,11 +214,20 @@ ENTRY(__vmx_vcpu_run) + pop %_ASM_BP + ret + +- /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ +-2: mov $1, %eax +- jmp 1b ++.Lfixup: ++ cmpb $0, kvm_rebooting ++ jne .Lvmfail ++ ud2 ++.Lvmfail: ++ /* VM-Fail: set return value to 1 */ ++ mov $1, %_ASM_BX ++ jmp .Lclear_regs ++ + ENDPROC(__vmx_vcpu_run) + ++ ++.section .text, "ax" ++ + /** + * vmread_error_trampoline - Trampoline from inline asm to vmread_error() + * @field: VMCS field encoding that failed +diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c +index 4bd1bf6214eea..d522c9de41df9 100644 +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -31,6 +31,7 @@ + #include <asm/apic.h> + #include <asm/asm.h> + #include <asm/cpu.h> ++#include <asm/cpu_device_id.h> + #include <asm/debugreg.h> + #include <asm/desc.h> + #include <asm/fpu/internal.h> +@@ -358,9 +359,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) + if (!vmx->disable_fb_clear) + return; + +- rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); ++ msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr |= FB_CLEAR_DIS; +- wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); ++ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + /* Cache the MSR value to avoid reading it later */ + vmx->msr_ia32_mcu_opt_ctrl = msr; + } +@@ -371,7 +372,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) + return; + + vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; +- wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); ++ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + } + + static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +@@ -862,6 +863,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) + return true; + } + ++unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) ++{ ++ unsigned int flags = 0; ++ ++ if (vmx->loaded_vmcs->launched) ++ flags |= VMX_RUN_VMRESUME; ++ ++ /* ++ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free ++ * to change it directly without causing a vmexit. In that case read ++ * it after vmexit and store it in vmx->spec_ctrl. 
++ */ ++ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) ++ flags |= VMX_RUN_SAVE_SPEC_CTRL; ++ ++ return flags; ++} ++ + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) + { +@@ -6539,7 +6558,30 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) + } + } + +-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); ++void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, ++ unsigned int flags) ++{ ++ u64 hostval = this_cpu_read(x86_spec_ctrl_current); ++ ++ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) ++ return; ++ ++ if (flags & VMX_RUN_SAVE_SPEC_CTRL) ++ vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); ++ ++ /* ++ * If the guest/host SPEC_CTRL values differ, restore the host value. ++ * ++ * For legacy IBRS, the IBRS bit always needs to be written after ++ * transitioning from a less privileged predictor mode, regardless of ++ * whether the guest/host values differ. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || ++ vmx->spec_ctrl != hostval) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); ++ ++ barrier_nospec(); ++} + + static void vmx_vcpu_run(struct kvm_vcpu *vcpu) + { +@@ -6628,32 +6670,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) + write_cr2(vcpu->arch.cr2); + + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, +- vmx->loaded_vmcs->launched); ++ __vmx_vcpu_run_flags(vmx)); + + vcpu->arch.cr2 = read_cr2(); + + vmx_enable_fb_clear(vmx); + +- /* +- * We do not use IBRS in the kernel. If this vCPU has used the +- * SPEC_CTRL MSR it may have left it on; save the value and +- * turn it off. This is much more efficient than blindly adding +- * it to the atomic save/restore list. Especially as the former +- * (Saving guest MSRs on vmexit) doesn't even exist in KVM. +- * +- * For non-nested case: +- * If the L01 MSR bitmap does not intercept the MSR, then we need to +- * save it. +- * +- * For nested case: +- * If the L02 MSR bitmap does not intercept the MSR, then we need to +- * save it. 
+- */ +- if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) +- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); +- +- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); +- + /* All fields are clean at this point */ + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_clean_fields |= +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 7a3362ab59867..4d5be4610af84 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -10,6 +10,7 @@ + #include "capabilities.h" + #include "ops.h" + #include "vmcs.h" ++#include "run_flags.h" + + extern const u32 vmx_msr_index[]; + extern u64 host_efer; +@@ -336,6 +337,10 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); + void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); + void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); ++void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); ++unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); ++bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, ++ unsigned int flags); + + #define POSTED_INTR_ON 0 + #define POSTED_INTR_SN 1 +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index d0b297583df88..c431a34522d6c 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -10329,9 +10329,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) + } + EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); + +-bool kvm_arch_has_assigned_device(struct kvm *kvm) ++bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) + { +- return atomic_read(&kvm->arch.assigned_device_count); ++ return arch_atomic_read(&kvm->arch.assigned_device_count); + } + EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); + +diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c +index 9b5edf1dfe9e9..7000c836951c5 100644 +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -574,6 +574,12 @@ ssize_t __weak cpu_show_mmio_stale_data(struct device *dev, + return sysfs_emit(buf, "Not affected\n"); + } + ++ssize_t __weak cpu_show_retbleed(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "Not affected\n"); ++} ++ + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); +@@ -584,6 +590,7 @@ static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); + static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); + static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); + static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); ++static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); + + static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, +@@ -596,6 +603,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_itlb_multihit.attr, + &dev_attr_srbds.attr, + &dev_attr_mmio_stale_data.attr, ++ &dev_attr_retbleed.attr, + NULL + }; + +diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c +index 4195834a45912..cf7ebe3bd1ad2 100644 +--- a/drivers/cpufreq/acpi-cpufreq.c ++++ b/drivers/cpufreq/acpi-cpufreq.c +@@ -30,6 +30,7 @@ + #include <asm/msr.h> + #include <asm/processor.h> + #include <asm/cpufeature.h> ++#include <asm/cpu_device_id.h> + + MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); + MODULE_DESCRIPTION("ACPI Processor P-States Driver"); +diff --git 
a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c +index e2df9d1121063..5107cbe2d64dd 100644 +--- a/drivers/cpufreq/amd_freq_sensitivity.c ++++ b/drivers/cpufreq/amd_freq_sensitivity.c +@@ -18,6 +18,7 @@ + + #include <asm/msr.h> + #include <asm/cpufeature.h> ++#include <asm/cpu_device_id.h> + + #include "cpufreq_ondemand.h" + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +index d8687868407de..b588e0e409e72 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +@@ -35,7 +35,6 @@ + #include <linux/pci.h> + #include <linux/pm_runtime.h> + #include <drm/drm_crtc_helper.h> +-#include <drm/drm_damage_helper.h> + #include <drm/drm_edid.h> + #include <drm/drm_gem_framebuffer_helper.h> + #include <drm/drm_fb_helper.h> +@@ -496,7 +495,6 @@ bool amdgpu_display_ddc_probe(struct amdgpu_connector *amdgpu_connector, + static const struct drm_framebuffer_funcs amdgpu_fb_funcs = { + .destroy = drm_gem_fb_destroy, + .create_handle = drm_gem_fb_create_handle, +- .dirty = drm_atomic_helper_dirtyfb, + }; + + uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev, +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 347b08b56042f..63b2212262618 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -46,11 +46,13 @@ + #include <linux/tick.h> + #include <trace/events/power.h> + #include <linux/sched.h> ++#include <linux/sched/smt.h> + #include <linux/notifier.h> + #include <linux/cpu.h> + #include <linux/moduleparam.h> + #include <asm/cpu_device_id.h> + #include <asm/intel-family.h> ++#include <asm/nospec-branch.h> + #include <asm/mwait.h> + #include <asm/msr.h> + +@@ -97,6 +99,12 @@ static struct cpuidle_state *cpuidle_state_table; + */ + #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 + ++/* ++ * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE ++ * above. ++ */ ++#define CPUIDLE_FLAG_IBRS BIT(16) ++ + /* + * MWAIT takes an 8-bit "hint" in EAX "suggesting" + * the C-state (top nibble) and sub-state (bottom nibble) +@@ -107,6 +115,24 @@ static struct cpuidle_state *cpuidle_state_table; + #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) + #define MWAIT2flg(eax) ((eax & 0xFF) << 24) + ++static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, ++ struct cpuidle_driver *drv, int index) ++{ ++ bool smt_active = sched_smt_active(); ++ u64 spec_ctrl = spec_ctrl_current(); ++ int ret; ++ ++ if (smt_active) ++ wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ ++ ret = intel_idle(dev, drv, index); ++ ++ if (smt_active) ++ wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl); ++ ++ return ret; ++} ++ + /* + * States are indexed by the cstate number, + * which is also the index into the MWAIT hint array. 
+@@ -605,7 +631,7 @@ static struct cpuidle_state skl_cstates[] = { + { + .name = "C6", + .desc = "MWAIT 0x20", +- .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 85, + .target_residency = 200, + .enter = &intel_idle, +@@ -613,7 +639,7 @@ static struct cpuidle_state skl_cstates[] = { + { + .name = "C7s", + .desc = "MWAIT 0x33", +- .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 124, + .target_residency = 800, + .enter = &intel_idle, +@@ -621,7 +647,7 @@ static struct cpuidle_state skl_cstates[] = { + { + .name = "C8", + .desc = "MWAIT 0x40", +- .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 200, + .target_residency = 800, + .enter = &intel_idle, +@@ -629,7 +655,7 @@ static struct cpuidle_state skl_cstates[] = { + { + .name = "C9", + .desc = "MWAIT 0x50", +- .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 480, + .target_residency = 5000, + .enter = &intel_idle, +@@ -637,7 +663,7 @@ static struct cpuidle_state skl_cstates[] = { + { + .name = "C10", + .desc = "MWAIT 0x60", +- .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 890, + .target_residency = 5000, + .enter = &intel_idle, +@@ -666,7 +692,7 @@ static struct cpuidle_state skx_cstates[] = { + { + .name = "C6", + .desc = "MWAIT 0x20", +- .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, + .exit_latency = 133, + .target_residency = 600, + .enter = &intel_idle, +@@ -1370,6 +1396,11 @@ static void __init intel_idle_cpuidle_driver_init(void) + drv->states[drv->state_count] = /* structure copy */ + cpuidle_state_table[cstate]; + ++ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && ++ cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) { ++ drv->states[drv->state_count].enter = intel_idle_ibrs; ++ } ++ + drv->state_count += 1; + } + +diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c +index 510ca69746042..c83ff610ecb6c 100644 +--- a/fs/xfs/libxfs/xfs_attr.c ++++ b/fs/xfs/libxfs/xfs_attr.c +@@ -1007,7 +1007,7 @@ restart: + * The INCOMPLETE flag means that we will find the "old" + * attr, not the "new" one. + */ +- args->flags |= XFS_ATTR_INCOMPLETE; ++ args->op_flags |= XFS_DA_OP_INCOMPLETE; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; +diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c +index 0c23127347aca..c86ddbf6d105b 100644 +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -2345,8 +2345,8 @@ xfs_attr3_leaf_lookup_int( + * If we are looking for INCOMPLETE entries, show only those. + * If we are looking for complete entries, show only those. 
+ */ +- if ((args->flags & XFS_ATTR_INCOMPLETE) != +- (entry->flags & XFS_ATTR_INCOMPLETE)) { ++ if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != ++ !!(entry->flags & XFS_ATTR_INCOMPLETE)) { + continue; + } + if (entry->flags & XFS_ATTR_LOCAL) { +diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h +index 7b74e18becff7..38c05d6ae2aa4 100644 +--- a/fs/xfs/libxfs/xfs_attr_leaf.h ++++ b/fs/xfs/libxfs/xfs_attr_leaf.h +@@ -17,13 +17,27 @@ struct xfs_inode; + struct xfs_trans; + + /* +- * Used to keep a list of "remote value" extents when unlinking an inode. ++ * Incore version of the attribute leaf header. + */ +-typedef struct xfs_attr_inactive_list { +- xfs_dablk_t valueblk; /* block number of value bytes */ +- int valuelen; /* number of bytes in value */ +-} xfs_attr_inactive_list_t; +- ++struct xfs_attr3_icleaf_hdr { ++ uint32_t forw; ++ uint32_t back; ++ uint16_t magic; ++ uint16_t count; ++ uint16_t usedbytes; ++ /* ++ * Firstused is 32-bit here instead of 16-bit like the on-disk variant ++ * to support maximum fsb size of 64k without overflow issues throughout ++ * the attr code. Instead, the overflow condition is handled on ++ * conversion to/from disk. ++ */ ++ uint32_t firstused; ++ __u8 holes; ++ struct { ++ uint16_t base; ++ uint16_t size; ++ } freemap[XFS_ATTR_LEAF_MAPSIZE]; ++}; + + /*======================================================================== + * Function prototypes for the kernel. +diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c +index 3e39b7d40f256..de9096b8a47c6 100644 +--- a/fs/xfs/libxfs/xfs_attr_remote.c ++++ b/fs/xfs/libxfs/xfs_attr_remote.c +@@ -24,6 +24,23 @@ + + #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + ++/* ++ * Remote Attribute Values ++ * ======================= ++ * ++ * Remote extended attribute values are conceptually simple -- they're written ++ * to data blocks mapped by an inode's attribute fork, and they have an upper ++ * size limit of 64k. Setting a value does not involve the XFS log. ++ * ++ * However, on a v5 filesystem, maximally sized remote attr values require one ++ * block more than 64k worth of space to hold both the remote attribute value ++ * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; ++ * on a 64k block filesystem, this would be a 128k buffer. Note that the log ++ * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). ++ * Therefore, we /must/ ensure that remote attribute value buffers never touch ++ * the logging system and therefore never have a log item. ++ */ ++ + /* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. 
+@@ -400,17 +417,25 @@ xfs_attr_rmtval_get( + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); +- error = xfs_trans_read_buf(mp, args->trans, +- mp->m_ddev_targp, +- dblkno, dblkcnt, 0, &bp, +- &xfs_attr3_rmt_buf_ops); +- if (error) ++ bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, ++ &xfs_attr3_rmt_buf_ops); ++ if (!bp) ++ return -ENOMEM; ++ error = bp->b_error; ++ if (error) { ++ xfs_buf_ioerror_alert(bp, __func__); ++ xfs_buf_relse(bp); ++ ++ /* bad CRC means corrupted metadata */ ++ if (error == -EFSBADCRC) ++ error = -EFSCORRUPTED; + return error; ++ } + + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); +- xfs_trans_brelse(args->trans, bp); ++ xfs_buf_relse(bp); + if (error) + return error; + +@@ -551,6 +576,32 @@ xfs_attr_rmtval_set( + return 0; + } + ++/* Mark stale any incore buffers for the remote value. */ ++int ++xfs_attr_rmtval_stale( ++ struct xfs_inode *ip, ++ struct xfs_bmbt_irec *map, ++ xfs_buf_flags_t incore_flags) ++{ ++ struct xfs_mount *mp = ip->i_mount; ++ struct xfs_buf *bp; ++ ++ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ++ ++ ASSERT((map->br_startblock != DELAYSTARTBLOCK) && ++ (map->br_startblock != HOLESTARTBLOCK)); ++ ++ bp = xfs_buf_incore(mp->m_ddev_targp, ++ XFS_FSB_TO_DADDR(mp, map->br_startblock), ++ XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); ++ if (bp) { ++ xfs_buf_stale(bp); ++ xfs_buf_relse(bp); ++ } ++ ++ return 0; ++} ++ + /* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. +@@ -559,7 +610,6 @@ int + xfs_attr_rmtval_remove( + struct xfs_da_args *args) + { +- struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; +@@ -574,9 +624,6 @@ xfs_attr_rmtval_remove( + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; +- struct xfs_buf *bp; +- xfs_daddr_t dblkno; +- int dblkcnt; + int nmap; + + /* +@@ -588,21 +635,9 @@ xfs_attr_rmtval_remove( + if (error) + return error; + ASSERT(nmap == 1); +- ASSERT((map.br_startblock != DELAYSTARTBLOCK) && +- (map.br_startblock != HOLESTARTBLOCK)); +- +- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), +- dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); +- +- /* +- * If the "remote" value is in the cache, remove it. 
+- */ +- bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); +- if (bp) { +- xfs_buf_stale(bp); +- xfs_buf_relse(bp); +- bp = NULL; +- } ++ error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); ++ if (error) ++ return error; + + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; +diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h +index 9d20b66ad379e..6fb4572845ce8 100644 +--- a/fs/xfs/libxfs/xfs_attr_remote.h ++++ b/fs/xfs/libxfs/xfs_attr_remote.h +@@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + int xfs_attr_rmtval_get(struct xfs_da_args *args); + int xfs_attr_rmtval_set(struct xfs_da_args *args); + int xfs_attr_rmtval_remove(struct xfs_da_args *args); ++int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, ++ xfs_buf_flags_t incore_flags); + + #endif /* __XFS_ATTR_REMOTE_H__ */ +diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h +index ae0bbd20d9caf..588e4674e931f 100644 +--- a/fs/xfs/libxfs/xfs_da_btree.h ++++ b/fs/xfs/libxfs/xfs_da_btree.h +@@ -82,6 +82,7 @@ typedef struct xfs_da_args { + #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ + #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ ++#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ + + #define XFS_DA_OP_FLAGS \ + { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ +@@ -89,7 +90,8 @@ typedef struct xfs_da_args { + { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ + { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ + { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ +- { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } ++ { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ ++ { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } + + /* + * Storage for holding state during Btree searches and split/join ops. +@@ -124,6 +126,19 @@ typedef struct xfs_da_state { + /* for dirv2 extrablk is data */ + } xfs_da_state_t; + ++/* ++ * In-core version of the node header to abstract the differences in the v2 and ++ * v3 disk format of the headers. Callers need to convert to/from disk format as ++ * appropriate. ++ */ ++struct xfs_da3_icnode_hdr { ++ uint32_t forw; ++ uint32_t back; ++ uint16_t magic; ++ uint16_t count; ++ uint16_t level; ++}; ++ + /* + * Utility macros to aid in logging changed structure fields. + */ +diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c +index b1ae572496b69..31bb250c18992 100644 +--- a/fs/xfs/libxfs/xfs_da_format.c ++++ b/fs/xfs/libxfs/xfs_da_format.c +@@ -13,6 +13,7 @@ + #include "xfs_mount.h" + #include "xfs_inode.h" + #include "xfs_dir2.h" ++#include "xfs_dir2_priv.h" + + /* + * Shortform directory ops +diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h +index ae654e06b2fb6..222ee48da5e80 100644 +--- a/fs/xfs/libxfs/xfs_da_format.h ++++ b/fs/xfs/libxfs/xfs_da_format.h +@@ -93,19 +93,6 @@ struct xfs_da3_intnode { + struct xfs_da_node_entry __btree[]; + }; + +-/* +- * In-core version of the node header to abstract the differences in the v2 and +- * v3 disk format of the headers. Callers need to convert to/from disk format as +- * appropriate. +- */ +-struct xfs_da3_icnode_hdr { +- uint32_t forw; +- uint32_t back; +- uint16_t magic; +- uint16_t count; +- uint16_t level; +-}; +- + /* + * Directory version 2. 
+ * +@@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr { + __be32 pad; /* 64 bit alignment */ + }; + +-struct xfs_dir3_icleaf_hdr { +- uint32_t forw; +- uint32_t back; +- uint16_t magic; +- uint16_t count; +- uint16_t stale; +-}; +- + /* + * Leaf block entry. + */ +@@ -520,19 +499,6 @@ struct xfs_dir3_free { + + #define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +-/* +- * In core version of the free block header, abstracted away from on-disk format +- * differences. Use this in the code, and convert to/from the disk version using +- * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. +- */ +-struct xfs_dir3_icfree_hdr { +- uint32_t magic; +- uint32_t firstdb; +- uint32_t nvalid; +- uint32_t nused; +- +-}; +- + /* + * Single block format. + * +@@ -709,29 +675,6 @@ struct xfs_attr3_leafblock { + */ + }; + +-/* +- * incore, neutral version of the attribute leaf header +- */ +-struct xfs_attr3_icleaf_hdr { +- uint32_t forw; +- uint32_t back; +- uint16_t magic; +- uint16_t count; +- uint16_t usedbytes; +- /* +- * firstused is 32-bit here instead of 16-bit like the on-disk variant +- * to support maximum fsb size of 64k without overflow issues throughout +- * the attr code. Instead, the overflow condition is handled on +- * conversion to/from disk. +- */ +- uint32_t firstused; +- __u8 holes; +- struct { +- uint16_t base; +- uint16_t size; +- } freemap[XFS_ATTR_LEAF_MAPSIZE]; +-}; +- + /* + * Special value to represent fs block size in the leaf header firstused field. + * Only used when block size overflows the 2-bytes available on disk. +@@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr { + + /* + * Flags used in the leaf_entry[i].flags field. +- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified +- * on the system call, they are "or"ed together for various operations. + */ + #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ + #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h +index f542447794928..e170792c0acce 100644 +--- a/fs/xfs/libxfs/xfs_dir2.h ++++ b/fs/xfs/libxfs/xfs_dir2.h +@@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry; + struct xfs_dir2_data_hdr; + struct xfs_dir2_data_entry; + struct xfs_dir2_data_unused; ++struct xfs_dir3_icfree_hdr; ++struct xfs_dir3_icleaf_hdr; + + extern struct xfs_name xfs_name_dotdot; + +diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h +index 59f9fb2241a5f..d2eaea663e7f2 100644 +--- a/fs/xfs/libxfs/xfs_dir2_priv.h ++++ b/fs/xfs/libxfs/xfs_dir2_priv.h +@@ -8,6 +8,25 @@ + + struct dir_context; + ++/* ++ * In-core version of the leaf and free block headers to abstract the ++ * differences in the v2 and v3 disk format of the headers. 
++ */ ++struct xfs_dir3_icleaf_hdr { ++ uint32_t forw; ++ uint32_t back; ++ uint16_t magic; ++ uint16_t count; ++ uint16_t stale; ++}; ++ ++struct xfs_dir3_icfree_hdr { ++ uint32_t magic; ++ uint32_t firstdb; ++ uint32_t nvalid; ++ uint32_t nused; ++}; ++ + /* xfs_dir2.c */ + extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h +index c968b60cee15b..28203b626f6a2 100644 +--- a/fs/xfs/libxfs/xfs_format.h ++++ b/fs/xfs/libxfs/xfs_format.h +@@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { + #define BMBT_BLOCKCOUNT_BITLEN 21 + + #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) ++#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) ++ ++/* ++ * bmbt records have a file offset (block) field that is 54 bits wide, so this ++ * is the largest xfs_fileoff_t that we ever expect to see. ++ */ ++#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) + + typedef struct xfs_bmbt_rec { + __be64 l0, l1; +diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c +index 766b1386402a0..9c88203b537b1 100644 +--- a/fs/xfs/xfs_attr_inactive.c ++++ b/fs/xfs/xfs_attr_inactive.c +@@ -25,22 +25,18 @@ + #include "xfs_error.h" + + /* +- * Look at all the extents for this logical region, +- * invalidate any buffers that are incore/in transactions. ++ * Invalidate any incore buffers associated with this remote attribute value ++ * extent. We never log remote attribute value buffers, which means that they ++ * won't be attached to a transaction and are therefore safe to mark stale. ++ * The actual bunmapi will be taken care of later. + */ + STATIC int +-xfs_attr3_leaf_freextent( +- struct xfs_trans **trans, ++xfs_attr3_rmt_stale( + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) + { + struct xfs_bmbt_irec map; +- struct xfs_buf *bp; +- xfs_dablk_t tblkno; +- xfs_daddr_t dblkno; +- int tblkcnt; +- int dblkcnt; + int nmap; + int error; + +@@ -48,47 +44,28 @@ xfs_attr3_leaf_freextent( + * Roll through the "value", invalidating the attribute value's + * blocks. + */ +- tblkno = blkno; +- tblkcnt = blkcnt; +- while (tblkcnt > 0) { ++ while (blkcnt > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; +- error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, ++ error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); +- if (error) { ++ if (error) + return error; +- } + ASSERT(nmap == 1); +- ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* +- * If it's a hole, these are already unmapped +- * so there's nothing to invalidate. ++ * Mark any incore buffers for the remote value as stale. We ++ * never log remote attr value buffers, so the buffer should be ++ * easy to kill. + */ +- if (map.br_startblock != HOLESTARTBLOCK) { +- +- dblkno = XFS_FSB_TO_DADDR(dp->i_mount, +- map.br_startblock); +- dblkcnt = XFS_FSB_TO_BB(dp->i_mount, +- map.br_blockcount); +- bp = xfs_trans_get_buf(*trans, +- dp->i_mount->m_ddev_targp, +- dblkno, dblkcnt, 0); +- if (!bp) +- return -ENOMEM; +- xfs_trans_binval(*trans, bp); +- /* +- * Roll to next transaction. 
+- */ +- error = xfs_trans_roll_inode(trans, dp); +- if (error) +- return error; +- } ++ error = xfs_attr_rmtval_stale(dp, &map, 0); ++ if (error) ++ return error; + +- tblkno += map.br_blockcount; +- tblkcnt -= map.br_blockcount; ++ blkno += map.br_blockcount; ++ blkcnt -= map.br_blockcount; + } + + return 0; +@@ -102,86 +79,45 @@ xfs_attr3_leaf_freextent( + */ + STATIC int + xfs_attr3_leaf_inactive( +- struct xfs_trans **trans, +- struct xfs_inode *dp, +- struct xfs_buf *bp) ++ struct xfs_trans **trans, ++ struct xfs_inode *dp, ++ struct xfs_buf *bp) + { +- struct xfs_attr_leafblock *leaf; +- struct xfs_attr3_icleaf_hdr ichdr; +- struct xfs_attr_leaf_entry *entry; ++ struct xfs_attr3_icleaf_hdr ichdr; ++ struct xfs_mount *mp = bp->b_mount; ++ struct xfs_attr_leafblock *leaf = bp->b_addr; ++ struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; +- struct xfs_attr_inactive_list *list; +- struct xfs_attr_inactive_list *lp; +- int error; +- int count; +- int size; +- int tmp; +- int i; +- struct xfs_mount *mp = bp->b_mount; ++ int error = 0; ++ int i; + +- leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + + /* +- * Count the number of "remote" value extents. ++ * Find the remote value extents for this leaf and invalidate their ++ * incore buffers. + */ +- count = 0; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { +- if (be16_to_cpu(entry->nameidx) && +- ((entry->flags & XFS_ATTR_LOCAL) == 0)) { +- name_rmt = xfs_attr3_leaf_name_remote(leaf, i); +- if (name_rmt->valueblk) +- count++; +- } +- } +- +- /* +- * If there are no "remote" values, we're done. +- */ +- if (count == 0) { +- xfs_trans_brelse(*trans, bp); +- return 0; +- } ++ int blkcnt; + +- /* +- * Allocate storage for a list of all the "remote" value extents. +- */ +- size = count * sizeof(xfs_attr_inactive_list_t); +- list = kmem_alloc(size, 0); +- +- /* +- * Identify each of the "remote" value extents. +- */ +- lp = list; +- entry = xfs_attr3_leaf_entryp(leaf); +- for (i = 0; i < ichdr.count; entry++, i++) { +- if (be16_to_cpu(entry->nameidx) && +- ((entry->flags & XFS_ATTR_LOCAL) == 0)) { +- name_rmt = xfs_attr3_leaf_name_remote(leaf, i); +- if (name_rmt->valueblk) { +- lp->valueblk = be32_to_cpu(name_rmt->valueblk); +- lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, +- be32_to_cpu(name_rmt->valuelen)); +- lp++; +- } +- } +- } +- xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ ++ if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) ++ continue; + +- /* +- * Invalidate each of the "remote" value extents. 
+- */ +- error = 0; +- for (lp = list, i = 0; i < count; i++, lp++) { +- tmp = xfs_attr3_leaf_freextent(trans, dp, +- lp->valueblk, lp->valuelen); ++ name_rmt = xfs_attr3_leaf_name_remote(leaf, i); ++ if (!name_rmt->valueblk) ++ continue; + +- if (error == 0) +- error = tmp; /* save only the 1st errno */ ++ blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, ++ be32_to_cpu(name_rmt->valuelen)); ++ error = xfs_attr3_rmt_stale(dp, ++ be32_to_cpu(name_rmt->valueblk), blkcnt); ++ if (error) ++ goto err; + } + +- kmem_free(list); ++ xfs_trans_brelse(*trans, bp); ++err: + return error; + } + +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index 203065a647652..e41c13ffa5a43 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -187,7 +187,12 @@ xfs_file_dio_aio_read( + + file_accessed(iocb->ki_filp); + +- xfs_ilock(ip, XFS_IOLOCK_SHARED); ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) ++ return -EAGAIN; ++ } else { ++ xfs_ilock(ip, XFS_IOLOCK_SHARED); ++ } + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + +diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c +index 7b72c189cff0b..30202d8c25e4f 100644 +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1513,10 +1513,8 @@ xfs_itruncate_extents_flags( + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp = *tpp; + xfs_fileoff_t first_unmap_block; +- xfs_fileoff_t last_block; + xfs_filblks_t unmap_len; + int error = 0; +- int done = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || +@@ -1536,21 +1534,22 @@ xfs_itruncate_extents_flags( + * the end of the file (in a crash where the space is allocated + * but the inode size is not yet updated), simply remove any + * blocks which show up between the new EOF and the maximum +- * possible file size. If the first block to be removed is +- * beyond the maximum file size (ie it is the same as last_block), +- * then there is nothing to do. ++ * possible file size. ++ * ++ * We have to free all the blocks to the bmbt maximum offset, even if ++ * the page cache can't scale that far. + */ + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); +- last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); +- if (first_unmap_block == last_block) ++ if (first_unmap_block >= XFS_MAX_FILEOFF) { ++ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); + return 0; ++ } + +- ASSERT(first_unmap_block < last_block); +- unmap_len = last_block - first_unmap_block + 1; +- while (!done) { ++ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; ++ while (unmap_len > 0) { + ASSERT(tp->t_firstblock == NULLFSBLOCK); +- error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, +- XFS_ITRUNC_MAX_EXTENTS, &done); ++ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, ++ flags, XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + +@@ -1570,7 +1569,7 @@ xfs_itruncate_extents_flags( + if (whichfork == XFS_DATA_FORK) { + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, +- first_unmap_block, last_block, true); ++ first_unmap_block, XFS_MAX_FILEOFF, true); + if (error) + goto out; + +diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c +index 904d8285c2269..dfbf3f8f1ec86 100644 +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -1544,7 +1544,8 @@ xfs_reflink_clear_inode_flag( + * We didn't find any shared blocks so turn off the reflink flag. + * First, get rid of any leftover CoW mappings. 
+ */ +- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); ++ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, ++ true); + if (error) + return error; + +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 8d1df9f8be071..a3a54a0fbffea 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -512,32 +512,6 @@ xfs_showargs( + seq_puts(m, ",noquota"); + } + +-static uint64_t +-xfs_max_file_offset( +- unsigned int blockshift) +-{ +- unsigned int pagefactor = 1; +- unsigned int bitshift = BITS_PER_LONG - 1; +- +- /* Figure out maximum filesize, on Linux this can depend on +- * the filesystem blocksize (on 32 bit platforms). +- * __block_write_begin does this in an [unsigned] long long... +- * page->index << (PAGE_SHIFT - bbits) +- * So, for page sized blocks (4K on 32 bit platforms), +- * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is +- * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) +- * but for smaller blocksizes it is less (bbits = log2 bsize). +- */ +- +-#if BITS_PER_LONG == 32 +- ASSERT(sizeof(sector_t) == 8); +- pagefactor = PAGE_SIZE; +- bitshift = BITS_PER_LONG; +-#endif +- +- return (((uint64_t)pagefactor) << bitshift) - 1; +-} +- + /* + * Set parameters for inode allocation heuristics, taking into account + * filesystem size and inode32/inode64 mount options; i.e. specifically +@@ -1650,6 +1624,26 @@ xfs_fs_fill_super( + if (error) + goto out_free_sb; + ++ /* ++ * XFS block mappings use 54 bits to store the logical block offset. ++ * This should suffice to handle the maximum file size that the VFS ++ * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT ++ * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes ++ * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON ++ * to check this assertion. ++ * ++ * Avoid integer overflow by comparing the maximum bmbt offset to the ++ * maximum pagecache offset in units of fs blocks. 
++ */ ++ if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { ++ xfs_warn(mp, ++"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", ++ XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), ++ XFS_MAX_FILEOFF); ++ error = -EINVAL; ++ goto out_free_sb; ++ } ++ + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; +@@ -1661,7 +1655,7 @@ xfs_fs_fill_super( + sb->s_magic = XFS_SUPER_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; +- sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_max_links = XFS_MAXLINK; + sb->s_time_gran = 1; + sb->s_time_min = S32_MIN; +diff --git a/include/linux/cpu.h b/include/linux/cpu.h +index 29a6fa2f518db..b42e9c4134475 100644 +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -68,6 +68,8 @@ extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, + extern ssize_t cpu_show_mmio_stale_data(struct device *dev, + struct device_attribute *attr, + char *buf); ++extern ssize_t cpu_show_retbleed(struct device *dev, ++ struct device_attribute *attr, char *buf); + + extern __printf(4, 5) + struct device *cpu_device_create(struct device *parent, void *drvdata, +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index dd4cdad76b18e..ee7d57478a454 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -955,7 +955,7 @@ static inline void kvm_arch_end_assignment(struct kvm *kvm) + { + } + +-static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) ++static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) + { + return false; + } +diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h +index 4c56404e53a76..8265b99d6d55b 100644 +--- a/include/linux/mod_devicetable.h ++++ b/include/linux/mod_devicetable.h +@@ -672,9 +672,7 @@ struct x86_cpu_id { + __u16 steppings; + }; + +-#define X86_FEATURE_MATCH(x) \ +- { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x } +- ++/* Wild cards for x86_cpu_id::vendor, family, model and feature */ + #define X86_VENDOR_ANY 0xffff + #define X86_FAMILY_ANY 0 + #define X86_MODEL_ANY 0 +diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn +index 854e2ba9daa29..6a78afc6f13b4 100644 +--- a/scripts/Makefile.extrawarn ++++ b/scripts/Makefile.extrawarn +@@ -50,6 +50,7 @@ KBUILD_CFLAGS += -Wno-sign-compare + KBUILD_CFLAGS += -Wno-format-zero-length + KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) + KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access) ++KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict) + endif + + endif +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h +index 59f924e92c284..3efaf338d3257 100644 +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -284,7 +284,7 @@ + #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ + #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ + #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ +-#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+ 6) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ ++#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ + + /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ + #define X86_FEATURE_AVX512_BF16 (12*32+ 5) 
/* AVX512 BFLOAT16 instructions */