diff --git a/.mailmap b/.mailmap index 08f28f2999f0..bd9f1025ac44 100644 --- a/.mailmap +++ b/.mailmap @@ -325,6 +325,7 @@ Kenneth W Chen Kenneth Westfield Kiran Gunda Kirill Tkhai +Kishon Vijay Abraham I Konstantin Khlebnikov Konstantin Khlebnikov Koushik @@ -609,6 +610,11 @@ TripleX Chung TripleX Chung Tsuneo Yoshioka Tudor Ambarus +Tvrtko Ursulin +Tvrtko Ursulin +Tvrtko Ursulin +Tvrtko Ursulin +Tvrtko Ursulin Tycho Andersen Tzung-Bi Shih Uwe Kleine-König diff --git a/CREDITS b/CREDITS index df8d6946739f..3c2bb55847c6 100644 --- a/CREDITS +++ b/CREDITS @@ -63,6 +63,11 @@ D: dosfs, LILO, some fd features, ATM, various other hacks here and there S: Buenos Aires S: Argentina +NTFS FILESYSTEM +N: Anton Altaparmakov +E: anton@tuxera.com +D: NTFS filesystem + N: Tim Alpaerts E: tim_alpaerts@toyota-motor-europe.com D: 802.2 class II logical link control layer, diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 2d42998a89a6..3e6407de231c 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -68,7 +68,8 @@ over a rather long period of time, but improvements are always welcome! rcu_read_lock_sched(), or by the appropriate update-side lock. Explicit disabling of preemption (preempt_disable(), for example) can serve as rcu_read_lock_sched(), but is less readable and - prevents lockdep from detecting locking issues. + prevents lockdep from detecting locking issues. Acquiring a + spinlock also enters an RCU read-side critical section. Please note that you *cannot* rely on code known to be built only in non-preemptible kernels. Such code can and will break, @@ -382,16 +383,17 @@ over a rather long period of time, but improvements are always welcome! must use whatever locking or other synchronization is required to safely access and/or modify that data structure. - Do not assume that RCU callbacks will be executed on the same - CPU that executed the corresponding call_rcu() or call_srcu(). - For example, if a given CPU goes offline while having an RCU - callback pending, then that RCU callback will execute on some - surviving CPU. (If this was not the case, a self-spawning RCU - callback would prevent the victim CPU from ever going offline.) - Furthermore, CPUs designated by rcu_nocbs= might well *always* - have their RCU callbacks executed on some other CPUs, in fact, - for some real-time workloads, this is the whole point of using - the rcu_nocbs= kernel boot parameter. + Do not assume that RCU callbacks will be executed on + the same CPU that executed the corresponding call_rcu(), + call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or + call_rcu_tasks_trace(). For example, if a given CPU goes offline + while having an RCU callback pending, then that RCU callback + will execute on some surviving CPU. (If this was not the case, + a self-spawning RCU callback would prevent the victim CPU from + ever going offline.) Furthermore, CPUs designated by rcu_nocbs= + might well *always* have their RCU callbacks executed on some + other CPUs, in fact, for some real-time workloads, this is the + whole point of using the rcu_nocbs= kernel boot parameter. In addition, do not assume that callbacks queued in a given order will be invoked in that order, even if they all are queued on the @@ -444,7 +446,7 @@ over a rather long period of time, but improvements are always welcome! real-time workloads than is synchronize_rcu_expedited(). It is also permissible to sleep in RCU Tasks Trace read-side - critical, which are delimited by rcu_read_lock_trace() and + critical section, which are delimited by rcu_read_lock_trace() and rcu_read_unlock_trace(). However, this is a specialized flavor of RCU, and you should not use it without first checking with its current users. In most cases, you should instead use SRCU. @@ -490,6 +492,12 @@ over a rather long period of time, but improvements are always welcome! since the last time that you passed that same object to call_rcu() (or friends). + CONFIG_RCU_STRICT_GRACE_PERIOD: + combine with KASAN to check for pointers leaked out + of RCU read-side critical sections. This Kconfig + option is tough on both performance and scalability, + and so is limited to four-CPU systems. + __rcu sparse checks: tag the pointer to the RCU-protected data structure with __rcu, and sparse will warn you if you access that diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst index 659d5913784d..2524dcdadde2 100644 --- a/Documentation/RCU/rcu_dereference.rst +++ b/Documentation/RCU/rcu_dereference.rst @@ -408,7 +408,10 @@ member of the rcu_dereference() to use in various situations: RCU flavors, an RCU read-side critical section is entered using rcu_read_lock(), anything that disables bottom halves, anything that disables interrupts, or anything that disables - preemption. + preemption. Please note that spinlock critical sections + are also implied RCU read-side critical sections, even when + they are preemptible, as they are in kernels built with + CONFIG_PREEMPT_RT=y. 2. If the access might be within an RCU read-side critical section on the one hand, or protected by (say) my_lock on the other, diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 60ce02475142..872ac665223f 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -172,14 +172,25 @@ rcu_read_lock() critical section. Reference counts may be used in conjunction with RCU to maintain longer-term references to data structures. + Note that anything that disables bottom halves, preemption, + or interrupts also enters an RCU read-side critical section. + Acquiring a spinlock also enters an RCU read-side critical + sections, even for spinlocks that do not disable preemption, + as is the case in kernels built with CONFIG_PREEMPT_RT=y. + Sleeplocks do *not* enter RCU read-side critical sections. + rcu_read_unlock() ^^^^^^^^^^^^^^^^^ void rcu_read_unlock(void); This temporal primitives is used by a reader to inform the reclaimer that the reader is exiting an RCU read-side critical - section. Note that RCU read-side critical sections may be nested - and/or overlapping. + section. Anything that enables bottom halves, preemption, + or interrupts also exits an RCU read-side critical section. + Releasing a spinlock also exits an RCU read-side critical section. + + Note that RCU read-side critical sections may be nested and/or + overlapping. synchronize_rcu() ^^^^^^^^^^^^^^^^^ @@ -952,8 +963,8 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be initialized after each and every call to kmem_cache_alloc(), which renders reference-free spinlock acquisition completely unsafe. Therefore, when using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter. -(Those willing to use a kmem_cache constructor may also use locking, -including cache-friendly sequence locking.) +(Those willing to initialize their locks in a kmem_cache constructor +may also use locking, including cache-friendly sequence locking.) With traditional reference counting -- such as that implemented by the kref library in Linux -- there is typically code that runs when the last diff --git a/Documentation/admin-guide/RAS/address-translation.rst b/Documentation/admin-guide/RAS/address-translation.rst new file mode 100644 index 000000000000..f0ca17b43cd3 --- /dev/null +++ b/Documentation/admin-guide/RAS/address-translation.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Address translation +=================== + +x86 AMD +------- + +Zen-based AMD systems include a Data Fabric that manages the layout of +physical memory. Devices attached to the Fabric, like memory controllers, +I/O, etc., may not have a complete view of the system physical memory map. +These devices may provide a "normalized", i.e. device physical, address +when reporting memory errors. Normalized addresses must be translated to +a system physical address for the kernel to action on the memory. + +AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for +this case. + +Glossary of acronyms used in address translation for Zen-based systems + +* CCM = Cache Coherent Moderator +* COD = Cluster-on-Die +* COH_ST = Coherent Station +* DF = Data Fabric diff --git a/Documentation/RAS/ras.rst b/Documentation/admin-guide/RAS/error-decoding.rst similarity index 73% rename from Documentation/RAS/ras.rst rename to Documentation/admin-guide/RAS/error-decoding.rst index 2556b397cd27..26a72f3fe5de 100644 --- a/Documentation/RAS/ras.rst +++ b/Documentation/admin-guide/RAS/error-decoding.rst @@ -1,15 +1,10 @@ .. SPDX-License-Identifier: GPL-2.0 -Reliability, Availability and Serviceability features -===================================================== - -This documents different aspects of the RAS functionality present in the -kernel. - Error decoding ---------------- +============== -* x86 +x86 +--- Error decoding on AMD systems should be done using the rasdaemon tool: https://github.com/mchehab/rasdaemon/ diff --git a/Documentation/admin-guide/RAS/index.rst b/Documentation/admin-guide/RAS/index.rst new file mode 100644 index 000000000000..f4087040a7c0 --- /dev/null +++ b/Documentation/admin-guide/RAS/index.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. toctree:: + :maxdepth: 2 + + main + error-decoding + address-translation diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/RAS/main.rst similarity index 99% rename from Documentation/admin-guide/ras.rst rename to Documentation/admin-guide/RAS/main.rst index 8e03751d126d..7ac1d4ccc509 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/RAS/main.rst @@ -1,8 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 .. include:: -============================================ -Reliability, Availability and Serviceability -============================================ +================================================== +Reliability, Availability and Serviceability (RAS) +================================================== + +This documents different aspects of the RAS functionality present in the +kernel. RAS concepts ************ diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst index ae646d621a8a..7d3415eea05d 100644 --- a/Documentation/admin-guide/cgroup-v1/cpusets.rst +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -179,7 +179,7 @@ files describing that cpuset: - cpuset.mem_hardwall flag: is memory allocation hardwalled - cpuset.memory_pressure: measure of how much paging pressure in cpuset - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes - - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes + - cpuset.memory_spread_slab flag: OBSOLETE. Doesn't have any function. - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset - cpuset.sched_relax_domain_level: the searching range when migrating tasks diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index 0fa724d82abb..493a8e386700 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -65,10 +65,12 @@ files include:: 1. Page fault accounting -hugetlb..limit_in_bytes -hugetlb..max_usage_in_bytes -hugetlb..usage_in_bytes -hugetlb..failcnt +:: + + hugetlb..limit_in_bytes + hugetlb..max_usage_in_bytes + hugetlb..usage_in_bytes + hugetlb..failcnt The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per control group and enforces the limit during page fault. Since HugeTLB @@ -82,10 +84,12 @@ getting SIGBUS. 2. Reservation accounting -hugetlb..rsvd.limit_in_bytes -hugetlb..rsvd.max_usage_in_bytes -hugetlb..rsvd.usage_in_bytes -hugetlb..rsvd.failcnt +:: + + hugetlb..rsvd.limit_in_bytes + hugetlb..rsvd.max_usage_in_bytes + hugetlb..rsvd.usage_in_bytes + hugetlb..rsvd.failcnt The HugeTLB controller allows to limit the HugeTLB reservations per control group and enforces the controller limit at reservation time and at the fault of diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 32a8893e5617..cce768afec6b 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -473,8 +473,8 @@ Spectre variant 2 -mindirect-branch=thunk-extern -mindirect-branch-register options. If the kernel is compiled with a Clang compiler, the compiler needs to support -mretpoline-external-thunk option. The kernel config - CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with - the latest updated microcode. + CONFIG_MITIGATION_RETPOLINE needs to be turned on, and the CPU needs + to run with the latest updated microcode. On Intel Skylake-era systems the mitigation covers most, but not all, cases. See :ref:`[3] ` for more details. @@ -609,8 +609,8 @@ kernel command line. Selecting 'on' will, and 'auto' may, choose a mitigation method at run time according to the CPU, the available microcode, the setting of the - CONFIG_RETPOLINE configuration option, and the - compiler with which the kernel was built. + CONFIG_MITIGATION_RETPOLINE configuration option, + and the compiler with which the kernel was built. Selecting 'on' will also enable the mitigation against user space to user space task attacks. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index fb40a1f6f79e..dfc06fab9432 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -122,7 +122,7 @@ configure specific aspects of kernel behavior to your liking. pmf pnp rapidio - ras + RAS/index rtc serial-console svga diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 5762e7477a0c..0302a93b1d40 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -191,9 +191,7 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) CPU is enough for kdump kernel to dump vmcore on most of systems. However, you can also specify nr_cpus=X to enable multiple processors - in kdump kernel. In this case, "disable_cpu_apicid=" is needed to - tell kdump kernel which cpu is 1st kernel's BSP. Please refer to - admin-guide/kernel-parameters.txt for more details. + in kdump kernel. With CONFIG_SMP=n, the above things are not related. @@ -454,8 +452,7 @@ Notes on loading the dump-capture kernel: to use multi-thread programs with it, such as parallel dump feature of makedumpfile. Otherwise, the multi-thread program may have a great performance degradation. To enable multi-cpu support, you should bring up an - SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] - options while loading it. + SMP dump-capture kernel and specify maxcpus/nr_cpus options while loading it. * For s390x there are two kdump modes: If a ELF header is specified with the elfcorehdr= kernel parameter, it is used by the kdump kernel as it diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 4410384596a9..e8bdf5e86a9b 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -108,6 +108,7 @@ is applicable:: CMA Contiguous Memory Area support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime + EARLY Parameter processed too early to be embedded in initrd. EDD BIOS Enhanced Disk Drive Services (EDD) is enabled EFI EFI Partitioning (GPT) is enabled EVM Extended Verification Module diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2cb70a384af8..77c3d1a7f116 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -9,7 +9,7 @@ accept_memory=eager can be used to accept all memory at once during boot. - acpi= [HW,ACPI,X86,ARM64,RISCV64] + acpi= [HW,ACPI,X86,ARM64,RISCV64,EARLY] Advanced Configuration and Power Interface Format: { force | on | off | strict | noirq | rsdt | copy_dsdt } @@ -26,7 +26,7 @@ See also Documentation/power/runtime_pm.rst, pci=noacpi - acpi_apic_instance= [ACPI, IOAPIC] + acpi_apic_instance= [ACPI,IOAPIC,EARLY] Format: 2: use 2nd APIC table, if available 1,0: use 1st APIC table @@ -41,7 +41,7 @@ If set to native, use the device's native backlight mode. If set to none, disable the ACPI backlight interface. - acpi_force_32bit_fadt_addr + acpi_force_32bit_fadt_addr [ACPI,EARLY] force FADT to use 32 bit addresses rather than the 64 bit X_* addresses. Some firmware have broken 64 bit addresses for force ACPI ignore these and use @@ -97,7 +97,7 @@ no: ACPI OperationRegions are not marked as reserved, no further checks are performed. - acpi_force_table_verification [HW,ACPI] + acpi_force_table_verification [HW,ACPI,EARLY] Enable table checksum verification during early stage. By default, this is disabled due to x86 early mapping size limitation. @@ -137,7 +137,7 @@ acpi_no_memhotplug [ACPI] Disable memory hotplug. Useful for kdump kernels. - acpi_no_static_ssdt [HW,ACPI] + acpi_no_static_ssdt [HW,ACPI,EARLY] Disable installation of static SSDTs at early boot time By default, SSDTs contained in the RSDT/XSDT will be installed automatically and they will appear under @@ -151,7 +151,7 @@ Ignore the ACPI-based watchdog interface (WDAT) and let a native driver control the watchdog device instead. - acpi_rsdp= [ACPI,EFI,KEXEC] + acpi_rsdp= [ACPI,EFI,KEXEC,EARLY] Pass the RSDP address to the kernel, mostly used on machines running EFI runtime service to boot the second kernel for kdump. @@ -228,10 +228,10 @@ to assume that this machine's pmtimer latches its value and always returns good values. - acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode + acpi_sci= [HW,ACPI,EARLY] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } - acpi_skip_timer_override [HW,ACPI] + acpi_skip_timer_override [HW,ACPI,EARLY] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. @@ -266,11 +266,11 @@ behave incorrectly in some ways with respect to system suspend and resume to be ignored (use wisely). - acpi_use_timer_override [HW,ACPI] + acpi_use_timer_override [HW,ACPI,EARLY] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET - add_efi_memmap [EFI; X86] Include EFI memory map in + add_efi_memmap [EFI,X86,EARLY] Include EFI memory map in kernel's map of available physical RAM. agp= [AGP] @@ -307,7 +307,7 @@ do not want to use tracing_snapshot_alloc() as it needs to be done where GFP_KERNEL allocations are allowed. - allow_mismatched_32bit_el0 [ARM64] + allow_mismatched_32bit_el0 [ARM64,EARLY] Allow execve() of 32-bit applications and setting of the PER_LINUX32 personality on systems where only a strict subset of the CPUs support 32-bit EL0. When this @@ -351,7 +351,7 @@ This mode requires kvm-amd.avic=1. (Default when IOMMU HW support is present.) - amd_pstate= [X86] + amd_pstate= [X86,EARLY] disable Do not enable amd_pstate as the default scaling driver for the supported processors @@ -391,7 +391,7 @@ not play well with APC CPU idle - disable it if you have APC and your system crashes randomly. - apic= [APIC,X86] Advanced Programmable Interrupt Controller + apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller Change the output verbosity while booting Format: { quiet (default) | verbose | debug } Change the amount of debugging information output @@ -401,7 +401,7 @@ Format: apic=driver_name Examples: apic=bigsmp - apic_extnmi= [APIC,X86] External NMI delivery setting + apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } bsp: External NMI is delivered only to CPU 0 all: External NMIs are broadcast to all CPUs as a @@ -508,21 +508,22 @@ bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. - bgrt_disable [ACPI][X86] + bgrt_disable [ACPI,X86,EARLY] Disable BGRT to avoid flickering OEM logo. blkdevparts= Manual partition parsing of block device(s) for embedded devices based on command line input. See Documentation/block/cmdline-partition.rst - boot_delay= Milliseconds to delay each printk during boot. + boot_delay= [KNL,EARLY] + Milliseconds to delay each printk during boot. Only works if CONFIG_BOOT_PRINTK_DELAY is enabled, and you may also have to specify "lpj=". Boot_delay values larger than 10 seconds (10000) are assumed erroneous and ignored. Format: integer - bootconfig [KNL] + bootconfig [KNL,EARLY] Extended command line options can be added to an initrd and this will cause the kernel to look for it. @@ -557,7 +558,7 @@ trust validation. format: { id: | builtin } - cca= [MIPS] Override the kernel pages' cache coherency + cca= [MIPS,EARLY] Override the kernel pages' cache coherency algorithm. Accepted values range from 0 to 7 inclusive. See arch/mips/include/asm/pgtable-bits.h for platform specific values (SB1, Loongson3 and @@ -672,19 +673,13 @@ [X86-64] hpet,tsc clocksource.arm_arch_timer.evtstrm= - [ARM,ARM64] + [ARM,ARM64,EARLY] Format: Enable/disable the eventstream feature of the ARM architected timer so that code using WFE-based polling loops can be debugged more effectively on production systems. - clocksource.max_cswd_read_retries= [KNL] - Number of clocksource_watchdog() retries due to - external delays before the clock will be marked - unstable. Defaults to two retries, that is, - three attempts to read the clock under test. - clocksource.verify_n_cpus= [KNL] Limit the number of CPUs checked for clocksources marked with CLOCK_SOURCE_VERIFY_PERCPU that @@ -702,7 +697,7 @@ 10 seconds when built into the kernel. cma=nn[MG]@[start[MG][-end[MG]]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel global memory area for contiguous memory allocations and optionally the placement constraint by the physical address range of @@ -711,7 +706,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not @@ -722,7 +717,7 @@ they will fallback to the global default memory area. numa_cma=:nn[MG][,:nn[MG]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel numa memory area for contiguous memory allocations. It will reserve CMA area for the specified node. @@ -739,7 +734,7 @@ a hypervisor. Default: yes - coherent_pool=nn[KMG] [ARM,KNL] + coherent_pool=nn[KMG] [ARM,KNL,EARLY] Sets the size of memory pool for coherent, atomic dma allocations, by default set to 256K. @@ -757,7 +752,7 @@ condev= [HW,S390] console device conmode= - con3215_drop= [S390] 3215 console drop mode. + con3215_drop= [S390,EARLY] 3215 console drop mode. Format: y|n|Y|N|1|0 When set to true, drop data on the 3215 console when the console buffer is full. In this case the @@ -863,7 +858,7 @@ kernel before the cpufreq driver probes. cpu_init_udelay=N - [X86] Delay for N microsec between assert and de-assert + [X86,EARLY] Delay for N microsec between assert and de-assert of APIC INIT to start processors. This delay occurs on every CPU online, such as boot, and resume from suspend. Default: 10000 @@ -883,7 +878,7 @@ kernel more unstable. crashkernel=size[KMG][@offset[KMG]] - [KNL] Using kexec, Linux can switch to a 'crash kernel' + [KNL,EARLY] Using kexec, Linux can switch to a 'crash kernel' upon panic. This parameter reserves the physical memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset @@ -954,10 +949,10 @@ Format: , See also Documentation/input/devices/joystick-parport.rst - debug [KNL] Enable kernel debugging (events log level). + debug [KNL,EARLY] Enable kernel debugging (events log level). debug_boot_weak_hash - [KNL] Enable printing [hashed] pointers early in the + [KNL,EARLY] Enable printing [hashed] pointers early in the boot sequence. If enabled, we use a weak hash instead of siphash to hash pointers. Use this option if you are seeing instances of '(___ptrval___)') and need to see a @@ -974,10 +969,10 @@ will print _a_lot_ more information - normally only useful to lockdep developers. - debug_objects [KNL] Enable object debugging + debug_objects [KNL,EARLY] Enable object debugging debug_guardpage_minorder= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter allows control of the order of pages that will be intentionally kept free (and hence protected) by the buddy allocator. Bigger value increase the probability @@ -996,7 +991,7 @@ help tracking down these problems. debug_pagealloc= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter enables the feature at boot time. By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. @@ -1004,8 +999,8 @@ useful to also enable the page_owner functionality. on: enable the feature - debugfs= [KNL] This parameter enables what is exposed to userspace - and debugfs internal clients. + debugfs= [KNL,EARLY] This parameter enables what is exposed to + userspace and debugfs internal clients. Format: { on, no-mount, off } on: All functions are enabled. no-mount: @@ -1084,7 +1079,7 @@ dhash_entries= [KNL] Set number of hash buckets for dentry cache. - disable_1tb_segments [PPC] + disable_1tb_segments [PPC,EARLY] Disables the use of 1TB hash page table segments. This causes the kernel to fall back to 256MB segments which can be useful when debugging issues that require an SLB @@ -1093,41 +1088,32 @@ disable= [IPV6] See Documentation/networking/ipv6.rst. - disable_radix [PPC] + disable_radix [PPC,EARLY] Disable RADIX MMU mode on POWER9 disable_tlbie [PPC] Disable TLBIE instruction. Currently does not work with KVM, with HASH MMU, or with coherent accelerators. - disable_cpu_apicid= [X86,APIC,SMP] - Format: - The number of initial APIC ID for the - corresponding CPU to be disabled at boot, - mostly used for the kdump 2nd kernel to - disable BSP to wake up multiple CPUs without - causing system reset or hang due to sending - INIT from AP to BSP. - - disable_ddw [PPC/PSERIES] + disable_ddw [PPC/PSERIES,EARLY] Disable Dynamic DMA Window support. Use this to workaround buggy firmware. disable_ipv6= [IPV6] See Documentation/networking/ipv6.rst. - disable_mtrr_cleanup [X86] + disable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter disables that. - disable_mtrr_trim [X86, Intel and AMD only] + disable_mtrr_trim [X86, Intel and AMD only,EARLY] By default the kernel will trim any uncacheable memory out of your available memory pool based on MTRR settings. This parameter disables that behavior, possibly causing your machine to run very slowly. - disable_timer_pin_1 [X86] + disable_timer_pin_1 [X86,EARLY] Disable PIN 1 of APIC timer Can be useful to work around chipset bugs. @@ -1177,7 +1163,7 @@ dscc4.setup= [NET] - dt_cpu_ftrs= [PPC] + dt_cpu_ftrs= [PPC,EARLY] Format: {"off" | "known"} Control how the dt_cpu_ftrs device-tree binding is used for CPU feature discovery and setup (if it @@ -1197,12 +1183,12 @@ Documentation/admin-guide/dynamic-debug-howto.rst for details. - early_ioremap_debug [KNL] + early_ioremap_debug [KNL,EARLY] Enable debug messages in early_ioremap support. This is useful for tracking down temporary early mappings which are not unmapped. - earlycon= [KNL] Output early console device and options. + earlycon= [KNL,EARLY] Output early console device and options. When used with no options, the early console is determined by stdout-path property in device tree's @@ -1338,7 +1324,7 @@ address must be provided, and the serial port must already be setup and configured. - earlyprintk= [X86,SH,ARM,M68k,S390] + earlyprintk= [X86,SH,ARM,M68k,S390,UM,EARLY] earlyprintk=vga earlyprintk=sclp earlyprintk=xen @@ -1396,7 +1382,7 @@ edd= [EDD] Format: {"off" | "on" | "skip[mbr]"} - efi= [EFI] + efi= [EFI,EARLY] Format: { "debug", "disable_early_pci_dma", "nochunk", "noruntime", "nosoftreserve", "novamap", "no_disable_early_pci_dma" } @@ -1417,13 +1403,13 @@ no_disable_early_pci_dma: Leave the busmaster bit set on all PCI bridges while in the EFI boot stub - efi_no_storage_paranoia [EFI; X86] + efi_no_storage_paranoia [EFI,X86,EARLY] Using this parameter you can use more than 50% of your efi variable storage. Use this parameter only if you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick. - efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI,X86,EARLY] Add arbitrary attribute to specific memory range by updating original EFI memory map. Region of memory which aa attribute is added to is @@ -1454,7 +1440,7 @@ eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. - ekgdboc= [X86,KGDB] Allow early kernel console debugging + ekgdboc= [X86,KGDB,EARLY] Allow early kernel console debugging Format: ekgdboc=kbd This is designed to be used in conjunction with @@ -1469,13 +1455,13 @@ See comment before function elanfreq_setup() in arch/x86/kernel/cpu/cpufreq/elanfreq.c. - elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390] + elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390,EARLY] Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. See Documentation/admin-guide/kdump/kdump.rst for details. - enable_mtrr_cleanup [X86] + enable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter enables that. @@ -1508,7 +1494,7 @@ Permit 'security.evm' to be updated regardless of current integrity status. - early_page_ext [KNL] Enforces page_ext initialization to earlier + early_page_ext [KNL,EARLY] Enforces page_ext initialization to earlier stages so cover more early boot allocations. Please note that as side effect some optimizations might be disabled to achieve that (e.g. parallelized @@ -1539,6 +1525,12 @@ Warning: use of this parameter will taint the kernel and may cause unknown problems. + fred= [X86-64] + Enable/disable Flexible Return and Event Delivery. + Format: { on | off } + on: enable FRED when it's present. + off: disable FRED, the default setting. + ftrace=[tracer] [FTRACE] will set and start the specified tracer as early as possible in order to facilitate early @@ -1600,7 +1592,7 @@ can be changed at run time by the max_graph_depth file in the tracefs tracing directory. default: 0 (no limit) - fw_devlink= [KNL] Create device links between consumer and supplier + fw_devlink= [KNL,EARLY] Create device links between consumer and supplier devices by scanning the firmware to infer the consumer/supplier relationships. This feature is especially useful when drivers are loaded as modules as @@ -1619,12 +1611,12 @@ rpm -- Like "on", but also use to order runtime PM. fw_devlink.strict= - [KNL] Treat all inferred dependencies as mandatory + [KNL,EARLY] Treat all inferred dependencies as mandatory dependencies. This only applies for fw_devlink=on|rpm. Format: fw_devlink.sync_state = - [KNL] When all devices that could probe have finished + [KNL,EARLY] When all devices that could probe have finished probing, this parameter controls what to do with devices that haven't yet received their sync_state() calls. @@ -1645,12 +1637,12 @@ gamma= [HW,DRM] - gart_fix_e820= [X86-64] disable the fix e820 for K8 GART + gart_fix_e820= [X86-64,EARLY] disable the fix e820 for K8 GART Format: off | on default: on gather_data_sampling= - [X86,INTEL] Control the Gather Data Sampling (GDS) + [X86,INTEL,EARLY] Control the Gather Data Sampling (GDS) mitigation. Gather Data Sampling is a hardware vulnerability which @@ -1748,7 +1740,7 @@ (that will set all pages holding image data during restoration read-only). - highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of . This works even on boxes that have no highmem otherwise. This also works to reduce highmem size on bigger boxes. @@ -1759,7 +1751,7 @@ hlt [BUGS=ARM,SH] - hostname= [KNL] Set the hostname (aka UTS nodename). + hostname= [KNL,EARLY] Set the hostname (aka UTS nodename). Format: This allows setting the system's hostname during early startup. This sets the name returned by gethostname. @@ -1804,7 +1796,7 @@ Documentation/admin-guide/mm/hugetlbpage.rst. Format: size[KMG] - hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation + hugetlb_cma= [HW,CMA,EARLY] The size of a CMA area used for allocation of gigantic hugepages. Or using node format, the size of a CMA area per node can be specified. Format: nn[KMGTPE] or (node format) @@ -1850,9 +1842,10 @@ If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. - hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations - which allow the hypervisor to 'idle' the - guest on lock contention. + hv_nopvspin [X86,HYPER_V,EARLY] + Disables the paravirt spinlock optimizations + which allow the hypervisor to 'idle' the guest + on lock contention. i2c_bus= [HW] Override the default board specific I2C bus speed or register an additional I2C bus that is not @@ -1917,7 +1910,7 @@ Format: [,[,[,]]] - idle= [X86] + idle= [X86,EARLY] Format: idle=poll, idle=halt, idle=nomwait Poll forces a polling idle loop that can slightly improve the performance of waking up a idle CPU, but @@ -1973,7 +1966,7 @@ mode generally follows that for the NaN encoding, except where unsupported by hardware. - ignore_loglevel [KNL] + ignore_loglevel [KNL,EARLY] Ignore loglevel setting - this will print /all/ kernel messages to the console. Useful for debugging. We also add it as printk module parameter, so users @@ -2091,21 +2084,21 @@ unpacking being completed before device_ and late_ initcalls. - initrd= [BOOT] Specify the location of the initial ramdisk + initrd= [BOOT,EARLY] Specify the location of the initial ramdisk - initrdmem= [KNL] Specify a physical address and size from which to + initrdmem= [KNL,EARLY] Specify a physical address and size from which to load the initrd. If an initrd is compiled in or specified in the bootparams, it takes priority over this setting. Format: ss[KMG],nn[KMG] Default is 0, 0 - init_on_alloc= [MM] Fill newly allocated pages and heap objects with + init_on_alloc= [MM,EARLY] Fill newly allocated pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON. - init_on_free= [MM] Fill freed pages and heap objects with zeroes. + init_on_free= [MM,EARLY] Fill freed pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. @@ -2161,7 +2154,7 @@ 0 disables intel_idle and fall back on acpi_idle. 1 to 9 specify maximum depth of C-state. - intel_pstate= [X86] + intel_pstate= [X86,EARLY] disable Do not enable intel_pstate as the default scaling driver for the supported processors @@ -2205,7 +2198,7 @@ Allow per-logical-CPU P-State performance control limits using cpufreq sysfs interface - intremap= [X86-64, Intel-IOMMU] + intremap= [X86-64,Intel-IOMMU,EARLY] on enable Interrupt Remapping (default) off disable Interrupt Remapping nosid disable Source ID checking @@ -2217,7 +2210,7 @@ strict regions from userspace. relaxed - iommu= [X86] + iommu= [X86,EARLY] off force noforce @@ -2232,7 +2225,7 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. - iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices. + iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices. Format: { "0" | "1" } 0 - Try to allocate a 32-bit DMA address first, before falling back to the full range if needed. @@ -2240,7 +2233,7 @@ forcing Dual Address Cycle for PCI cards supporting greater than 32-bit addressing. - iommu.strict= [ARM64, X86, S390] Configure TLB invalidation behaviour + iommu.strict= [ARM64,X86,S390,EARLY] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. Request that DMA unmap operations use deferred @@ -2256,7 +2249,7 @@ legacy driver-specific options takes precedence. iommu.passthrough= - [ARM64, X86] Configure DMA to bypass the IOMMU by default. + [ARM64,X86,EARLY] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } 0 - Use IOMMU translation for DMA. 1 - Bypass the IOMMU for DMA. @@ -2266,7 +2259,7 @@ See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. - io_delay= [X86] I/O delay method + io_delay= [X86,EARLY] I/O delay method 0x80 Standard port 0x80 based delay 0xed @@ -2279,28 +2272,28 @@ ip= [IP_PNP] See Documentation/admin-guide/nfs/nfsroot.rst. - ipcmni_extend [KNL] Extend the maximum number of unique System V + ipcmni_extend [KNL,EARLY] Extend the maximum number of unique System V IPC identifiers from 32,768 to 16,777,216. irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. irqchip.gicv2_force_probe= - [ARM, ARM64] + [ARM,ARM64,EARLY] Format: Force the kernel to look for the second 4kB page of a GICv2 controller even if the memory range exposed by the device tree is too small. irqchip.gicv3_nolpi= - [ARM, ARM64] + [ARM,ARM64,EARLY] Force the kernel to ignore the availability of LPIs (and by consequence ITSs). Intended for system that use the kernel as a bootloader, and thus want to let secondary kernels in charge of setting up LPIs. - irqchip.gicv3_pseudo_nmi= [ARM64] + irqchip.gicv3_pseudo_nmi= [ARM64,EARLY] Enables support for pseudo-NMIs in the kernel. This requires the kernel to be built with CONFIG_ARM64_PSEUDO_NMI. @@ -2445,7 +2438,7 @@ parameter KASAN will print report only for the first invalid access. - keep_bootcon [KNL] + keep_bootcon [KNL,EARLY] Do not unregister boot console at start. This is only useful for debugging when something happens in the window between unregistering the boot console and initializing @@ -2453,7 +2446,7 @@ keepinitrd [HW,ARM] See retain_initrd. - kernelcore= [KNL,X86,IA-64,PPC] + kernelcore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% | "mirror" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested @@ -2478,7 +2471,7 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms. - kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. + kgdbdbgp= [KGDB,HW,EARLY] kgdb over EHCI usb debug port. Format: [,poll interval] The controller # is the number of the ehci usb debug port as it is probed via PCI. The poll interval is @@ -2499,7 +2492,7 @@ kms, kbd format: kms,kbd kms, kbd and serial format: kms,kbd,[,baud] - kgdboc_earlycon= [KGDB,HW] + kgdboc_earlycon= [KGDB,HW,EARLY] If the boot console provides the ability to read characters and can work in polling mode, you can use this parameter to tell kgdb to use it as a backend @@ -2514,14 +2507,14 @@ blank and the first boot console that implements read() will be picked. - kgdbwait [KGDB] Stop kernel execution and enter the + kgdbwait [KGDB,EARLY] Stop kernel execution and enter the kernel debugger at the earliest opportunity. kmac= [MIPS] Korina ethernet MAC address. Configure the RouterBoard 532 series on-chip Ethernet adapter MAC address. - kmemleak= [KNL] Boot-time kmemleak enable/disable + kmemleak= [KNL,EARLY] Boot-time kmemleak enable/disable Valid arguments: on, off Default: on Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, @@ -2540,8 +2533,8 @@ See also Documentation/trace/kprobetrace.rst "Kernel Boot Parameter" section. - kpti= [ARM64] Control page table isolation of user - and kernel address spaces. + kpti= [ARM64,EARLY] Control page table isolation of + user and kernel address spaces. Default: enabled on cores which need mitigation. 0: force disabled 1: force enabled @@ -2618,7 +2611,8 @@ for NPT. kvm-arm.mode= - [KVM,ARM] Select one of KVM/arm64's modes of operation. + [KVM,ARM,EARLY] Select one of KVM/arm64's modes of + operation. none: Forcefully disable KVM. @@ -2638,22 +2632,22 @@ used with extreme caution. kvm-arm.vgic_v3_group0_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-0 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0 system registers kvm-arm.vgic_v3_group1_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-1 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-1 system registers kvm-arm.vgic_v3_common_trap= - [KVM,ARM] Trap guest accesses to GICv3 common + [KVM,ARM,EARLY] Trap guest accesses to GICv3 common system registers kvm-arm.vgic_v4_enable= - [KVM,ARM] Allow use of GICv4 for direct injection of - LPIs. + [KVM,ARM,EARLY] Allow use of GICv4 for direct + injection of LPIs. - kvm_cma_resv_ratio=n [PPC] + kvm_cma_resv_ratio=n [PPC,EARLY] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable allocation. @@ -2706,7 +2700,7 @@ (enabled). Disable by KVM if hardware lacks support for it. - l1d_flush= [X86,INTEL] + l1d_flush= [X86,INTEL,EARLY] Control mitigation for L1D based snooping vulnerability. Certain CPUs are vulnerable to an exploit against CPU @@ -2723,7 +2717,7 @@ on - enable the interface for the mitigation - l1tf= [X86] Control mitigation of the L1TF vulnerability on + l1tf= [X86,EARLY] Control mitigation of the L1TF vulnerability on affected CPUs The kernel PTE inversion protection is unconditionally @@ -2792,7 +2786,7 @@ l3cr= [PPC] - lapic [X86-32,APIC] Enable the local APIC even if BIOS + lapic [X86-32,APIC,EARLY] Enable the local APIC even if BIOS disabled it. lapic= [X86,APIC] Do not use TSC deadline @@ -2800,7 +2794,7 @@ back to the programmable timer unit in the LAPIC. Format: notscdeadline - lapic_timer_c2_ok [X86,APIC] trust the local apic timer + lapic_timer_c2_ok [X86,APIC,EARLY] trust the local apic timer in C2 power state. libata.dma= [LIBATA] DMA control @@ -2924,7 +2918,7 @@ lockd.nlm_udpport=M [NFS] Assign UDP port. Format: - lockdown= [SECURITY] + lockdown= [SECURITY,EARLY] { integrity | confidentiality } Enable the kernel lockdown feature. If set to integrity, kernel features that allow userland to @@ -3031,7 +3025,8 @@ logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver Format: - loglevel= All Kernel Messages with a loglevel smaller than the + loglevel= [KNL,EARLY] + All Kernel Messages with a loglevel smaller than the console loglevel will be printed to the console. It can also be changed with klogd or other programs. The loglevels are defined as follows: @@ -3045,13 +3040,15 @@ 6 (KERN_INFO) informational 7 (KERN_DEBUG) debug-level messages - log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two and greater - than the minimal size. The minimal size is defined - by LOG_BUF_SHIFT kernel config parameter. There is - also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter - that allows to increase the default size depending on - the number of CPUs. See init/Kconfig for more details. + log_buf_len=n[KMG] [KNL,EARLY] + Sets the size of the printk ring buffer, in bytes. + n must be a power of two and greater than the + minimal size. The minimal size is defined by + LOG_BUF_SHIFT kernel config parameter. There + is also CONFIG_LOG_CPU_MAX_BUF_SHIFT config + parameter that allows to increase the default size + depending on the number of CPUs. See init/Kconfig + for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@ -3109,7 +3106,7 @@ max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater than or equal to this physical address is ignored. - maxcpus= [SMP] Maximum number of processors that an SMP kernel + maxcpus= [SMP,EARLY] Maximum number of processors that an SMP kernel will bring up during bootup. maxcpus=n : n >= 0 limits the kernel to bring up 'n' processors. Surely after bootup you can bring up the other plugged cpu by executing @@ -3136,7 +3133,7 @@ Format: , Specifies range of consoles to be captured by the MDA. - mds= [X86,INTEL] + mds= [X86,INTEL,EARLY] Control mitigation for the Micro-architectural Data Sampling (MDS) vulnerability. @@ -3168,11 +3165,12 @@ For details see: Documentation/admin-guide/hw-vuln/mds.rst - mem=nn[KMG] [HEXAGON] Set the memory size. + mem=nn[KMG] [HEXAGON,EARLY] Set the memory size. Must be specified, otherwise memory size will be 0. - mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory - Amount of memory to be used in cases as follows: + mem=nn[KMG] [KNL,BOOT,EARLY] Force usage of a specific amount + of memory Amount of memory to be used in cases + as follows: 1 for test; 2 when the kernel is not able to see the whole system memory; @@ -3196,8 +3194,8 @@ if system memory of hypervisor is not sufficient. mem=nn[KMG]@ss[KMG] - [ARM,MIPS] - override the memory layout reported by - firmware. + [ARM,MIPS,EARLY] - override the memory layout + reported by firmware. Define a memory region of size nn[KMG] starting at ss[KMG]. Multiple different regions can be specified with @@ -3206,7 +3204,7 @@ mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel memory. - memblock=debug [KNL] Enable memblock debug messages. + memblock=debug [KNL,EARLY] Enable memblock debug messages. memchunk=nn[KMG] [KNL,SH] Allow user to override the default size for @@ -3220,14 +3218,14 @@ option. See Documentation/admin-guide/mm/memory-hotplug.rst. - memmap=exactmap [KNL,X86] Enable setting of an exact + memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact E820 memory map, as specified by the user. Such memmap=exactmap lines can be constructed based on BIOS output or other requirements. See the memmap=nn@ss option description. memmap=nn[KMG]@ss[KMG] - [KNL, X86, MIPS, XTENSA] Force usage of a specific region of memory. + [KNL, X86,MIPS,XTENSA,EARLY] Force usage of a specific region of memory. Region of memory to be used is from ss to ss+nn. If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG], which limits max address to nn[KMG]. @@ -3237,11 +3235,11 @@ memmap=100M@2G,100M#3G,1G!1024G memmap=nn[KMG]#ss[KMG] - [KNL,ACPI] Mark specific memory as ACPI data. + [KNL,ACPI,EARLY] Mark specific memory as ACPI data. Region of memory to be marked is from ss to ss+nn. memmap=nn[KMG]$ss[KMG] - [KNL,ACPI] Mark specific memory as reserved. + [KNL,ACPI,EARLY] Mark specific memory as reserved. Region of memory to be reserved is from ss to ss+nn. Example: Exclude memory from 0x18690000-0x1869ffff memmap=64K$0x18690000 @@ -3251,14 +3249,14 @@ like Grub2, otherwise '$' and the following number will be eaten. - memmap=nn[KMG]!ss[KMG] + memmap=nn[KMG]!ss[KMG,EARLY] [KNL,X86] Mark specific memory as protected. Region of memory to be used, from ss to ss+nn. The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. memmap=%-+ - [KNL,ACPI] Convert memory within the specified region + [KNL,ACPI,EARLY] Convert memory within the specified region from to . If "-" is left out, the whole region will be marked as , even if previously unavailable. If "+" is left @@ -3266,7 +3264,7 @@ specified as e820 types, e.g., 1 = RAM, 2 = reserved, 3 = ACPI, 12 = PRAM. - memory_corruption_check=0/1 [X86] + memory_corruption_check=0/1 [X86,EARLY] Some BIOSes seem to corrupt the first 64k of memory when doing things like suspend/resume. Setting this option will scan the memory @@ -3278,13 +3276,13 @@ affects the same memory, you can use memmap= to prevent the kernel from using that memory. - memory_corruption_check_size=size [X86] + memory_corruption_check_size=size [X86,EARLY] By default it checks for corruption in the low 64k, making this memory unavailable for normal use. Use this parameter to scan for corruption in more or less memory. - memory_corruption_check_period=seconds [X86] + memory_corruption_check_period=seconds [X86,EARLY] By default it checks for corruption every 60 seconds. Use this parameter to check at some other rate. 0 disables periodic checking. @@ -3308,7 +3306,7 @@ Note that even when enabled, there are a few cases where the feature is not effective. - memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest + memtest= [KNL,X86,ARM,M68K,PPC,RISCV,EARLY] Enable memtest Format: default : 0 Specifies the number of memtest passes to be @@ -3374,7 +3372,7 @@ https://repo.or.cz/w/linux-2.6/mini2440.git mitigations= - [X86,PPC,S390,ARM64] Control optional mitigations for + [X86,PPC,S390,ARM64,EARLY] Control optional mitigations for CPU vulnerabilities. This is a set of curated, arch-independent options, each of which is an aggregation of existing arch-specific options. @@ -3397,6 +3395,7 @@ nospectre_v1 [X86,PPC] nospectre_v2 [X86,PPC,S390,ARM64] retbleed=off [X86] + spec_rstack_overflow=off [X86] spec_store_bypass_disable=off [X86,PPC] spectre_v2_user=off [X86] srbds=off [X86,INTEL] @@ -3427,7 +3426,7 @@ retbleed=auto,nosmt [X86] mminit_loglevel= - [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this + [KNL,EARLY] When CONFIG_DEBUG_MEMORY_INIT is set, this parameter allows control of the logging verbosity for the additional memory initialisation checks. A value of 0 disables mminit logging and a level of 4 will @@ -3435,7 +3434,7 @@ so loglevel=8 may also need to be specified. mmio_stale_data= - [X86,INTEL] Control mitigation for the Processor + [X86,INTEL,EARLY] Control mitigation for the Processor MMIO Stale Data vulnerabilities. Processor MMIO Stale Data is a class of @@ -3510,7 +3509,7 @@ mousedev.yres= [MOUSE] Vertical screen resolution, used for devices reporting absolute coordinates, such as tablets - movablecore= [KNL,X86,IA-64,PPC] + movablecore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% This parameter is the complement to kernelcore=, it specifies the amount of memory used for migratable @@ -3521,7 +3520,7 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to make hotplugable memory + movable_node [KNL,EARLY] Boot-time switch to make hotplugable memory NUMA nodes to be movable. This means that the memory of such nodes will be usable only for movable allocations which rules out almost all kernel @@ -3545,21 +3544,21 @@ [HW] Make the MicroTouch USB driver use raw coordinates ('y', default) or cooked coordinates ('n') - mtrr=debug [X86] + mtrr=debug [X86,EARLY] Enable printing debug information related to MTRR registers at boot time. - mtrr_chunk_size=nn[KMG] [X86] + mtrr_chunk_size=nn[KMG,X86,EARLY] used for mtrr cleanup. It is largest continuous chunk that could hold holes aka. UC entries. - mtrr_gran_size=nn[KMG] [X86] + mtrr_gran_size=nn[KMG,X86,EARLY] Used for mtrr cleanup. It is granularity of mtrr block. Default is 1. Large value could prevent small alignment from using up MTRRs. - mtrr_spare_reg_nr=n [X86] + mtrr_spare_reg_nr=n [X86,EARLY] Format: Range: 0,7 : spare reg number Default : 1 @@ -3745,10 +3744,10 @@ emulation library even if a 387 maths coprocessor is present. - no4lvl [RISCV] Disable 4-level and 5-level paging modes. Forces - kernel to use 3-level paging instead. + no4lvl [RISCV,EARLY] Disable 4-level and 5-level paging modes. + Forces kernel to use 3-level paging instead. - no5lvl [X86-64,RISCV] Disable 5-level paging mode. Forces + no5lvl [X86-64,RISCV,EARLY] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. noaliencache [MM, NUMA, SLAB] Disables the allocation of alien @@ -3757,15 +3756,15 @@ noalign [KNL,ARM] - noaltinstr [S390] Disables alternative instructions patching - (CPU alternatives feature). + noaltinstr [S390,EARLY] Disables alternative instructions + patching (CPU alternatives feature). - noapic [SMP,APIC] Tells the kernel to not make use of any + noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any IOAPICs that may be present in the system. noautogroup Disable scheduler automatic task group creation. - nocache [ARM] + nocache [ARM,EARLY] no_console_suspend [HW] Never suspend the console @@ -3783,13 +3782,13 @@ turn on/off it dynamically. no_debug_objects - [KNL] Disable object debugging + [KNL,EARLY] Disable object debugging nodsp [SH] Disable hardware DSP at boot time. - noefi Disable EFI runtime services support. + noefi [EFI,EARLY] Disable EFI runtime services support. - no_entry_flush [PPC] Don't flush the L1-D cache when entering the kernel. + no_entry_flush [PPC,EARLY] Don't flush the L1-D cache when entering the kernel. noexec [IA-64] @@ -3820,6 +3819,7 @@ real-time systems. no_hash_pointers + [KNL,EARLY] Force pointers printed to the console or buffers to be unhashed. By default, when a pointer is printed via %p format string, that pointer is "hashed", i.e. obscured @@ -3844,9 +3844,9 @@ the impact of the sleep instructions. This is also useful when using JTAG debugger. - nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. + nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings. - nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings. + nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings. nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off @@ -3868,13 +3868,13 @@ noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. - nointremap [X86-64, Intel-IOMMU] Do not enable interrupt + nointremap [X86-64,Intel-IOMMU,EARLY] Do not enable interrupt remapping. [Deprecated - use intremap=off] nointroute [IA-64] - noinvpcid [X86] Disable the INVPCID cpu feature. + noinvpcid [X86,EARLY] Disable the INVPCID cpu feature. noiotrap [SH] Disables trapped I/O port accesses. @@ -3885,19 +3885,19 @@ nojitter [IA-64] Disables jitter checking for ITC timers. - nokaslr [KNL] + nokaslr [KNL,EARLY] When CONFIG_RANDOMIZE_BASE is set, this disables kernel and module base offset ASLR (Address Space Layout Randomization). - no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page + no-kvmapf [X86,KVM,EARLY] Disable paravirtualized asynchronous page fault handling. - no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + no-kvmclock [X86,KVM,EARLY] Disable paravirtualized KVM clock driver - nolapic [X86-32,APIC] Do not enable or use the local APIC. + nolapic [X86-32,APIC,EARLY] Do not enable or use the local APIC. - nolapic_timer [X86-32,APIC] Do not use the local APIC timer. + nolapic_timer [X86-32,APIC,EARLY] Do not use the local APIC timer. nomca [IA-64] Disable machine check abort handling @@ -3922,23 +3922,23 @@ shutdown the other cpus. Instead use the REBOOT_VECTOR irq. - nopat [X86] Disable PAT (page attribute table extension of + nopat [X86,EARLY] Disable PAT (page attribute table extension of pagetables) support. - nopcid [X86-64] Disable the PCID cpu feature. + nopcid [X86-64,EARLY] Disable the PCID cpu feature. nopku [X86] Disable Memory Protection Keys CPU feature found in some Intel CPUs. - nopti [X86-64] + nopti [X86-64,EARLY] Equivalent to pti=off - nopv= [X86,XEN,KVM,HYPER_V,VMWARE] + nopv= [X86,XEN,KVM,HYPER_V,VMWARE,EARLY] Disables the PV optimizations forcing the guest to run as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. - nopvspin [X86,XEN,KVM] + nopvspin [X86,XEN,KVM,EARLY] Disables the qspinlock slow path using PV optimizations which allow the hypervisor to 'idle' the guest on lock contention. @@ -3958,20 +3958,20 @@ This is required for the Braillex ib80-piezo Braille reader made by F.H. Papenmeier (Germany). - nosgx [X86-64,SGX] Disables Intel SGX kernel support. + nosgx [X86-64,SGX,EARLY] Disables Intel SGX kernel support. - nosmap [PPC] + nosmap [PPC,EARLY] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [PPC64s] + nosmep [PPC64s,EARLY] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. - nosmp [SMP] Tells an SMP kernel to act as a UP kernel, + nosmp [SMP,EARLY] Tells an SMP kernel to act as a UP kernel, and disable the IO APIC. legacy for "maxcpus=0". - nosmt [KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT). + nosmt [KNL,MIPS,PPC,S390,EARLY] Disable symmetric multithreading (SMT). Equivalent to smt=1. [KNL,X86,PPC] Disable symmetric multithreading (SMT). @@ -3981,22 +3981,23 @@ nosoftlockup [KNL] Disable the soft-lockup detector. nospec_store_bypass_disable - [HW] Disable all mitigations for the Speculative Store Bypass vulnerability + [HW,EARLY] Disable all mitigations for the Speculative + Store Bypass vulnerability - nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch + nospectre_bhb [ARM64,EARLY] Disable all mitigations for Spectre-BHB (branch history injection) vulnerability. System may allow data leaks with this option. - nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1 + nospectre_v1 [X86,PPC,EARLY] Disable mitigations for Spectre Variant 1 (bounds check bypass). With this option data leaks are possible in the system. - nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for - the Spectre variant 2 (indirect branch prediction) - vulnerability. System may allow data leaks with this - option. + nospectre_v2 [X86,PPC_E500,ARM64,EARLY] Disable all mitigations + for the Spectre variant 2 (indirect branch + prediction) vulnerability. System may allow data + leaks with this option. - no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour @@ -4006,7 +4007,7 @@ broken timer IRQ sources. no_uaccess_flush - [PPC] Don't flush the L1-D cache after accessing user data. + [PPC,EARLY] Don't flush the L1-D cache after accessing user data. novmcoredd [KNL,KDUMP] Disable device dump. Device dump allows drivers to @@ -4020,15 +4021,15 @@ is set. no-vmw-sched-clock - [X86,PV_OPS] Disable paravirtualized VMware scheduler - clock and use the default one. + [X86,PV_OPS,EARLY] Disable paravirtualized VMware + scheduler clock and use the default one. nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). - nowb [ARM] + nowb [ARM,EARLY] - nox2apic [X86-64,APIC] Do not enable x2APIC mode. + nox2apic [X86-64,APIC,EARLY] Do not enable x2APIC mode. NOTE: this parameter will be ignored on systems with the LEGACY_XAPIC_DISABLED bit set in the @@ -4066,7 +4067,7 @@ purges which is reported from either PAL_VM_SUMMARY or SAL PALO. - nr_cpus= [SMP] Maximum number of processors that an SMP kernel + nr_cpus= [SMP,EARLY] Maximum number of processors that an SMP kernel could support. nr_cpus=n : n >= 1 limits the kernel to support 'n' processors. It could be larger than the number of already plugged CPU during bootup, later in @@ -4077,8 +4078,9 @@ nr_uarts= [SERIAL] maximum number of UARTs to be registered. - numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA, Only - set up a single NUMA node spanning all memory. + numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86, EARLY] + Disable NUMA, Only set up a single NUMA node + spanning all memory. numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic NUMA balancing. @@ -4089,7 +4091,7 @@ This can be set from sysctl after boot. See Documentation/admin-guide/sysctl/vm.rst for details. - ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. + ohci1394_dma=early [HW,EARLY] enable debugging via the ohci1394 driver. See Documentation/core-api/debugging-via-ohci1394.rst for more info. @@ -4115,7 +4117,8 @@ Once locked, the boundary cannot be changed. 1 indicates lock status, 0 indicates unlock status. - oops=panic Always panic on oopses. Default is to just kill the + oops=panic [KNL,EARLY] + Always panic on oopses. Default is to just kill the process, but there is a small probability of deadlocking the machine. This will also cause panics on machine check exceptions. @@ -4131,13 +4134,13 @@ can be read from sysfs at: /sys/module/page_alloc/parameters/shuffle. - page_owner= [KNL] Boot-time page_owner enabling option. + page_owner= [KNL,EARLY] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, we can turn it on. on: enable the feature - page_poison= [KNL] Boot-time parameter changing the state of + page_poison= [KNL,EARLY] Boot-time parameter changing the state of poisoning on the buddy allocator, available with CONFIG_PAGE_POISONING=y. off: turn off poisoning (default) @@ -4155,7 +4158,8 @@ timeout < 0: reboot immediately Format: - panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + panic_on_taint= [KNL,EARLY] + Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] Hexadecimal bitmask representing the set of TAINT flags that will cause the kernel to panic when add_taint() is @@ -4311,7 +4315,7 @@ pcbit= [HW,ISDN] - pci=option[,option...] [PCI] various PCI subsystem options. + pci=option[,option...] [PCI,EARLY] various PCI subsystem options. Some options herein operate on a specific device or a set of devices (). These are @@ -4580,7 +4584,8 @@ Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= Select which percpu first chunk allocator to use. + percpu_alloc= [MM,EARLY] + Select which percpu first chunk allocator to use. Currently supported values are "embed" and "page". Archs may support subset or none of the selections. See comments in mm/percpu.c for details on each @@ -4649,12 +4654,12 @@ execution priority. ppc_strict_facility_enable - [PPC] This option catches any kernel floating point, + [PPC,ENABLE] This option catches any kernel floating point, Altivec, VSX and SPE outside of regions specifically allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). There is some performance impact when enabling this. - ppc_tm= [PPC] + ppc_tm= [PPC,EARLY] Format: {"off"} Disable Hardware Transactional Memory @@ -4764,7 +4769,7 @@ [KNL] Number of legacy pty's. Overwrites compiled-in default number. - quiet [KNL] Disable most log messages + quiet [KNL,EARLY] Disable most log messages r128= [HW,DRM] @@ -4781,17 +4786,17 @@ ramdisk_start= [RAM] RAM disk image start address random.trust_cpu=off - [KNL] Disable trusting the use of the CPU's + [KNL,EARLY] Disable trusting the use of the CPU's random number generator (if available) to initialize the kernel's RNG. random.trust_bootloader=off - [KNL] Disable trusting the use of the a seed + [KNL,EARLY] Disable trusting the use of the a seed passed by the bootloader (if available) to initialize the kernel's RNG. randomize_kstack_offset= - [KNL] Enable or disable kernel stack offset + [KNL,EARLY] Enable or disable kernel stack offset randomization, which provides roughly 5 bits of entropy, frustrating memory corruption attacks that depend on stack address determinism or @@ -5032,6 +5037,11 @@ this kernel boot parameter, forcibly setting it to zero. + rcutree.enable_rcu_lazy= [KNL] + To save power, batch RCU callbacks and flush after + delay, memory pressure or callback list growing too + big. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). @@ -5482,7 +5492,7 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. - rdrand= [X86] + rdrand= [X86,EARLY] force - Override the decision by the kernel to hide the advertisement of RDRAND support (this affects certain AMD processors because of buggy BIOS @@ -5578,7 +5588,7 @@ them. If is less than 0x10000, the region is assumed to be I/O ports; otherwise it is memory. - reservetop= [X86-32] + reservetop= [X86-32,EARLY] Format: nn[KMG] Reserves a hole at the top of the kernel virtual address space. @@ -5663,7 +5673,7 @@ [KNL] Disable ring 3 MONITOR/MWAIT feature on supported CPUs. - riscv_isa_fallback [RISCV] + riscv_isa_fallback [RISCV,EARLY] When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit falling back to detecting extension support by parsing "riscv,isa" property on devicetree systems when the @@ -5672,13 +5682,14 @@ ro [KNL] Mount root device read-only on boot - rodata= [KNL] + rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. full Mark read-only kernel memory and aliases as read-only [arm64] rockchip.usb_uart + [EARLY] Enable the uart passthrough on the designated usb port on Rockchip SoCs. When active, the signals of the debug-uart get routed to the D+ and D- pins of the usb @@ -5739,7 +5750,7 @@ sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. - sched_verbose [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature @@ -5854,7 +5865,7 @@ non-zero "wait" parameter. See weight_single and weight_many. - skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate + skew_tick= [KNL,EARLY] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. Format: { "0" | "1" } @@ -5985,10 +5996,10 @@ 1: Fast pin select (default) 2: ATC IRMode - smt= [KNL,MIPS,S390] Set the maximum number of threads (logical - CPUs) to use per physical CPU on systems capable of - symmetric multithreading (SMT). Will be capped to the - actual hardware limit. + smt= [KNL,MIPS,S390,EARLY] Set the maximum number of threads + (logical CPUs) to use per physical CPU on systems + capable of symmetric multithreading (SMT). Will + be capped to the actual hardware limit. Format: Default: -1 (no limit) @@ -6010,7 +6021,7 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst - spectre_v2= [X86] Control mitigation of Spectre variant 2 + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from user space attacks. @@ -6025,8 +6036,8 @@ Selecting 'on' will, and 'auto' may, choose a mitigation method at run time according to the CPU, the available microcode, the setting of the - CONFIG_RETPOLINE configuration option, and the - compiler with which the kernel was built. + CONFIG_MITIGATION_RETPOLINE configuration option, + and the compiler with which the kernel was built. Selecting 'on' will also enable the mitigation against user space to user space task attacks. @@ -6090,7 +6101,7 @@ spectre_v2_user=auto. spec_rstack_overflow= - [X86] Control RAS overflow mitigation on AMD Zen CPUs + [X86,EARLY] Control RAS overflow mitigation on AMD Zen CPUs off - Disable mitigation microcode - Enable microcode mitigation only @@ -6101,7 +6112,7 @@ (cloud-specific mitigation) spec_store_bypass_disable= - [HW] Control Speculative Store Bypass (SSB) Disable mitigation + [HW,EARLY] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) Certain CPUs are vulnerable to an exploit against a @@ -6197,7 +6208,7 @@ #DB exception for bus lock is triggered only when CPL > 0. - srbds= [X86,INTEL] + srbds= [X86,INTEL,EARLY] Control the Special Register Buffer Data Sampling (SRBDS) mitigation. @@ -6284,7 +6295,7 @@ srcutree.convert_to_big must have the 0x10 bit set for contention-based conversions to occur. - ssbd= [ARM64,HW] + ssbd= [ARM64,HW,EARLY] Speculative Store Bypass Disable control On CPUs that are vulnerable to the Speculative @@ -6308,7 +6319,7 @@ growing up) the main stack are reserved for no other mapping. Default value is 256 pages. - stack_depot_disable= [KNL] + stack_depot_disable= [KNL,EARLY] Setting this to true through kernel command line will disable the stack depot thereby saving the static memory consumed by the stack hash table. By default this is set @@ -6347,12 +6358,12 @@ be used to filter out binaries which have not yet been made aware of AT_MINSIGSTKSZ. - stress_hpt [PPC] + stress_hpt [PPC,EARLY] Limits the number of kernel HPT entries in the hash page table to increase the rate of hash page table faults on kernel addresses. - stress_slb [PPC] + stress_slb [PPC,EARLY] Limits the number of kernel SLB entries, and flushes them frequently to increase the rate of SLB faults on kernel addresses. @@ -6412,7 +6423,7 @@ This parameter controls use of the Protected Execution Facility on pSeries. - swiotlb= [ARM,IA-64,PPC,MIPS,X86] + swiotlb= [ARM,IA-64,PPC,MIPS,X86,EARLY] Format: { [,] | force | noforce } -- Number of I/O TLB slabs -- Second integer after comma. Number of swiotlb @@ -6422,7 +6433,7 @@ wouldn't be automatically used by the kernel noforce -- Never use bounce buffers (for debugging) - switches= [HW,M68k] + switches= [HW,M68k,EARLY] sysctl.*= [KNL] Set a sysctl parameter, right before loading the init @@ -6481,11 +6492,11 @@ : poll all this frequency 0: no polling (default) - threadirqs [KNL] + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. - topology= [S390] + topology= [S390,EARLY] Format: {off | on} Specify if the kernel should make use of the cpu topology information if the hardware supports this. @@ -6726,7 +6737,7 @@ can be overridden by a later tsc=nowatchdog. A console message will flag any such suppression or overriding. - tsc_early_khz= [X86] Skip early TSC calibration and use the given + tsc_early_khz= [X86,EARLY] Skip early TSC calibration and use the given value instead. Useful when the early TSC frequency discovery procedure is not reliable, such as on overclocked systems with CPUID.16h support and partial CPUID.15h support. @@ -6761,7 +6772,7 @@ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst for more details. - tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async + tsx_async_abort= [X86,INTEL,EARLY] Control mitigation for the TSX Async Abort (TAA) vulnerability. Similar to Micro-architectural Data Sampling (MDS) @@ -6827,7 +6838,7 @@ unknown_nmi_panic [X86] Cause panic on unknown NMI. - unwind_debug [X86-64] + unwind_debug [X86-64,EARLY] Enable unwinder debug output. This can be useful for debugging certain unwinder error conditions, including corrupt stacks and @@ -7017,7 +7028,7 @@ Example: user_debug=31 userpte= - [X86] Flags controlling user PTE allocations. + [X86,EARLY] Flags controlling user PTE allocations. nohigh = do not allocate PTE pages in HIGHMEM regardless of setting @@ -7046,7 +7057,7 @@ vector= [IA-64,SMP] vector=percpu: enable percpu vector domain - video= [FB] Frame buffer configuration + video= [FB,EARLY] Frame buffer configuration See Documentation/fb/modedb.rst. video.brightness_switch_enabled= [ACPI] @@ -7094,13 +7105,13 @@ P Enable page structure init time poisoning - Disable all of the above options - vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact - size of . This can be used to increase the - minimum size (128MB on x86). It can also be used to - decrease the size and leave more room for directly - mapped kernel RAM. + vmalloc=nn[KMG] [KNL,BOOT,EARLY] Forces the vmalloc area to have an + exact size of . This can be used to increase + the minimum size (128MB on x86). It can also be + used to decrease the size and leave more room + for directly mapped kernel RAM. - vmcp_cma=nn[MG] [KNL,S390] + vmcp_cma=nn[MG] [KNL,S390,EARLY] Sets the memory size reserved for contiguous memory allocations for the vmcp device driver. @@ -7113,7 +7124,7 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: - vsyscall= [X86-64] + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy code). Most statically-linked binaries and older @@ -7223,6 +7234,15 @@ threshold repeatedly. They are likely good candidates for using WQ_UNBOUND workqueues instead. + workqueue.cpu_intensive_warning_thresh= + If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel + will report the work functions which violate the + intensive_threshold_us repeatedly. In order to prevent + spurious warnings, start printing only after a work + function has violated this threshold number of times. + + The default is 4 times. 0 disables the warning. + workqueue.power_efficient Per-cpu workqueues are generally preferred because they show better performance thanks to cache @@ -7261,13 +7281,13 @@ When enabled, memory and cache locality will be impacted. - writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of - ioremap_wc(). + writecombine= [LOONGARCH,EARLY] Control the MAT (Memory Access + Type) of ioremap_wc(). on - Enable writecombine, use WUC for ioremap_wc() off - Disable writecombine, use SUC for ioremap_wc() - x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of + x2apic_phys [X86-64,APIC,EARLY] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. @@ -7278,7 +7298,7 @@ save/restore/migration must be enabled to handle larger domains. - xen_emul_unplug= [HW,X86,XEN] + xen_emul_unplug= [HW,X86,XEN,EARLY] Unplug Xen emulated devices Format: [unplug0,][unplug1] ide-disks -- unplug primary master IDE devices @@ -7290,17 +7310,17 @@ the unplug protocol never -- do not unplug even if version check succeeds - xen_legacy_crash [X86,XEN] + xen_legacy_crash [X86,XEN,EARLY] Crash from Xen panic notifier, without executing late panic() code such as dumping handler. - xen_msr_safe= [X86,XEN] + xen_msr_safe= [X86,XEN,EARLY] Format: Select whether to always use non-faulting (safe) MSR access functions when running as Xen PV guest. The default value is controlled by CONFIG_XEN_PV_MSR_SAFE. - xen_nopvspin [X86,XEN] + xen_nopvspin [X86,XEN,EARLY] Disables the qspinlock slowpath using Xen PV optimizations. This parameter is obsoleted by "nopvspin" parameter, which has equivalent effect for XEN platform. @@ -7312,7 +7332,7 @@ has equivalent effect for XEN platform. xen_no_vector_callback - [KNL,X86,XEN] Disable the vector callback for Xen + [KNL,X86,XEN,EARLY] Disable the vector callback for Xen event channel interrupts. xen_scrub_pages= [XEN] @@ -7321,7 +7341,7 @@ with /sys/devices/system/xen_memory/xen_memory0/scrub_pages. Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT. - xen_timer_slop= [X86-64,XEN] + xen_timer_slop= [X86-64,XEN,EARLY] Set the timer slop (in nanoseconds) for the virtual Xen timers (default is 100000). This adjusts the minimum delta of virtualized Xen timers, where lower values @@ -7374,7 +7394,7 @@ host controller quirks. Meaning of each bit can be consulted in header drivers/usb/host/xhci.h. - xmon [PPC] + xmon [PPC,EARLY] Format: { early | on | rw | ro | off } Controls if xmon debugger is enabled. Default is off. Passing only "xmon" is equivalent to "xmon=early". diff --git a/Documentation/arch/x86/pti.rst b/Documentation/arch/x86/pti.rst index e08d35177bc0..57e8392f61d3 100644 --- a/Documentation/arch/x86/pti.rst +++ b/Documentation/arch/x86/pti.rst @@ -26,9 +26,9 @@ comments in pti.c). This approach helps to ensure that side-channel attacks leveraging the paging structures do not function when PTI is enabled. It can be -enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. -Once enabled at compile-time, it can be disabled at boot with the -'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). +enabled by setting CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y at compile +time. Once enabled at compile-time, it can be disabled at boot with +the 'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). Page Table Management ===================== diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 08ebf9edbfc1..7352ab89a55a 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -47,17 +47,21 @@ AMD nomenclature for package is 'Node'. Package-related topology information in the kernel: - - cpuinfo_x86.x86_max_cores: + - topology_num_threads_per_package() - The number of cores in a package. This information is retrieved via CPUID. + The number of threads in a package. - - cpuinfo_x86.x86_max_dies: + - topology_num_cores_per_package() - The number of dies in a package. This information is retrieved via CPUID. + The number of cores in a package. + + - topology_max_dies_per_package() + + The maximum number of dies in a package. - cpuinfo_x86.topo.die_id: - The physical ID of the die. This information is retrieved via CPUID. + The physical ID of the die. - cpuinfo_x86.topo.pkg_id: @@ -96,16 +100,6 @@ are SMT- or CMT-type threads. AMDs nomenclature for a CMT core is "Compute Unit". The kernel always uses "core". -Core-related topology information in the kernel: - - - smp_num_siblings: - - The number of threads in a core. The number of threads in a package can be - calculated by:: - - threads_per_package = cpuinfo_x86.x86_max_cores * smp_num_siblings - - Threads ======= A thread is a single scheduling unit. It's the equivalent to a logical Linux diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst new file mode 100644 index 000000000000..9f57e7b91f7e --- /dev/null +++ b/Documentation/arch/x86/x86_64/fred.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Flexible Return and Event Delivery (FRED) +========================================= + +Overview +======== + +The FRED architecture defines simple new transitions that change +privilege level (ring transitions). The FRED architecture was +designed with the following goals: + +1) Improve overall performance and response time by replacing event + delivery through the interrupt descriptor table (IDT event + delivery) and event return by the IRET instruction with lower + latency transitions. + +2) Improve software robustness by ensuring that event delivery + establishes the full supervisor context and that event return + establishes the full user context. + +The new transitions defined by the FRED architecture are FRED event +delivery and, for returning from events, two FRED return instructions. +FRED event delivery can effect a transition from ring 3 to ring 0, but +it is used also to deliver events incident to ring 0. One FRED +instruction (ERETU) effects a return from ring 0 to ring 3, while the +other (ERETS) returns while remaining in ring 0. Collectively, FRED +event delivery and the FRED return instructions are FRED transitions. + +In addition to these transitions, the FRED architecture defines a new +instruction (LKGS) for managing the state of the GS segment register. +The LKGS instruction can be used by 64-bit operating systems that do +not use the new FRED transitions. + +Furthermore, the FRED architecture is easy to extend for future CPU +architectures. + +Software based event dispatching +================================ + +FRED operates differently from IDT in terms of event handling. Instead +of directly dispatching an event to its handler based on the event +vector, FRED requires the software to dispatch an event to its handler +based on both the event's type and vector. Therefore, an event dispatch +framework must be implemented to facilitate the event-to-handler +dispatch process. The FRED event dispatch framework takes control +once an event is delivered, and employs a two-level dispatch. + +The first level dispatching is event type based, and the second level +dispatching is event vector based. + +Full supervisor/user context +============================ + +FRED event delivery atomically save and restore full supervisor/user +context upon event delivery and return. Thus it avoids the problem of +transient states due to %cr2 and/or %dr6, and it is no longer needed +to handle all the ugly corner cases caused by half baked entry states. + +FRED allows explicit unblock of NMI with new event return instructions +ERETS/ERETU, avoiding the mess caused by IRET which unconditionally +unblocks NMI, e.g., when an exception happens during NMI handling. + +FRED always restores the full value of %rsp, thus ESPFIX is no longer +needed when FRED is enabled. + +LKGS +==== + +LKGS behaves like the MOV to GS instruction except that it loads the +base address into the IA32_KERNEL_GS_BASE MSR instead of the GS +segment’s descriptor cache. With LKGS, it ends up with avoiding +mucking with kernel GS, i.e., an operating system can always operate +with its own GS base address. + +Because FRED event delivery from ring 3 and ERETU both swap the value +of the GS base address and that of the IA32_KERNEL_GS_BASE MSR, plus +the introduction of LKGS instruction, the SWAPGS instruction is no +longer needed when FRED is enabled, thus is disallowed (#UD). + +Stack levels +============ + +4 stack levels 0~3 are introduced to replace the nonreentrant IST for +event handling, and each stack level should be configured to use a +dedicated stack. + +The current stack level could be unchanged or go higher upon FRED +event delivery. If unchanged, the CPU keeps using the current event +stack. If higher, the CPU switches to a new event stack specified by +the MSR of the new stack level, i.e., MSR_IA32_FRED_RSP[123]. + +Only execution of a FRED return instruction ERET[US], could lower the +current stack level, causing the CPU to switch back to the stack it was +on before a previous event delivery that promoted the stack level. diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst index a56070fc8e77..ad15e9bd623f 100644 --- a/Documentation/arch/x86/x86_64/index.rst +++ b/Documentation/arch/x86/x86_64/index.rst @@ -15,3 +15,4 @@ x86_64 Support cpu-hotplug-spec machinecheck fsgs + fred diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 3599cf9267b4..ed73c612174d 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -77,10 +77,12 @@ wants a function to be executed asynchronously it has to set up a work item pointing to that function and queue that work item on a workqueue. -Special purpose threads, called worker threads, execute the functions -off of the queue, one after the other. If no work is queued, the -worker threads become idle. These worker threads are managed in so -called worker-pools. +A work item can be executed in either a thread or the BH (softirq) context. + +For threaded workqueues, special purpose threads, called [k]workers, execute +the functions off of the queue, one after the other. If no work is queued, +the worker threads become idle. These worker threads are managed in +worker-pools. The cmwq design differentiates between the user-facing workqueues that subsystems and drivers queue work items on and the backend mechanism @@ -91,6 +93,12 @@ for high priority ones, for each possible CPU and some extra worker-pools to serve work items queued on unbound workqueues - the number of these backing pools is dynamic. +BH workqueues use the same framework. However, as there can only be one +concurrent execution context, there's no need to worry about concurrency. +Each per-CPU BH worker pool contains only one pseudo worker which represents +the BH execution context. A BH workqueue can be considered a convenience +interface to softirq. + Subsystems and drivers can create and queue work items through special workqueue API functions as they see fit. They can influence some aspects of the way the work items are executed by setting flags on the @@ -106,7 +114,7 @@ unless specifically overridden, a work item of a bound workqueue will be queued on the worklist of either normal or highpri worker-pool that is associated to the CPU the issuer is running on. -For any worker pool implementation, managing the concurrency level +For any thread pool implementation, managing the concurrency level (how many execution contexts are active) is an important issue. cmwq tries to keep the concurrency at a minimal but sufficient level. Minimal to save resources and sufficient in that the system is used at @@ -164,6 +172,17 @@ resources, scheduled and executed. ``flags`` --------- +``WQ_BH`` + BH workqueues can be considered a convenience interface to softirq. BH + workqueues are always per-CPU and all BH work items are executed in the + queueing CPU's softirq context in the queueing order. + + All BH workqueues must have 0 ``max_active`` and ``WQ_HIGHPRI`` is the + only allowed additional flag. + + BH work items cannot sleep. All other features such as delayed queueing, + flushing and canceling are supported. + ``WQ_UNBOUND`` Work items queued to an unbound wq are served by the special worker-pools which host workers which are not bound to any @@ -237,15 +256,11 @@ may queue at the same time. Unless there is a specific need for throttling the number of active work items, specifying '0' is recommended. -Some users depend on the strict execution ordering of ST wq. The -combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` used to -achieve this behavior. Work items on such wq were always queued to the -unbound worker-pools and only one work item could be active at any given -time thus achieving the same ordering property as ST wq. - -In the current implementation the above configuration only guarantees -ST behavior within a given NUMA node. Instead ``alloc_ordered_workqueue()`` should -be used to achieve system-wide ST behavior. +Some users depend on strict execution ordering where only one work item +is in flight at any given time and the work items are processed in +queueing order. While the combination of ``@max_active`` of 1 and +``WQ_UNBOUND`` used to achieve this behavior, this is no longer the +case. Use ``alloc_ordered_queue()`` instead. Example Execution Scenarios diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index ab376b316c36..7f3582a67318 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -245,6 +245,10 @@ Contributing new tests (details) TEST_PROGS, TEST_GEN_PROGS mean it is the executable tested by default. + TEST_GEN_MODS_DIR should be used by tests that require modules to be built + before the test starts. The variable will contain the name of the directory + containing the modules. + TEST_CUSTOM_PROGS should be used by tests that require custom build rules and prevent common build rule use. diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml index 3d06db98e978..a93744763787 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml @@ -36,6 +36,7 @@ properties: - amlogic,meson-a1-gpio-intc - amlogic,meson-s4-gpio-intc - amlogic,c3-gpio-intc + - amlogic,t7-gpio-intc - const: amlogic,meson-gpio-intc reg: diff --git a/Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml new file mode 100644 index 000000000000..ada5788602d6 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interrupt-controller/starfive,jh8100-intc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: StarFive External Interrupt Controller + +description: + StarFive SoC JH8100 contain a external interrupt controller. It can be used + to handle high-level input interrupt signals. It also send the output + interrupt signal to RISC-V PLIC. + +maintainers: + - Changhuang Liang + +properties: + compatible: + const: starfive,jh8100-intc + + reg: + maxItems: 1 + + clocks: + description: APB clock for the interrupt controller + maxItems: 1 + + resets: + description: APB reset for the interrupt controller + maxItems: 1 + + interrupts: + maxItems: 1 + + interrupt-controller: true + + "#interrupt-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - resets + - interrupts + - interrupt-controller + - "#interrupt-cells" + +additionalProperties: false + +examples: + - | + interrupt-controller@12260000 { + compatible = "starfive,jh8100-intc"; + reg = <0x12260000 0x10000>; + clocks = <&syscrg_ne 76>; + resets = <&syscrg_ne 13>; + interrupts = <45>; + interrupt-controller; + #interrupt-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml b/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml index 475aff7714d6..ea35d19be829 100644 --- a/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml +++ b/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml @@ -65,9 +65,11 @@ properties: rx-internal-delay-ps: enum: [0, 1800] + default: 0 tx-internal-delay-ps: enum: [0, 2000] + default: 0 '#address-cells': const: 1 diff --git a/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml b/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml index c29d7942915c..241d20f3aad0 100644 --- a/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml +++ b/Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml @@ -64,7 +64,7 @@ examples: #include #include sound { - compatible = "lge,tegra-audio-max98089-p895", + compatible = "lg,tegra-audio-max98089-p895", "nvidia,tegra-audio-max98089"; nvidia,model = "LG Optimus Vu MAX98089"; diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index e3d593841aa7..ea8d16600e16 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -545,7 +545,7 @@ In such scenario, dpll device input signal shall be also configurable to drive dpll with signal recovered from the PHY netdevice. This is done by exposing a pin to the netdevice - attaching pin to the netdevice itself with -``netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. +``dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``. Exposed pin id handle ``DPLL_A_PIN_ID`` is then identifiable by the user as it is attached to rtnetlink respond to get ``RTM_NEWLINK`` command in nested attribute ``IFLA_DPLL_PIN``. diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index 9e38e4c221ca..eb770f891b27 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -116,7 +116,7 @@ before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file -without first aqcuiring a reference on it under rcu lookup. Not doing +without first acquiring a reference on it under rcu lookup. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers either first acquire a reference or they must hold the files_lock of the diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e18bc5ae3b35..0ea1e44fa028 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -98,7 +98,6 @@ Documentation for filesystem implementations. isofs nilfs2 nfs/index - ntfs ntfs3 ocfs2 ocfs2-online-filecheck diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index d5bf4b6b7509..e664061ed55d 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -29,7 +29,7 @@ prototypes:: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); locking rules: diff --git a/Documentation/filesystems/ntfs.rst b/Documentation/filesystems/ntfs.rst deleted file mode 100644 index 5bb093a26485..000000000000 --- a/Documentation/filesystems/ntfs.rst +++ /dev/null @@ -1,466 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================================ -The Linux NTFS filesystem driver -================================ - - -.. Table of contents - - - Overview - - Web site - - Features - - Supported mount options - - Known bugs and (mis-)features - - Using NTFS volume and stripe sets - - The Device-Mapper driver - - The Software RAID / MD driver - - Limitations when using the MD driver - - -Overview -======== - -Linux-NTFS comes with a number of user-space programs known as ntfsprogs. -These include mkntfs, a full-featured ntfs filesystem format utility, -ntfsundelete used for recovering files that were unintentionally deleted -from an NTFS volume and ntfsresize which is used to resize an NTFS partition. -See the web site for more information. - -To mount an NTFS 1.2/3.x (Windows NT4/2000/XP/2003) volume, use the file -system type 'ntfs'. The driver currently supports read-only mode (with no -fault-tolerance, encryption or journalling) and very limited, but safe, write -support. - -For fault tolerance and raid support (i.e. volume and stripe sets), you can -use the kernel's Software RAID / MD driver. See section "Using Software RAID -with NTFS" for details. - - -Web site -======== - -There is plenty of additional information on the linux-ntfs web site -at http://www.linux-ntfs.org/ - -The web site has a lot of additional information, such as a comprehensive -FAQ, documentation on the NTFS on-disk format, information on the Linux-NTFS -userspace utilities, etc. - - -Features -======== - -- This is a complete rewrite of the NTFS driver that used to be in the 2.4 and - earlier kernels. This new driver implements NTFS read support and is - functionally equivalent to the old ntfs driver and it also implements limited - write support. The biggest limitation at present is that files/directories - cannot be created or deleted. See below for the list of write features that - are so far supported. Another limitation is that writing to compressed files - is not implemented at all. Also, neither read nor write access to encrypted - files is so far implemented. -- The new driver has full support for sparse files on NTFS 3.x volumes which - the old driver isn't happy with. -- The new driver supports execution of binaries due to mmap() now being - supported. -- The new driver supports loopback mounting of files on NTFS which is used by - some Linux distributions to enable the user to run Linux from an NTFS - partition by creating a large file while in Windows and then loopback - mounting the file while in Linux and creating a Linux filesystem on it that - is used to install Linux on it. -- A comparison of the two drivers using:: - - time find . -type f -exec md5sum "{}" \; - - run three times in sequence with each driver (after a reboot) on a 1.4GiB - NTFS partition, showed the new driver to be 20% faster in total time elapsed - (from 9:43 minutes on average down to 7:53). The time spent in user space - was unchanged but the time spent in the kernel was decreased by a factor of - 2.5 (from 85 CPU seconds down to 33). -- The driver does not support short file names in general. For backwards - compatibility, we implement access to files using their short file names if - they exist. The driver will not create short file names however, and a - rename will discard any existing short file name. -- The new driver supports exporting of mounted NTFS volumes via NFS. -- The new driver supports async io (aio). -- The new driver supports fsync(2), fdatasync(2), and msync(2). -- The new driver supports readv(2) and writev(2). -- The new driver supports access time updates (including mtime and ctime). -- The new driver supports truncate(2) and open(2) with O_TRUNC. But at present - only very limited support for highly fragmented files, i.e. ones which have - their data attribute split across multiple extents, is included. Another - limitation is that at present truncate(2) will never create sparse files, - since to mark a file sparse we need to modify the directory entry for the - file and we do not implement directory modifications yet. -- The new driver supports write(2) which can both overwrite existing data and - extend the file size so that you can write beyond the existing data. Also, - writing into sparse regions is supported and the holes are filled in with - clusters. But at present only limited support for highly fragmented files, - i.e. ones which have their data attribute split across multiple extents, is - included. Another limitation is that write(2) will never create sparse - files, since to mark a file sparse we need to modify the directory entry for - the file and we do not implement directory modifications yet. - -Supported mount options -======================= - -In addition to the generic mount options described by the manual page for the -mount command (man 8 mount, also see man 5 fstab), the NTFS driver supports the -following mount options: - -======================= ======================================================= -iocharset=name Deprecated option. Still supported but please use - nls=name in the future. See description for nls=name. - -nls=name Character set to use when returning file names. - Unlike VFAT, NTFS suppresses names that contain - unconvertible characters. Note that most character - sets contain insufficient characters to represent all - possible Unicode characters that can exist on NTFS. - To be sure you are not missing any files, you are - advised to use nls=utf8 which is capable of - representing all Unicode characters. - -utf8= Option no longer supported. Currently mapped to - nls=utf8 but please use nls=utf8 in the future and - make sure utf8 is compiled either as module or into - the kernel. See description for nls=name. - -uid= -gid= -umask= Provide default owner, group, and access mode mask. - These options work as documented in mount(8). By - default, the files/directories are owned by root and - he/she has read and write permissions, as well as - browse permission for directories. No one else has any - access permissions. I.e. the mode on all files is by - default rw------- and for directories rwx------, a - consequence of the default fmask=0177 and dmask=0077. - Using a umask of zero will grant all permissions to - everyone, i.e. all files and directories will have mode - rwxrwxrwx. - -fmask= -dmask= Instead of specifying umask which applies both to - files and directories, fmask applies only to files and - dmask only to directories. - -sloppy= If sloppy is specified, ignore unknown mount options. - Otherwise the default behaviour is to abort mount if - any unknown options are found. - -show_sys_files= If show_sys_files is specified, show the system files - in directory listings. Otherwise the default behaviour - is to hide the system files. - Note that even when show_sys_files is specified, "$MFT" - will not be visible due to bugs/mis-features in glibc. - Further, note that irrespective of show_sys_files, all - files are accessible by name, i.e. you can always do - "ls -l \$UpCase" for example to specifically show the - system file containing the Unicode upcase table. - -case_sensitive= If case_sensitive is specified, treat all file names as - case sensitive and create file names in the POSIX - namespace. Otherwise the default behaviour is to treat - file names as case insensitive and to create file names - in the WIN32/LONG name space. Note, the Linux NTFS - driver will never create short file names and will - remove them on rename/delete of the corresponding long - file name. - Note that files remain accessible via their short file - name, if it exists. If case_sensitive, you will need - to provide the correct case of the short file name. - -disable_sparse= If disable_sparse is specified, creation of sparse - regions, i.e. holes, inside files is disabled for the - volume (for the duration of this mount only). By - default, creation of sparse regions is enabled, which - is consistent with the behaviour of traditional Unix - filesystems. - -errors=opt What to do when critical filesystem errors are found. - Following values can be used for "opt": - - ======== ========================================= - continue DEFAULT, try to clean-up as much as - possible, e.g. marking a corrupt inode as - bad so it is no longer accessed, and then - continue. - recover At present only supported is recovery of - the boot sector from the backup copy. - If read-only mount, the recovery is done - in memory only and not written to disk. - ======== ========================================= - - Note that the options are additive, i.e. specifying:: - - errors=continue,errors=recover - - means the driver will attempt to recover and if that - fails it will clean-up as much as possible and - continue. - -mft_zone_multiplier= Set the MFT zone multiplier for the volume (this - setting is not persistent across mounts and can be - changed from mount to mount but cannot be changed on - remount). Values of 1 to 4 are allowed, 1 being the - default. The MFT zone multiplier determines how much - space is reserved for the MFT on the volume. If all - other space is used up, then the MFT zone will be - shrunk dynamically, so this has no impact on the - amount of free space. However, it can have an impact - on performance by affecting fragmentation of the MFT. - In general use the default. If you have a lot of small - files then use a higher value. The values have the - following meaning: - - ===== ================================= - Value MFT zone size (% of volume size) - ===== ================================= - 1 12.5% - 2 25% - 3 37.5% - 4 50% - ===== ================================= - - Note this option is irrelevant for read-only mounts. -======================= ======================================================= - - -Known bugs and (mis-)features -============================= - -- The link count on each directory inode entry is set to 1, due to Linux not - supporting directory hard links. This may well confuse some user space - applications, since the directory names will have the same inode numbers. - This also speeds up ntfs_read_inode() immensely. And we haven't found any - problems with this approach so far. If you find a problem with this, please - let us know. - - -Please send bug reports/comments/feedback/abuse to the Linux-NTFS development -list at sourceforge: linux-ntfs-dev@lists.sourceforge.net - - -Using NTFS volume and stripe sets -================================= - -For support of volume and stripe sets, you can either use the kernel's -Device-Mapper driver or the kernel's Software RAID / MD driver. The former is -the recommended one to use for linear raid. But the latter is required for -raid level 5. For striping and mirroring, either driver should work fine. - - -The Device-Mapper driver ------------------------- - -You will need to create a table of the components of the volume/stripe set and -how they fit together and load this into the kernel using the dmsetup utility -(see man 8 dmsetup). - -Linear volume sets, i.e. linear raid, has been tested and works fine. Even -though untested, there is no reason why stripe sets, i.e. raid level 0, and -mirrors, i.e. raid level 1 should not work, too. Stripes with parity, i.e. -raid level 5, unfortunately cannot work yet because the current version of the -Device-Mapper driver does not support raid level 5. You may be able to use the -Software RAID / MD driver for raid level 5, see the next section for details. - -To create the table describing your volume you will need to know each of its -components and their sizes in sectors, i.e. multiples of 512-byte blocks. - -For NT4 fault tolerant volumes you can obtain the sizes using fdisk. So for -example if one of your partitions is /dev/hda2 you would do:: - - $ fdisk -ul /dev/hda - - Disk /dev/hda: 81.9 GB, 81964302336 bytes - 255 heads, 63 sectors/track, 9964 cylinders, total 160086528 sectors - Units = sectors of 1 * 512 = 512 bytes - - Device Boot Start End Blocks Id System - /dev/hda1 * 63 4209029 2104483+ 83 Linux - /dev/hda2 4209030 37768814 16779892+ 86 NTFS - /dev/hda3 37768815 46170809 4200997+ 83 Linux - -And you would know that /dev/hda2 has a size of 37768814 - 4209030 + 1 = -33559785 sectors. - -For Win2k and later dynamic disks, you can for example use the ldminfo utility -which is part of the Linux LDM tools (the latest version at the time of -writing is linux-ldm-0.0.8.tar.bz2). You can download it from: - - http://www.linux-ntfs.org/ - -Simply extract the downloaded archive (tar xvjf linux-ldm-0.0.8.tar.bz2), go -into it (cd linux-ldm-0.0.8) and change to the test directory (cd test). You -will find the precompiled (i386) ldminfo utility there. NOTE: You will not be -able to compile this yourself easily so use the binary version! - -Then you would use ldminfo in dump mode to obtain the necessary information:: - - $ ./ldminfo --dump /dev/hda - -This would dump the LDM database found on /dev/hda which describes all of your -dynamic disks and all the volumes on them. At the bottom you will see the -VOLUME DEFINITIONS section which is all you really need. You may need to look -further above to determine which of the disks in the volume definitions is -which device in Linux. Hint: Run ldminfo on each of your dynamic disks and -look at the Disk Id close to the top of the output for each (the PRIVATE HEADER -section). You can then find these Disk Ids in the VBLK DATABASE section in the - components where you will get the LDM Name for the disk that is found in -the VOLUME DEFINITIONS section. - -Note you will also need to enable the LDM driver in the Linux kernel. If your -distribution did not enable it, you will need to recompile the kernel with it -enabled. This will create the LDM partitions on each device at boot time. You -would then use those devices (for /dev/hda they would be /dev/hda1, 2, 3, etc) -in the Device-Mapper table. - -You can also bypass using the LDM driver by using the main device (e.g. -/dev/hda) and then using the offsets of the LDM partitions into this device as -the "Start sector of device" when creating the table. Once again ldminfo would -give you the correct information to do this. - -Assuming you know all your devices and their sizes things are easy. - -For a linear raid the table would look like this (note all values are in -512-byte sectors):: - - # Offset into Size of this Raid type Device Start sector - # volume device of device - 0 1028161 linear /dev/hda1 0 - 1028161 3903762 linear /dev/hdb2 0 - 4931923 2103211 linear /dev/hdc1 0 - -For a striped volume, i.e. raid level 0, you will need to know the chunk size -you used when creating the volume. Windows uses 64kiB as the default, so it -will probably be this unless you changes the defaults when creating the array. - -For a raid level 0 the table would look like this (note all values are in -512-byte sectors):: - - # Offset Size Raid Number Chunk 1st Start 2nd Start - # into of the type of size Device in Device in - # volume volume stripes device device - 0 2056320 striped 2 128 /dev/hda1 0 /dev/hdb1 0 - -If there are more than two devices, just add each of them to the end of the -line. - -Finally, for a mirrored volume, i.e. raid level 1, the table would look like -this (note all values are in 512-byte sectors):: - - # Ofs Size Raid Log Number Region Should Number Source Start Target Start - # in of the type type of log size sync? of Device in Device in - # vol volume params mirrors Device Device - 0 2056320 mirror core 2 16 nosync 2 /dev/hda1 0 /dev/hdb1 0 - -If you are mirroring to multiple devices you can specify further targets at the -end of the line. - -Note the "Should sync?" parameter "nosync" means that the two mirrors are -already in sync which will be the case on a clean shutdown of Windows. If the -mirrors are not clean, you can specify the "sync" option instead of "nosync" -and the Device-Mapper driver will then copy the entirety of the "Source Device" -to the "Target Device" or if you specified multiple target devices to all of -them. - -Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1), -and hand it over to dmsetup to work with, like so:: - - $ dmsetup create myvolume1 /etc/ntfsvolume1 - -You can obviously replace "myvolume1" with whatever name you like. - -If it all worked, you will now have the device /dev/device-mapper/myvolume1 -which you can then just use as an argument to the mount command as usual to -mount the ntfs volume. For example:: - - $ mount -t ntfs -o ro /dev/device-mapper/myvolume1 /mnt/myvol1 - -(You need to create the directory /mnt/myvol1 first and of course you can use -anything you like instead of /mnt/myvol1 as long as it is an existing -directory.) - -It is advisable to do the mount read-only to see if the volume has been setup -correctly to avoid the possibility of causing damage to the data on the ntfs -volume. - - -The Software RAID / MD driver ------------------------------ - -An alternative to using the Device-Mapper driver is to use the kernel's -Software RAID / MD driver. For which you need to set up your /etc/raidtab -appropriately (see man 5 raidtab). - -Linear volume sets, i.e. linear raid, as well as stripe sets, i.e. raid level -0, have been tested and work fine (though see section "Limitations when using -the MD driver with NTFS volumes" especially if you want to use linear raid). -Even though untested, there is no reason why mirrors, i.e. raid level 1, and -stripes with parity, i.e. raid level 5, should not work, too. - -You have to use the "persistent-superblock 0" option for each raid-disk in the -NTFS volume/stripe you are configuring in /etc/raidtab as the persistent -superblock used by the MD driver would damage the NTFS volume. - -Windows by default uses a stripe chunk size of 64k, so you probably want the -"chunk-size 64k" option for each raid-disk, too. - -For example, if you have a stripe set consisting of two partitions /dev/hda5 -and /dev/hdb1 your /etc/raidtab would look like this:: - - raiddev /dev/md0 - raid-level 0 - nr-raid-disks 2 - nr-spare-disks 0 - persistent-superblock 0 - chunk-size 64k - device /dev/hda5 - raid-disk 0 - device /dev/hdb1 - raid-disk 1 - -For linear raid, just change the raid-level above to "raid-level linear", for -mirrors, change it to "raid-level 1", and for stripe sets with parity, change -it to "raid-level 5". - -Note for stripe sets with parity you will also need to tell the MD driver -which parity algorithm to use by specifying the option "parity-algorithm -which", where you need to replace "which" with the name of the algorithm to -use (see man 5 raidtab for available algorithms) and you will have to try the -different available algorithms until you find one that works. Make sure you -are working read-only when playing with this as you may damage your data -otherwise. If you find which algorithm works please let us know (email the -linux-ntfs developers list linux-ntfs-dev@lists.sourceforge.net or drop in on -IRC in channel #ntfs on the irc.freenode.net network) so we can update this -documentation. - -Once the raidtab is setup, run for example raid0run -a to start all devices or -raid0run /dev/md0 to start a particular md device, in this case /dev/md0. - -Then just use the mount command as usual to mount the ntfs volume using for -example:: - - mount -t ntfs -o ro /dev/md0 /mnt/myntfsvolume - -It is advisable to do the mount read-only to see if the md volume has been -setup correctly to avoid the possibility of causing damage to the data on the -ntfs volume. - - -Limitations when using the Software RAID / MD driver ------------------------------------------------------ - -Using the md driver will not work properly if any of your NTFS partitions have -an odd number of sectors. This is especially important for linear raid as all -data after the first partition with an odd number of sectors will be offset by -one or more sectors so if you mount such a partition with write support you -will cause massive damage to the data on the volume which will only become -apparent when you try to use the volume again under Windows. - -So when using linear raid, make sure that all your partitions have an even -number of sectors BEFORE attempting to use it. You have been warned! - -Even better is to simply use the Device-Mapper for linear raid and then you do -not have this problem with odd numbers of sectors. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index eebcc0f9e2bc..6e903a903f8f 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1264,7 +1264,7 @@ defined: char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); }; ``d_revalidate`` @@ -1419,16 +1419,14 @@ defined: the dentry being transited from. ``d_real`` - overlay/union type filesystems implement this method to return - one of the underlying dentries hidden by the overlay. It is - used in two different modes: + overlay/union type filesystems implement this method to return one + of the underlying dentries of a regular file hidden by the overlay. - Called from file_dentry() it returns the real dentry matching - the inode argument. The real dentry may be from a lower layer - already copied up, but still referenced from the file. This - mode is selected with a non-NULL inode argument. + The 'type' argument takes the values D_REAL_DATA or D_REAL_METADATA + for returning the real underlying dentry that refers to the inode + hosting the file's data or metadata respectively. - With NULL inode the topmost real underlying dentry is returned. + For non-regular files, the 'dentry' argument is returned. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a diff --git a/Documentation/index.rst b/Documentation/index.rst index 36e61783437c..9dfdc826618c 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,7 +113,6 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index - RAS/ras Translations diff --git a/Documentation/networking/net_cachelines/inet_sock.rst b/Documentation/networking/net_cachelines/inet_sock.rst index a2babd0d7954..595d7ef5fc8b 100644 --- a/Documentation/networking/net_cachelines/inet_sock.rst +++ b/Documentation/networking/net_cachelines/inet_sock.rst @@ -1,9 +1,9 @@ .. SPDX-License-Identifier: GPL-2.0 .. Copyright (C) 2023 Google LLC -===================================================== -inet_connection_sock struct fast path usage breakdown -===================================================== +========================================== +inet_sock struct fast path usage breakdown +========================================== Type Name fastpath_tx_access fastpath_rx_access comment ..struct ..inet_sock diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 50b3d1cb1115..c78ecc1e176f 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -31,7 +31,7 @@ you probably needn't concern yourself with pcmciautils. ====================== =============== ======================================== GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version -Rust (optional) 1.74.1 rustc --version +Rust (optional) 1.76.0 rustc --version bindgen (optional) 0.65.1 bindgen --version GNU make 3.82 make --version bash 4.2 bash --version diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index 08dd0f804410..497bb39727c8 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -304,13 +304,15 @@ following tag ordering scheme: - Reported-by: ``Reporter `` + - Closes: ``URL or Message-ID of the bug report this is fixing`` + - Originally-by: ``Original author `` - Suggested-by: ``Suggester `` - Co-developed-by: ``Co-author `` - Signed-off: ``Co-author `` + Signed-off-by: ``Co-author `` Note, that Co-developed-by and Signed-off-by of the co-author(s) must come in pairs. @@ -478,7 +480,7 @@ Multi-line comments:: * Larger multi-line comments should be split into paragraphs. */ -No tail comments: +No tail comments (see below): Please refrain from using tail comments. Tail comments disturb the reading flow in almost all contexts, but especially in code:: @@ -499,6 +501,34 @@ No tail comments: /* This magic initialization needs a comment. Maybe not? */ seed = MAGIC_CONSTANT; + Use C++ style, tail comments when documenting structs in headers to + achieve a more compact layout and better readability:: + + // eax + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right + // for the topology ID at the next level + : 27; // Reserved + // ebx + u32 num_processors : 16, // Number of processors at current level + : 16; // Reserved + + versus:: + + /* eax */ + /* + * Number of bits to shift APIC ID right for the topology ID + * at the next level + */ + u32 x2apic_shift : 5, + /* Reserved */ + : 27; + + /* ebx */ + /* Number of processors at current level */ + u32 num_processors : 16, + /* Reserved */ + : 16; + Comment the important things: Comments should be added where the operation is not obvious. Documenting diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst index 236c6dd3c647..081397827a7e 100644 --- a/Documentation/rust/general-information.rst +++ b/Documentation/rust/general-information.rst @@ -77,27 +77,3 @@ configuration: #[cfg(CONFIG_X="y")] // Enabled as a built-in (`y`) #[cfg(CONFIG_X="m")] // Enabled as a module (`m`) #[cfg(not(CONFIG_X))] // Disabled - - -Testing -------- - -There are the tests that come from the examples in the Rust documentation -and get transformed into KUnit tests. These can be run via KUnit. For example -via ``kunit_tool`` (``kunit.py``) on the command line:: - - ./tools/testing/kunit/kunit.py run --make_options LLVM=1 --arch x86_64 --kconfig_add CONFIG_RUST=y - -Alternatively, KUnit can run them as kernel built-in at boot. Refer to -Documentation/dev-tools/kunit/index.rst for the general KUnit documentation -and Documentation/dev-tools/kunit/architecture.rst for the details of kernel -built-in vs. command line testing. - -Additionally, there are the ``#[test]`` tests. These can be run using -the ``rusttest`` Make target:: - - make LLVM=1 rusttest - -This requires the kernel ``.config`` and downloads external repositories. -It runs the ``#[test]`` tests on the host (currently) and thus is fairly -limited in what these tests can test. diff --git a/Documentation/rust/index.rst b/Documentation/rust/index.rst index 965f2db529e0..46d35bd395cf 100644 --- a/Documentation/rust/index.rst +++ b/Documentation/rust/index.rst @@ -40,6 +40,7 @@ configurations. general-information coding-guidelines arch-support + testing .. only:: subproject and html diff --git a/Documentation/rust/testing.rst b/Documentation/rust/testing.rst new file mode 100644 index 000000000000..6658998d1b6c --- /dev/null +++ b/Documentation/rust/testing.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Testing +======= + +This document contains useful information how to test the Rust code in the +kernel. + +There are two sorts of tests: + +- The KUnit tests. +- The ``#[test]`` tests. + +The KUnit tests +--------------- + +These are the tests that come from the examples in the Rust documentation. They +get transformed into KUnit tests. + +Usage +***** + +These tests can be run via KUnit. For example via ``kunit_tool`` (``kunit.py``) +on the command line:: + + ./tools/testing/kunit/kunit.py run --make_options LLVM=1 --arch x86_64 --kconfig_add CONFIG_RUST=y + +Alternatively, KUnit can run them as kernel built-in at boot. Refer to +Documentation/dev-tools/kunit/index.rst for the general KUnit documentation +and Documentation/dev-tools/kunit/architecture.rst for the details of kernel +built-in vs. command line testing. + +To use these KUnit doctests, the following must be enabled:: + + CONFIG_KUNIT + Kernel hacking -> Kernel Testing and Coverage -> KUnit - Enable support for unit tests + CONFIG_RUST_KERNEL_DOCTESTS + Kernel hacking -> Rust hacking -> Doctests for the `kernel` crate + +in the kernel config system. + +KUnit tests are documentation tests +*********************************** + +These documentation tests are typically examples of usage of any item (e.g. +function, struct, module...). + +They are very convenient because they are just written alongside the +documentation. For instance: + +.. code-block:: rust + + /// Sums two numbers. + /// + /// ``` + /// assert_eq!(mymod::f(10, 20), 30); + /// ``` + pub fn f(a: i32, b: i32) -> i32 { + a + b + } + +In userspace, the tests are collected and run via ``rustdoc``. Using the tool +as-is would be useful already, since it allows verifying that examples compile +(thus enforcing they are kept in sync with the code they document) and as well +as running those that do not depend on in-kernel APIs. + +For the kernel, however, these tests get transformed into KUnit test suites. +This means that doctests get compiled as Rust kernel objects, allowing them to +run against a built kernel. + +A benefit of this KUnit integration is that Rust doctests get to reuse existing +testing facilities. For instance, the kernel log would look like:: + + KTAP version 1 + 1..1 + KTAP version 1 + # Subtest: rust_doctests_kernel + 1..59 + # rust_doctest_kernel_build_assert_rs_0.location: rust/kernel/build_assert.rs:13 + ok 1 rust_doctest_kernel_build_assert_rs_0 + # rust_doctest_kernel_build_assert_rs_1.location: rust/kernel/build_assert.rs:56 + ok 2 rust_doctest_kernel_build_assert_rs_1 + # rust_doctest_kernel_init_rs_0.location: rust/kernel/init.rs:122 + ok 3 rust_doctest_kernel_init_rs_0 + ... + # rust_doctest_kernel_types_rs_2.location: rust/kernel/types.rs:150 + ok 59 rust_doctest_kernel_types_rs_2 + # rust_doctests_kernel: pass:59 fail:0 skip:0 total:59 + # Totals: pass:59 fail:0 skip:0 total:59 + ok 1 rust_doctests_kernel + +Tests using the `? `_ +operator are also supported as usual, e.g.: + +.. code-block:: rust + + /// ``` + /// # use kernel::{spawn_work_item, workqueue}; + /// spawn_work_item!(workqueue::system(), || pr_info!("x"))?; + /// # Ok::<(), Error>(()) + /// ``` + +The tests are also compiled with Clippy under ``CLIPPY=1``, just like normal +code, thus also benefitting from extra linting. + +In order for developers to easily see which line of doctest code caused a +failure, a KTAP diagnostic line is printed to the log. This contains the +location (file and line) of the original test (i.e. instead of the location in +the generated Rust file):: + + # rust_doctest_kernel_types_rs_2.location: rust/kernel/types.rs:150 + +Rust tests appear to assert using the usual ``assert!`` and ``assert_eq!`` +macros from the Rust standard library (``core``). We provide a custom version +that forwards the call to KUnit instead. Importantly, these macros do not +require passing context, unlike those for KUnit testing (i.e. +``struct kunit *``). This makes them easier to use, and readers of the +documentation do not need to care about which testing framework is used. In +addition, it may allow us to test third-party code more easily in the future. + +A current limitation is that KUnit does not support assertions in other tasks. +Thus, we presently simply print an error to the kernel log if an assertion +actually failed. Additionally, doctests are not run for nonpublic functions. + +The ``#[test]`` tests +--------------------- + +Additionally, there are the ``#[test]`` tests. These can be run using the +``rusttest`` Make target:: + + make LLVM=1 rusttest + +This requires the kernel ``.config`` and downloads external repositories. It +runs the ``#[test]`` tests on the host (currently) and thus is fairly limited in +what these tests can test. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 457e16f06e04..3731ecf1e437 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -82,8 +82,9 @@ Code Seq# Include File Comments 0x10 00-0F drivers/char/s390/vmcp.h 0x10 10-1F arch/s390/include/uapi/sclp_ctl.h 0x10 20-2F arch/s390/include/uapi/asm/hypfs.h -0x12 all linux/fs.h +0x12 all linux/fs.h BLK* ioctls linux/blkpg.h +0x15 all linux/fs.h FS_IOC_* ioctls 0x1b all InfiniBand Subsystem 0x20 all drivers/cdrom/cm206.h diff --git a/Documentation/virt/hyperv/index.rst b/Documentation/virt/hyperv/index.rst index 4a7a1b738bbe..de447e11b4a5 100644 --- a/Documentation/virt/hyperv/index.rst +++ b/Documentation/virt/hyperv/index.rst @@ -10,3 +10,4 @@ Hyper-V Enlightenments overview vmbus clocks + vpci diff --git a/Documentation/virt/hyperv/vpci.rst b/Documentation/virt/hyperv/vpci.rst new file mode 100644 index 000000000000..b65b2126ede3 --- /dev/null +++ b/Documentation/virt/hyperv/vpci.rst @@ -0,0 +1,316 @@ +.. SPDX-License-Identifier: GPL-2.0 + +PCI pass-thru devices +========================= +In a Hyper-V guest VM, PCI pass-thru devices (also called +virtual PCI devices, or vPCI devices) are physical PCI devices +that are mapped directly into the VM's physical address space. +Guest device drivers can interact directly with the hardware +without intermediation by the host hypervisor. This approach +provides higher bandwidth access to the device with lower +latency, compared with devices that are virtualized by the +hypervisor. The device should appear to the guest just as it +would when running on bare metal, so no changes are required +to the Linux device drivers for the device. + +Hyper-V terminology for vPCI devices is "Discrete Device +Assignment" (DDA). Public documentation for Hyper-V DDA is +available here: `DDA`_ + +.. _DDA: https://learn.microsoft.com/en-us/windows-server/virtualization/hyper-v/plan/plan-for-deploying-devices-using-discrete-device-assignment + +DDA is typically used for storage controllers, such as NVMe, +and for GPUs. A similar mechanism for NICs is called SR-IOV +and produces the same benefits by allowing a guest device +driver to interact directly with the hardware. See Hyper-V +public documentation here: `SR-IOV`_ + +.. _SR-IOV: https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-single-root-i-o-virtualization--sr-iov- + +This discussion of vPCI devices includes DDA and SR-IOV +devices. + +Device Presentation +------------------- +Hyper-V provides full PCI functionality for a vPCI device when +it is operating, so the Linux device driver for the device can +be used unchanged, provided it uses the correct Linux kernel +APIs for accessing PCI config space and for other integration +with Linux. But the initial detection of the PCI device and +its integration with the Linux PCI subsystem must use Hyper-V +specific mechanisms. Consequently, vPCI devices on Hyper-V +have a dual identity. They are initially presented to Linux +guests as VMBus devices via the standard VMBus "offer" +mechanism, so they have a VMBus identity and appear under +/sys/bus/vmbus/devices. The VMBus vPCI driver in Linux at +drivers/pci/controller/pci-hyperv.c handles a newly introduced +vPCI device by fabricating a PCI bus topology and creating all +the normal PCI device data structures in Linux that would +exist if the PCI device were discovered via ACPI on a bare- +metal system. Once those data structures are set up, the +device also has a normal PCI identity in Linux, and the normal +Linux device driver for the vPCI device can function as if it +were running in Linux on bare-metal. Because vPCI devices are +presented dynamically through the VMBus offer mechanism, they +do not appear in the Linux guest's ACPI tables. vPCI devices +may be added to a VM or removed from a VM at any time during +the life of the VM, and not just during initial boot. + +With this approach, the vPCI device is a VMBus device and a +PCI device at the same time. In response to the VMBus offer +message, the hv_pci_probe() function runs and establishes a +VMBus connection to the vPCI VSP on the Hyper-V host. That +connection has a single VMBus channel. The channel is used to +exchange messages with the vPCI VSP for the purpose of setting +up and configuring the vPCI device in Linux. Once the device +is fully configured in Linux as a PCI device, the VMBus +channel is used only if Linux changes the vCPU to be interrupted +in the guest, or if the vPCI device is removed from +the VM while the VM is running. The ongoing operation of the +device happens directly between the Linux device driver for +the device and the hardware, with VMBus and the VMBus channel +playing no role. + +PCI Device Setup +---------------- +PCI device setup follows a sequence that Hyper-V originally +created for Windows guests, and that can be ill-suited for +Linux guests due to differences in the overall structure of +the Linux PCI subsystem compared with Windows. Nonetheless, +with a bit of hackery in the Hyper-V virtual PCI driver for +Linux, the virtual PCI device is setup in Linux so that +generic Linux PCI subsystem code and the Linux driver for the +device "just work". + +Each vPCI device is set up in Linux to be in its own PCI +domain with a host bridge. The PCI domainID is derived from +bytes 4 and 5 of the instance GUID assigned to the VMBus vPCI +device. The Hyper-V host does not guarantee that these bytes +are unique, so hv_pci_probe() has an algorithm to resolve +collisions. The collision resolution is intended to be stable +across reboots of the same VM so that the PCI domainIDs don't +change, as the domainID appears in the user space +configuration of some devices. + +hv_pci_probe() allocates a guest MMIO range to be used as PCI +config space for the device. This MMIO range is communicated +to the Hyper-V host over the VMBus channel as part of telling +the host that the device is ready to enter d0. See +hv_pci_enter_d0(). When the guest subsequently accesses this +MMIO range, the Hyper-V host intercepts the accesses and maps +them to the physical device PCI config space. + +hv_pci_probe() also gets BAR information for the device from +the Hyper-V host, and uses this information to allocate MMIO +space for the BARs. That MMIO space is then setup to be +associated with the host bridge so that it works when generic +PCI subsystem code in Linux processes the BARs. + +Finally, hv_pci_probe() creates the root PCI bus. At this +point the Hyper-V virtual PCI driver hackery is done, and the +normal Linux PCI machinery for scanning the root bus works to +detect the device, to perform driver matching, and to +initialize the driver and device. + +PCI Device Removal +------------------ +A Hyper-V host may initiate removal of a vPCI device from a +guest VM at any time during the life of the VM. The removal +is instigated by an admin action taken on the Hyper-V host and +is not under the control of the guest OS. + +A guest VM is notified of the removal by an unsolicited +"Eject" message sent from the host to the guest over the VMBus +channel associated with the vPCI device. Upon receipt of such +a message, the Hyper-V virtual PCI driver in Linux +asynchronously invokes Linux kernel PCI subsystem calls to +shutdown and remove the device. When those calls are +complete, an "Ejection Complete" message is sent back to +Hyper-V over the VMBus channel indicating that the device has +been removed. At this point, Hyper-V sends a VMBus rescind +message to the Linux guest, which the VMBus driver in Linux +processes by removing the VMBus identity for the device. Once +that processing is complete, all vestiges of the device having +been present are gone from the Linux kernel. The rescind +message also indicates to the guest that Hyper-V has stopped +providing support for the vPCI device in the guest. If the +guest were to attempt to access that device's MMIO space, it +would be an invalid reference. Hypercalls affecting the device +return errors, and any further messages sent in the VMBus +channel are ignored. + +After sending the Eject message, Hyper-V allows the guest VM +60 seconds to cleanly shutdown the device and respond with +Ejection Complete before sending the VMBus rescind +message. If for any reason the Eject steps don't complete +within the allowed 60 seconds, the Hyper-V host forcibly +performs the rescind steps, which will likely result in +cascading errors in the guest because the device is now no +longer present from the guest standpoint and accessing the +device MMIO space will fail. + +Because ejection is asynchronous and can happen at any point +during the guest VM lifecycle, proper synchronization in the +Hyper-V virtual PCI driver is very tricky. Ejection has been +observed even before a newly offered vPCI device has been +fully setup. The Hyper-V virtual PCI driver has been updated +several times over the years to fix race conditions when +ejections happen at inopportune times. Care must be taken when +modifying this code to prevent re-introducing such problems. +See comments in the code. + +Interrupt Assignment +-------------------- +The Hyper-V virtual PCI driver supports vPCI devices using +MSI, multi-MSI, or MSI-X. Assigning the guest vCPU that will +receive the interrupt for a particular MSI or MSI-X message is +complex because of the way the Linux setup of IRQs maps onto +the Hyper-V interfaces. For the single-MSI and MSI-X cases, +Linux calls hv_compse_msi_msg() twice, with the first call +containing a dummy vCPU and the second call containing the +real vCPU. Furthermore, hv_irq_unmask() is finally called +(on x86) or the GICD registers are set (on arm64) to specify +the real vCPU again. Each of these three calls interact +with Hyper-V, which must decide which physical CPU should +receive the interrupt before it is forwarded to the guest VM. +Unfortunately, the Hyper-V decision-making process is a bit +limited, and can result in concentrating the physical +interrupts on a single CPU, causing a performance bottleneck. +See details about how this is resolved in the extensive +comment above the function hv_compose_msi_req_get_cpu(). + +The Hyper-V virtual PCI driver implements the +irq_chip.irq_compose_msi_msg function as hv_compose_msi_msg(). +Unfortunately, on Hyper-V the implementation requires sending +a VMBus message to the Hyper-V host and awaiting an interrupt +indicating receipt of a reply message. Since +irq_chip.irq_compose_msi_msg can be called with IRQ locks +held, it doesn't work to do the normal sleep until awakened by +the interrupt. Instead hv_compose_msi_msg() must send the +VMBus message, and then poll for the completion message. As +further complexity, the vPCI device could be ejected/rescinded +while the polling is in progress, so this scenario must be +detected as well. See comments in the code regarding this +very tricky area. + +Most of the code in the Hyper-V virtual PCI driver (pci- +hyperv.c) applies to Hyper-V and Linux guests running on x86 +and on arm64 architectures. But there are differences in how +interrupt assignments are managed. On x86, the Hyper-V +virtual PCI driver in the guest must make a hypercall to tell +Hyper-V which guest vCPU should be interrupted by each +MSI/MSI-X interrupt, and the x86 interrupt vector number that +the x86_vector IRQ domain has picked for the interrupt. This +hypercall is made by hv_arch_irq_unmask(). On arm64, the +Hyper-V virtual PCI driver manages the allocation of an SPI +for each MSI/MSI-X interrupt. The Hyper-V virtual PCI driver +stores the allocated SPI in the architectural GICD registers, +which Hyper-V emulates, so no hypercall is necessary as with +x86. Hyper-V does not support using LPIs for vPCI devices in +arm64 guest VMs because it does not emulate a GICv3 ITS. + +The Hyper-V virtual PCI driver in Linux supports vPCI devices +whose drivers create managed or unmanaged Linux IRQs. If the +smp_affinity for an unmanaged IRQ is updated via the /proc/irq +interface, the Hyper-V virtual PCI driver is called to tell +the Hyper-V host to change the interrupt targeting and +everything works properly. However, on x86 if the x86_vector +IRQ domain needs to reassign an interrupt vector due to +running out of vectors on a CPU, there's no path to inform the +Hyper-V host of the change, and things break. Fortunately, +guest VMs operate in a constrained device environment where +using all the vectors on a CPU doesn't happen. Since such a +problem is only a theoretical concern rather than a practical +concern, it has been left unaddressed. + +DMA +--- +By default, Hyper-V pins all guest VM memory in the host +when the VM is created, and programs the physical IOMMU to +allow the VM to have DMA access to all its memory. Hence +it is safe to assign PCI devices to the VM, and allow the +guest operating system to program the DMA transfers. The +physical IOMMU prevents a malicious guest from initiating +DMA to memory belonging to the host or to other VMs on the +host. From the Linux guest standpoint, such DMA transfers +are in "direct" mode since Hyper-V does not provide a virtual +IOMMU in the guest. + +Hyper-V assumes that physical PCI devices always perform +cache-coherent DMA. When running on x86, this behavior is +required by the architecture. When running on arm64, the +architecture allows for both cache-coherent and +non-cache-coherent devices, with the behavior of each device +specified in the ACPI DSDT. But when a PCI device is assigned +to a guest VM, that device does not appear in the DSDT, so the +Hyper-V VMBus driver propagates cache-coherency information +from the VMBus node in the ACPI DSDT to all VMBus devices, +including vPCI devices (since they have a dual identity as a VMBus +device and as a PCI device). See vmbus_dma_configure(). +Current Hyper-V versions always indicate that the VMBus is +cache coherent, so vPCI devices on arm64 always get marked as +cache coherent and the CPU does not perform any sync +operations as part of dma_map/unmap_*() calls. + +vPCI protocol versions +---------------------- +As previously described, during vPCI device setup and teardown +messages are passed over a VMBus channel between the Hyper-V +host and the Hyper-v vPCI driver in the Linux guest. Some +messages have been revised in newer versions of Hyper-V, so +the guest and host must agree on the vPCI protocol version to +be used. The version is negotiated when communication over +the VMBus channel is first established. See +hv_pci_protocol_negotiation(). Newer versions of the protocol +extend support to VMs with more than 64 vCPUs, and provide +additional information about the vPCI device, such as the +guest virtual NUMA node to which it is most closely affined in +the underlying hardware. + +Guest NUMA node affinity +------------------------ +When the vPCI protocol version provides it, the guest NUMA +node affinity of the vPCI device is stored as part of the Linux +device information for subsequent use by the Linux driver. See +hv_pci_assign_numa_node(). If the negotiated protocol version +does not support the host providing NUMA affinity information, +the Linux guest defaults the device NUMA node to 0. But even +when the negotiated protocol version includes NUMA affinity +information, the ability of the host to provide such +information depends on certain host configuration options. If +the guest receives NUMA node value "0", it could mean NUMA +node 0, or it could mean "no information is available". +Unfortunately it is not possible to distinguish the two cases +from the guest side. + +PCI config space access in a CoCo VM +------------------------------------ +Linux PCI device drivers access PCI config space using a +standard set of functions provided by the Linux PCI subsystem. +In Hyper-V guests these standard functions map to functions +hv_pcifront_read_config() and hv_pcifront_write_config() +in the Hyper-V virtual PCI driver. In normal VMs, +these hv_pcifront_*() functions directly access the PCI config +space, and the accesses trap to Hyper-V to be handled. +But in CoCo VMs, memory encryption prevents Hyper-V +from reading the guest instruction stream to emulate the +access, so the hv_pcifront_*() functions must invoke +hypercalls with explicit arguments describing the access to be +made. + +Config Block back-channel +------------------------- +The Hyper-V host and Hyper-V virtual PCI driver in Linux +together implement a non-standard back-channel communication +path between the host and guest. The back-channel path uses +messages sent over the VMBus channel associated with the vPCI +device. The functions hyperv_read_cfg_blk() and +hyperv_write_cfg_blk() are the primary interfaces provided to +other parts of the Linux kernel. As of this writing, these +interfaces are used only by the Mellanox mlx5 driver to pass +diagnostic data to a Hyper-V host running in the Azure public +cloud. The functions hyperv_read_cfg_blk() and +hyperv_write_cfg_blk() are implemented in a separate module +(pci-hyperv-intf.c, under CONFIG_PCI_HYPERV_INTERFACE) that +effectively stubs them out when running in non-Hyper-V +environments. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3ec0b7a455a0..09c7e585ff58 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8791,6 +8791,11 @@ means the VM type with value @n is supported. Possible values of @n are:: #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing. +Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in +production. The behavior and effective ABI for software-protected VMs is +unstable. + 9. Known KVM API problems ========================= diff --git a/MAINTAINERS b/MAINTAINERS index 2ecaaec6a6bf..a3c7accd3877 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -897,6 +897,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/ F: drivers/infiniband/hw/efa/ F: include/uapi/rdma/efa-abi.h +AMD ADDRESS TRANSLATION LIBRARY (ATL) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Supported +F: drivers/ras/amd/atl/* + AMD AXI W1 DRIVER M: Kris Chaplin R: Thomas Delev @@ -1395,6 +1401,7 @@ F: drivers/hwmon/max31760.c ANALOGBITS PLL LIBRARIES M: Paul Walmsley +M: Samuel Holland S: Supported F: drivers/clk/analogbits/* F: include/linux/clk/analogbits* @@ -2156,7 +2163,7 @@ M: Shawn Guo M: Sascha Hauer R: Pengutronix Kernel Team R: Fabio Estevam -R: NXP Linux Team +L: imx@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git @@ -7582,7 +7589,6 @@ R: Robert Richter L: linux-edac@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next -F: Documentation/admin-guide/ras.rst F: Documentation/driver-api/edac.rst F: drivers/edac/ F: include/linux/edac.h @@ -8495,7 +8501,7 @@ FREESCALE IMX / MXC FEC DRIVER M: Wei Fang R: Shenwei Wang R: Clark Wang -R: NXP Linux Team +L: imx@lists.linux.dev L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/fsl,fec.yaml @@ -8530,7 +8536,7 @@ F: drivers/i2c/busses/i2c-imx.c FREESCALE IMX LPI2C DRIVER M: Dong Aisheng L: linux-i2c@vger.kernel.org -L: linux-imx@nxp.com +L: imx@lists.linux.dev S: Maintained F: Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.yaml F: drivers/i2c/busses/i2c-imx-lpi2c.c @@ -10734,7 +10740,7 @@ INTEL DRM I915 DRIVER (Meteor Lake, DG2 and older excluding Poulsbo, Moorestown M: Jani Nikula M: Joonas Lahtinen M: Rodrigo Vivi -M: Tvrtko Ursulin +M: Tvrtko Ursulin L: intel-gfx@lists.freedesktop.org S: Supported W: https://drm.pages.freedesktop.org/intel-docs/ @@ -11156,6 +11162,16 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/wwan/iosm/ +INTEL(R) FLEXIBLE RETURN AND EVENT DELIVERY +M: Xin Li +M: "H. Peter Anvin" +S: Supported +F: Documentation/arch/x86/x86_64/fred.rst +F: arch/x86/entry/entry_64_fred.S +F: arch/x86/entry/entry_fred.c +F: arch/x86/include/asm/fred.h +F: arch/x86/kernel/fred.c + INTEL(R) TRACE HUB M: Alexander Shishkin S: Supported @@ -12516,7 +12532,6 @@ F: arch/powerpc/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: kernel/module/livepatch.c -F: lib/livepatch/ F: samples/livepatch/ F: tools/testing/selftests/livepatch/ @@ -14111,6 +14126,17 @@ F: mm/ F: tools/mm/ F: tools/testing/selftests/mm/ +MEMORY MAPPING +M: Andrew Morton +R: Liam R. Howlett +R: Vlastimil Babka +R: Lorenzo Stoakes +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: mm/mmap.c + MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal M: Richard Weinberger @@ -14369,7 +14395,7 @@ MICROCHIP MCP16502 PMIC DRIVER M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported -F: Documentation/devicetree/bindings/regulator/mcp16502-regulator.txt +F: Documentation/devicetree/bindings/regulator/microchip,mcp16502.yaml F: drivers/regulator/mcp16502.c MICROCHIP MCP3564 ADC DRIVER @@ -15578,16 +15604,6 @@ W: https://github.com/davejiang/linux/wiki T: git https://github.com/davejiang/linux.git F: drivers/ntb/hw/intel/ -NTFS FILESYSTEM -M: Anton Altaparmakov -R: Namjae Jeon -L: linux-ntfs-dev@lists.sourceforge.net -S: Supported -W: http://www.tuxera.com/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/aia21/ntfs.git -F: Documentation/filesystems/ntfs.rst -F: fs/ntfs/ - NTFS3 FILESYSTEM M: Konstantin Komarov L: ntfs3@lists.linux.dev @@ -15716,7 +15732,7 @@ F: drivers/iio/gyro/fxas21002c_spi.c NXP i.MX 7D/6SX/6UL/93 AND VF610 ADC DRIVER M: Haibo Chen L: linux-iio@vger.kernel.org -L: linux-imx@nxp.com +L: imx@lists.linux.dev S: Maintained F: Documentation/devicetree/bindings/iio/adc/fsl,imx7d-adc.yaml F: Documentation/devicetree/bindings/iio/adc/fsl,vf610-adc.yaml @@ -15753,7 +15769,7 @@ F: drivers/gpu/drm/imx/dcss/ NXP i.MX 8QXP ADC DRIVER M: Cai Huoqing M: Haibo Chen -L: linux-imx@nxp.com +L: imx@lists.linux.dev L: linux-iio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/iio/adc/nxp,imx8qxp-adc.yaml @@ -15761,7 +15777,7 @@ F: drivers/iio/adc/imx8qxp-adc.c NXP i.MX 8QXP/8QM JPEG V4L2 DRIVER M: Mirela Rabulea -R: NXP Linux Team +L: imx@lists.linux.dev L: linux-media@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/media/nxp,imx8-jpeg.yaml @@ -15771,7 +15787,7 @@ NXP i.MX CLOCK DRIVERS M: Abel Vesa R: Peng Fan L: linux-clk@vger.kernel.org -L: linux-imx@nxp.com +L: imx@lists.linux.dev S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/abelvesa/linux.git clk/imx F: Documentation/devicetree/bindings/clock/imx* @@ -16732,6 +16748,7 @@ F: drivers/pci/controller/dwc/*layerscape* PCI DRIVER FOR FU740 M: Paul Walmsley M: Greentime Hu +M: Samuel Holland L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml @@ -17501,6 +17518,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: fs/timerfd.c F: include/linux/time_namespace.h F: include/linux/timer* +F: include/trace/events/timer* F: kernel/time/*timer* F: kernel/time/namespace.c @@ -17537,6 +17555,7 @@ F: Documentation/devicetree/bindings/power/supply/ F: drivers/power/supply/ F: include/linux/power/ F: include/linux/power_supply.h +F: tools/testing/selftests/power_supply/ POWERNV OPERATOR PANEL LCD DISPLAY DRIVER M: Suraj Jitindar Singh @@ -17984,33 +18003,34 @@ F: drivers/media/tuners/qt1010* QUALCOMM ATH12K WIRELESS DRIVER M: Kalle Valo -M: Jeff Johnson +M: Jeff Johnson L: ath12k@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath12k T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git F: drivers/net/wireless/ath/ath12k/ +N: ath12k QUALCOMM ATHEROS ATH10K WIRELESS DRIVER M: Kalle Valo -M: Jeff Johnson +M: Jeff Johnson L: ath10k@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath10k T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git -F: Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml F: drivers/net/wireless/ath/ath10k/ +N: ath10k QUALCOMM ATHEROS ATH11K WIRELESS DRIVER M: Kalle Valo -M: Jeff Johnson +M: Jeff Johnson L: ath11k@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath11k B: https://wireless.wiki.kernel.org/en/users/Drivers/ath11k/bugreport T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git -F: Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml F: drivers/net/wireless/ath/ath11k/ +N: ath11k QUALCOMM ATHEROS ATH9K WIRELESS DRIVER M: Toke Høiland-Jørgensen @@ -18364,11 +18384,17 @@ M: Tony Luck M: Borislav Petkov L: linux-edac@vger.kernel.org S: Maintained -F: Documentation/admin-guide/ras.rst +F: Documentation/admin-guide/RAS F: drivers/ras/ F: include/linux/ras.h F: include/ras/ras_event.h +RAS FRU MEMORY POISON MANAGER (FMPM) +M: Yazen Ghannam +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/amd/fmpm.c + RC-CORE / LIRC FRAMEWORK M: Sean Young L: linux-media@vger.kernel.org @@ -19106,6 +19132,7 @@ F: Documentation/rust/ F: rust/ F: samples/rust/ F: scripts/*rust* +F: tools/testing/selftests/rust/ K: \b(?i:rust)\b RXRPC SOCKETS (AF_RXRPC) @@ -19641,7 +19668,7 @@ F: drivers/mmc/host/sdhci-of-at91.c SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) NXP i.MX DRIVER M: Haibo Chen -L: linux-imx@nxp.com +L: imx@lists.linux.dev L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/sdhci-esdhc-imx.c @@ -19976,35 +20003,14 @@ S: Maintained F: drivers/watchdog/simatic-ipc-wdt.c SIFIVE DRIVERS -M: Palmer Dabbelt M: Paul Walmsley +M: Samuel Holland L: linux-riscv@lists.infradead.org S: Supported -N: sifive -K: [^@]sifive - -SIFIVE CACHE DRIVER -M: Conor Dooley -L: linux-riscv@lists.infradead.org -S: Maintained -F: Documentation/devicetree/bindings/cache/sifive,ccache0.yaml -F: drivers/cache/sifive_ccache.c - -SIFIVE FU540 SYSTEM-ON-CHIP -M: Paul Walmsley -M: Palmer Dabbelt -L: linux-riscv@lists.infradead.org -S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pjw/sifive.git -N: fu540 -K: fu540 - -SIFIVE PDMA DRIVER -M: Green Wan -S: Maintained -F: Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml F: drivers/dma/sf-pdma/ - +N: sifive +K: fu[57]40 +K: [^@]sifive SILEAD TOUCHSCREEN DRIVER M: Hans de Goede @@ -20214,8 +20220,8 @@ F: Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml F: drivers/net/ethernet/socionext/sni_ave.c SOCIONEXT (SNI) NETSEC NETWORK DRIVER -M: Jassi Brar M: Ilias Apalodimas +M: Masahisa Kojima L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml @@ -20968,6 +20974,12 @@ F: Documentation/devicetree/bindings/phy/starfive,jh7110-usb-phy.yaml F: drivers/phy/starfive/phy-jh7110-pcie.c F: drivers/phy/starfive/phy-jh7110-usb.c +STARFIVE JH8100 EXTERNAL INTERRUPT CONTROLLER DRIVER +M: Changhuang Liang +S: Supported +F: Documentation/devicetree/bindings/interrupt-controller/starfive,jh8100-intc.yaml +F: drivers/irqchip/irq-starfive-jh8100-intc.c + STATIC BRANCH/CALL M: Peter Zijlstra M: Josh Poimboeuf diff --git a/Makefile b/Makefile index 6cdb5717bfe0..d18fa2a6240d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 8 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* @@ -1201,7 +1201,7 @@ prepare0: archprepare # All the preparing.. prepare: prepare0 ifdef CONFIG_RUST - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh + +$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust endif @@ -1711,7 +1711,7 @@ $(DOC_TARGETS): # "Is Rust available?" target PHONY += rustavailable rustavailable: - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh && echo "Rust is available!" + +$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh && echo "Rust is available!" # Documentation target # diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 7439b2377df5..8e9dd63b220c 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -467,11 +467,6 @@ smp_prepare_cpus(unsigned int max_cpus) smp_num_cpus = smp_num_probed; } -void -smp_prepare_boot_cpu(void) -{ -} - int __cpu_up(unsigned int cpu, struct task_struct *tidle) { diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 8d9b188caa27..b2f2c59279a6 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -39,11 +39,6 @@ struct plat_smp_ops __weak plat_smp_ops; /* XXX: per cpu ? Only needed once in early secondary boot */ struct task_struct *secondary_idle_tsk; -/* Called from start_kernel */ -void __init smp_prepare_boot_cpu(void) -{ -} - static int __init arc_get_cpu_map(const char *name, struct cpumask *cpumask) { unsigned long dt_root = of_get_flat_dt_root(); diff --git a/arch/arm/boot/dts/nxp/imx/imx7s.dtsi b/arch/arm/boot/dts/nxp/imx/imx7s.dtsi index ebf7befcc11e..9c81c6baa2d3 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7s.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx7s.dtsi @@ -834,16 +834,6 @@ <&clks IMX7D_LCDIF_PIXEL_ROOT_CLK>; clock-names = "pix", "axi"; status = "disabled"; - - port { - #address-cells = <1>; - #size-cells = <0>; - - lcdif_out_mipi_dsi: endpoint@0 { - reg = <0>; - remote-endpoint = <&mipi_dsi_in_lcdif>; - }; - }; }; mipi_csi: mipi-csi@30750000 { @@ -895,22 +885,6 @@ samsung,esc-clock-frequency = <20000000>; samsung,pll-clock-frequency = <24000000>; status = "disabled"; - - ports { - #address-cells = <1>; - #size-cells = <0>; - - port@0 { - reg = <0>; - #address-cells = <1>; - #size-cells = <0>; - - mipi_dsi_in_lcdif: endpoint@0 { - reg = <0>; - remote-endpoint = <&lcdif_out_mipi_dsi>; - }; - }; - }; }; }; diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 0a90583f9f01..8f9dbe8d9029 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -297,6 +297,7 @@ CONFIG_FB_MODE_HELPERS=y CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_L4F00242T03=y CONFIG_LCD_PLATFORM=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_BACKLIGHT_PWM=y CONFIG_BACKLIGHT_GPIO=y CONFIG_FRAMEBUFFER_CONSOLE=y diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index d68101655b74..9f21e170320f 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -4,7 +4,6 @@ #include #include -#include /* * ELF register definitions.. diff --git a/arch/arm/include/asm/vdso_datapage.h b/arch/arm/include/asm/vdso_datapage.h deleted file mode 100644 index bef68f59928d..000000000000 --- a/arch/arm/include/asm/vdso_datapage.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Adapted from arm64 version. - * - * Copyright (C) 2012 ARM Limited - */ -#ifndef __ASM_VDSO_DATAPAGE_H -#define __ASM_VDSO_DATAPAGE_H - -#ifdef __KERNEL__ - -#ifndef __ASSEMBLY__ - -#include -#include - -union vdso_data_store { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -}; - -#endif /* !__ASSEMBLY__ */ - -#endif /* __KERNEL__ */ - -#endif /* __ASM_VDSO_DATAPAGE_H */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 219cbc7e5d13..4915662842ff 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -21,10 +21,12 @@ #include #include #include -#include #include #include #include + +#include + #include "signal.h" /* diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index f297d66a8a76..d499ad461b00 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -35,9 +34,6 @@ extern char vdso_start[], vdso_end[]; /* Total number of pages needed for the data and text portions of the VDSO. */ unsigned int vdso_total_pages __ro_after_init; -/* - * The VDSO data page. - */ static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; diff --git a/arch/arm64/boot/dts/allwinner/Makefile b/arch/arm64/boot/dts/allwinner/Makefile index 91d505b385de..1f1f8d865d0e 100644 --- a/arch/arm64/boot/dts/allwinner/Makefile +++ b/arch/arm64/boot/dts/allwinner/Makefile @@ -42,5 +42,6 @@ dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-bigtreetech-cb1-manta.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-bigtreetech-pi.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-orangepi-zero2.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h616-x96-mate.dtb +dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-orangepi-zero2w.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-orangepi-zero3.dtb dtb-$(CONFIG_ARCH_SUNXI) += sun50i-h618-transpeed-8k618-t.dtb diff --git a/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi b/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi index a03c7667d2b6..2bfe2c431611 100644 --- a/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi +++ b/arch/arm64/boot/dts/amlogic/amlogic-t7.dtsi @@ -171,6 +171,16 @@ }; }; + gpio_intc: interrupt-controller@4080 { + compatible = "amlogic,t7-gpio-intc", + "amlogic,meson-gpio-intc"; + reg = <0x0 0x4080 0x0 0x20>; + interrupt-controller; + #interrupt-cells = <2>; + amlogic,channel-interrupts = + <10 11 12 13 14 15 16 17 18 19 20 21>; + }; + uart_a: serial@78000 { compatible = "amlogic,t7-uart", "amlogic,meson-s4-uart"; reg = <0x0 0x78000 0x0 0x18>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi index 4ae4fdab461e..43f1d45ccc96 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi @@ -255,7 +255,7 @@ <&clk IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; assigned-clock-rates = <13000000>, <13000000>, <156000000>; - reset-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio4 1 GPIO_ACTIVE_HIGH>; status = "disabled"; ports { diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index 76c73daf546b..39a550c1cd26 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -1820,7 +1820,7 @@ compatible = "fsl,imx8mp-ldb"; reg = <0x5c 0x4>, <0x128 0x4>; reg-names = "ldb", "lvds"; - clocks = <&clk IMX8MP_CLK_MEDIA_LDB>; + clocks = <&clk IMX8MP_CLK_MEDIA_LDB_ROOT>; clock-names = "ldb"; assigned-clocks = <&clk IMX8MP_CLK_MEDIA_LDB>; assigned-clock-parents = <&clk IMX8MP_VIDEO_PLL1_OUT>; diff --git a/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts b/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts index ea13c4a7027c..81a82933e350 100644 --- a/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra234-p3737-0000+p3701-0000.dts @@ -175,7 +175,7 @@ status = "okay"; phy-handle = <&mgbe0_phy>; - phy-mode = "usxgmii"; + phy-mode = "10gbase-r"; mdio { #address-cells = <1>; diff --git a/arch/arm64/boot/dts/nvidia/tegra234.dtsi b/arch/arm64/boot/dts/nvidia/tegra234.dtsi index 3f16595d099c..d1bd328892af 100644 --- a/arch/arm64/boot/dts/nvidia/tegra234.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra234.dtsi @@ -1459,7 +1459,7 @@ <&mc TEGRA234_MEMORY_CLIENT_MGBEAWR &emc>; interconnect-names = "dma-mem", "write"; iommus = <&smmu_niso0 TEGRA234_SID_MGBE>; - power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEA>; + power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEB>; status = "disabled"; }; @@ -1493,7 +1493,7 @@ <&mc TEGRA234_MEMORY_CLIENT_MGBEBWR &emc>; interconnect-names = "dma-mem", "write"; iommus = <&smmu_niso0 TEGRA234_SID_MGBE_VF1>; - power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEB>; + power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEC>; status = "disabled"; }; @@ -1527,7 +1527,7 @@ <&mc TEGRA234_MEMORY_CLIENT_MGBECWR &emc>; interconnect-names = "dma-mem", "write"; iommus = <&smmu_niso0 TEGRA234_SID_MGBE_VF2>; - power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBEC>; + power-domains = <&bpmp TEGRA234_POWER_DOMAIN_MGBED>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi index 8d41ed261adf..ee6f87c828ae 100644 --- a/arch/arm64/boot/dts/qcom/msm8996.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi @@ -457,25 +457,6 @@ }; }; - mpm: interrupt-controller { - compatible = "qcom,mpm"; - qcom,rpm-msg-ram = <&apss_mpm>; - interrupts = ; - mboxes = <&apcs_glb 1>; - interrupt-controller; - #interrupt-cells = <2>; - #power-domain-cells = <0>; - interrupt-parent = <&intc>; - qcom,mpm-pin-count = <96>; - qcom,mpm-pin-map = <2 184>, /* TSENS1 upper_lower_int */ - <52 243>, /* DWC3_PRI ss_phy_irq */ - <79 347>, /* DWC3_PRI hs_phy_irq */ - <80 352>, /* DWC3_SEC hs_phy_irq */ - <81 347>, /* QUSB2_PHY_PRI DP+DM */ - <82 352>, /* QUSB2_PHY_SEC DP+DM */ - <87 326>; /* SPMI */ - }; - psci { compatible = "arm,psci-1.0"; method = "smc"; @@ -765,15 +746,8 @@ }; rpm_msg_ram: sram@68000 { - compatible = "qcom,rpm-msg-ram", "mmio-sram"; + compatible = "qcom,rpm-msg-ram"; reg = <0x00068000 0x6000>; - #address-cells = <1>; - #size-cells = <1>; - ranges = <0 0x00068000 0x7000>; - - apss_mpm: sram@1b8 { - reg = <0x1b8 0x48>; - }; }; qfprom@74000 { @@ -856,8 +830,8 @@ reg = <0x004ad000 0x1000>, /* TM */ <0x004ac000 0x1000>; /* SROT */ #qcom,sensors = <8>; - interrupts-extended = <&mpm 2 IRQ_TYPE_LEVEL_HIGH>, - <&intc GIC_SPI 430 IRQ_TYPE_LEVEL_HIGH>; + interrupts = , + ; interrupt-names = "uplow", "critical"; #thermal-sensor-cells = <1>; }; @@ -1363,7 +1337,6 @@ interrupts = ; gpio-controller; gpio-ranges = <&tlmm 0 0 150>; - wakeup-parent = <&mpm>; #gpio-cells = <2>; interrupt-controller; #interrupt-cells = <2>; @@ -1891,7 +1864,7 @@ <0x0400a000 0x002100>; reg-names = "core", "chnls", "obsrvr", "intr", "cnfg"; interrupt-names = "periph_irq"; - interrupts-extended = <&mpm 87 IRQ_TYPE_LEVEL_HIGH>; + interrupts = ; qcom,ee = <0>; qcom,channel = <0>; #address-cells = <2>; @@ -3052,8 +3025,8 @@ #size-cells = <1>; ranges; - interrupts-extended = <&mpm 79 IRQ_TYPE_LEVEL_HIGH>, - <&mpm 52 IRQ_TYPE_LEVEL_HIGH>; + interrupts = , + ; interrupt-names = "hs_phy_irq", "ss_phy_irq"; clocks = <&gcc GCC_SYS_NOC_USB3_AXI_CLK>, diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts index ffc4406422ae..41215567b3ae 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts @@ -563,6 +563,8 @@ }; &pcie4 { + max-link-speed = <2>; + perst-gpios = <&tlmm 141 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 139 GPIO_ACTIVE_LOW>; diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts index def3976bd5bb..eb657e544961 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts @@ -722,6 +722,8 @@ }; &pcie4 { + max-link-speed = <2>; + perst-gpios = <&tlmm 141 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 139 GPIO_ACTIVE_LOW>; diff --git a/arch/arm64/boot/dts/qcom/sm6115.dtsi b/arch/arm64/boot/dts/qcom/sm6115.dtsi index 160e098f1075..f9849b8befbf 100644 --- a/arch/arm64/boot/dts/qcom/sm6115.dtsi +++ b/arch/arm64/boot/dts/qcom/sm6115.dtsi @@ -1304,6 +1304,9 @@ &config_noc SLAVE_QUP_0 RPM_ALWAYS_TAG>, <&system_noc MASTER_QUP_0 RPM_ALWAYS_TAG &bimc SLAVE_EBI_CH0 RPM_ALWAYS_TAG>; + interconnect-names = "qup-core", + "qup-config", + "qup-memory"; #address-cells = <1>; #size-cells = <0>; status = "disabled"; diff --git a/arch/arm64/boot/dts/qcom/sm8650-mtp.dts b/arch/arm64/boot/dts/qcom/sm8650-mtp.dts index 9d916edb1c73..be133a3d5cbe 100644 --- a/arch/arm64/boot/dts/qcom/sm8650-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sm8650-mtp.dts @@ -622,7 +622,7 @@ &tlmm { /* Reserved I/Os for NFC */ - gpio-reserved-ranges = <32 8>; + gpio-reserved-ranges = <32 8>, <74 1>; disp0_reset_n_active: disp0-reset-n-active-state { pins = "gpio133"; diff --git a/arch/arm64/boot/dts/qcom/sm8650-qrd.dts b/arch/arm64/boot/dts/qcom/sm8650-qrd.dts index 592a67a47c78..b9151c2ddf2e 100644 --- a/arch/arm64/boot/dts/qcom/sm8650-qrd.dts +++ b/arch/arm64/boot/dts/qcom/sm8650-qrd.dts @@ -659,7 +659,7 @@ &tlmm { /* Reserved I/Os for NFC */ - gpio-reserved-ranges = <32 8>; + gpio-reserved-ranges = <32 8>, <74 1>; bt_default: bt-default-state { bt-en-pins { diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index bac4cabef607..467ac2f768ac 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -227,8 +227,19 @@ static int ctr_encrypt(struct skcipher_request *req) src += blocks * AES_BLOCK_SIZE; } if (nbytes && walk.nbytes == walk.total) { + u8 buf[AES_BLOCK_SIZE]; + u8 *d = dst; + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(buf + sizeof(buf) - nbytes, + src, nbytes); + neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds, nbytes, walk.iv); + + if (unlikely(nbytes < AES_BLOCK_SIZE)) + memcpy(d, dst, nbytes); + nbytes = 0; } kernel_neon_end(); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7f88028a00c0..b2a60e0bcfd2 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -247,7 +247,7 @@ struct kunwind_consume_entry_data { void *cookie; }; -static bool +static __always_inline bool arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) { struct kunwind_consume_entry_data *data = cookie; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 5562daf38a22..89b6e7840002 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -69,10 +69,7 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { /* * The vDSO data page. */ -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; +static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; static int vdso_mremap(const struct vm_special_mapping *sm, diff --git a/arch/csky/include/asm/vdso.h b/arch/csky/include/asm/vdso.h index bdce581b5fcb..181a15edafe8 100644 --- a/arch/csky/include/asm/vdso.h +++ b/arch/csky/include/asm/vdso.h @@ -5,11 +5,6 @@ #include -#ifndef GENERIC_TIME_VSYSCALL -struct vdso_data { -}; -#endif - /* * The VDSO symbols are mapped into Linux so we can just use regular symbol * addressing to get their offsets in userspace. The symbols are mapped at an diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c index 8e42352cbf12..92dbbf3e0205 100644 --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -152,10 +152,6 @@ void arch_irq_work_raise(void) } #endif -void __init smp_prepare_boot_cpu(void) -{ -} - void __init smp_prepare_cpus(unsigned int max_cpus) { } diff --git a/arch/csky/kernel/vdso.c b/arch/csky/kernel/vdso.c index 16c20d64d165..2ca886e4a458 100644 --- a/arch/csky/kernel/vdso.c +++ b/arch/csky/kernel/vdso.c @@ -8,25 +8,15 @@ #include #include -#ifdef GENERIC_TIME_VSYSCALL #include -#else -#include -#endif extern char vdso_start[], vdso_end[]; static unsigned int vdso_pages; static struct page **vdso_pagelist; -/* - * The vDSO data page. - */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +static union vdso_data_store vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = vdso_data_store.data; static int __init vdso_init(void) { diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 608884bc3396..65e1fdf9fdb2 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -114,10 +114,6 @@ void send_ipi(const struct cpumask *cpumask, enum ipi_message_type msg) local_irq_restore(flags); } -void __init smp_prepare_boot_cpu(void) -{ -} - /* * interrupts should already be disabled from the VM * SP should already be correct; need to set THREADINFO_REG diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 14941e4be66d..90dfccb41c14 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -21,15 +21,13 @@ #include #include #include +#include #include extern char vdso_start[], vdso_end[]; /* Kernel-provided data used by the VDSO. */ -static union { - u8 page[PAGE_SIZE]; - struct vdso_data data[CS_BASES]; -} generic_vdso_data __page_aligned_data; +static union vdso_data_store generic_vdso_data __page_aligned_data; static union { u8 page[LOONGARCH_VDSO_DATA_SIZE]; diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index a708fbd5a844..642fb80c5c4e 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = { static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) { + struct queue_limits lim = { + .logical_block_size = bsize, + }; struct nfhd_device *dev; int dev_id = id - NFHD_DEV_OFFSET; int err = -ENOMEM; @@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->disk) + dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(dev->disk)) { + err = PTR_ERR(dev->disk); goto free_dev; + } dev->disk->major = major_num; dev->disk->first_minor = dev_id * 16; @@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->disk->private_data = dev; sprintf(dev->disk->disk_name, "nfhd%u", dev_id); set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); - blk_queue_logical_block_size(dev->disk->queue, bsize); err = add_disk(dev->disk); if (err) goto out_cleanup_disk; diff --git a/arch/mips/include/asm/vdso.h b/arch/mips/include/asm/vdso.h index cc7b516129a8..afb03d45bcd0 100644 --- a/arch/mips/include/asm/vdso.h +++ b/arch/mips/include/asm/vdso.h @@ -50,9 +50,4 @@ extern struct mips_vdso_image vdso_image_o32; extern struct mips_vdso_image vdso_image_n32; #endif -union mips_vdso_data { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -}; - #endif /* __ASM_VDSO_H */ diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index f6d40e43f108..dda36fa26307 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -24,7 +24,7 @@ #include /* Kernel-provided data used by the VDSO. */ -static union mips_vdso_data mips_vdso_data __page_aligned_data; +static union vdso_data_store mips_vdso_data __page_aligned_data; struct vdso_data *vdso_data = mips_vdso_data.data; /* diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 1c5a2d71d675..86da4bc5ee0b 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -57,10 +57,6 @@ static void boot_secondary(unsigned int cpu, struct task_struct *idle) spin_unlock(&boot_lock); } -void __init smp_prepare_boot_cpu(void) -{ -} - void __init smp_init_cpus(void) { struct device_node *cpu; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 9bb2210c8d44..065ffd1b2f8a 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -69,7 +69,7 @@ enum rtas_function_index { RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE, RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2, RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW, - RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS, + RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW, RTAS_FNIDX__IBM_SCAN_LOG_DUMP, RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR, RTAS_FNIDX__IBM_SET_EEH_OPTION, @@ -164,7 +164,7 @@ typedef struct { #define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE) #define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2) #define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW) -#define RTAS_FN_IBM_RESET_PE_DMA_WINDOWS rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS) +#define RTAS_FN_IBM_RESET_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW) #define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP) #define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR) #define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 7e793b503e29..8064d9c3de86 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -375,8 +375,13 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = { .name = "ibm,remove-pe-dma-window", }, - [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS] = { - .name = "ibm,reset-pe-dma-windows", + [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW] = { + /* + * Note: PAPR+ v2.13 7.3.31.4.1 spells this as + * "ibm,reset-pe-dma-windows" (plural), but RTAS + * implementations use the singular form in practice. + */ + .name = "ibm,reset-pe-dma-window", }, [RTAS_FNIDX__IBM_SCAN_LOG_DUMP] = { .name = "ibm,scan-log-dump", diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 693334c20d07..a60e4139214b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init; /* cpumask of CPUs with asymmetric SMT dependency */ static int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); @@ -1010,9 +1010,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); static int powerpc_shared_cache_flags(void) { if (static_branch_unlikely(&splpar_asym_pack)) - return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING; + return SD_SHARE_LLC | SD_ASYM_PACKING; - return SD_SHARE_PKG_RESOURCES; + return SD_SHARE_LLC; } static int powerpc_shared_proc_flags(void) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 496e16c588aa..e8c4129697b1 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -574,29 +574,6 @@ static void iommu_table_setparms(struct pci_controller *phb, struct iommu_table_ops iommu_table_lpar_multi_ops; -/* - * iommu_table_setparms_lpar - * - * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. - */ -static void iommu_table_setparms_lpar(struct pci_controller *phb, - struct device_node *dn, - struct iommu_table *tbl, - struct iommu_table_group *table_group, - const __be32 *dma_window) -{ - unsigned long offset, size, liobn; - - of_parse_dma_window(dn, dma_window, &liobn, &offset, &size); - - iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL, - &iommu_table_lpar_multi_ops); - - - table_group->tce32_start = offset; - table_group->tce32_size = size; -} - struct iommu_table_ops iommu_table_pseries_ops = { .set = tce_build_pSeries, .clear = tce_free_pSeries, @@ -724,26 +701,71 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = { * dynamic 64bit DMA window, walking up the device tree. */ static struct device_node *pci_dma_find(struct device_node *dn, - const __be32 **dma_window) + struct dynamic_dma_window_prop *prop) { - const __be32 *dw = NULL; + const __be32 *default_prop = NULL; + const __be32 *ddw_prop = NULL; + struct device_node *rdn = NULL; + bool default_win = false, ddw_win = false; for ( ; dn && PCI_DN(dn); dn = dn->parent) { - dw = of_get_property(dn, "ibm,dma-window", NULL); - if (dw) { - if (dma_window) - *dma_window = dw; - return dn; + default_prop = of_get_property(dn, "ibm,dma-window", NULL); + if (default_prop) { + rdn = dn; + default_win = true; } - dw = of_get_property(dn, DIRECT64_PROPNAME, NULL); - if (dw) - return dn; - dw = of_get_property(dn, DMA64_PROPNAME, NULL); - if (dw) - return dn; + ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL); + if (ddw_prop) { + rdn = dn; + ddw_win = true; + break; + } + + /* At least found default window, which is the case for normal boot */ + if (default_win) + break; } - return NULL; + /* For PCI devices there will always be a DMA window, either on the device + * or parent bus + */ + WARN_ON(!(default_win | ddw_win)); + + /* caller doesn't want to get DMA window property */ + if (!prop) + return rdn; + + /* parse DMA window property. During normal system boot, only default + * DMA window is passed in OF. But, for kdump, a dedicated adapter might + * have both default and DDW in FDT. In this scenario, DDW takes precedence + * over default window. + */ + if (ddw_win) { + struct dynamic_dma_window_prop *p; + + p = (struct dynamic_dma_window_prop *)ddw_prop; + prop->liobn = p->liobn; + prop->dma_base = p->dma_base; + prop->tce_shift = p->tce_shift; + prop->window_shift = p->window_shift; + } else if (default_win) { + unsigned long offset, size, liobn; + + of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size); + + prop->liobn = cpu_to_be32((u32)liobn); + prop->dma_base = cpu_to_be64(offset); + prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K); + prop->window_shift = cpu_to_be32(order_base_2(size)); + } + + return rdn; } static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) @@ -751,17 +773,20 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) struct iommu_table *tbl; struct device_node *dn, *pdn; struct pci_dn *ppci; - const __be32 *dma_window = NULL; + struct dynamic_dma_window_prop prop; dn = pci_bus_to_OF_node(bus); pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - pdn = pci_dma_find(dn, &dma_window); + pdn = pci_dma_find(dn, &prop); - if (dma_window == NULL) - pr_debug(" no ibm,dma-window property !\n"); + /* In PPC architecture, there will always be DMA window on bus or one of the + * parent bus. During reboot, there will be ibm,dma-window property to + * define DMA window. For kdump, there will at least be default window or DDW + * or both. + */ ppci = PCI_DN(pdn); @@ -771,13 +796,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) if (!ppci->table_group) { ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); tbl = ppci->table_group->tables[0]; - if (dma_window) { - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, - ppci->table_group, dma_window); - if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) - panic("Failed to initialize iommu table"); - } + iommu_table_setparms_common(tbl, ppci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); + + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); + iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -968,6 +1004,12 @@ static void find_existing_ddw_windows_named(const char *name) continue; } + /* If at the time of system initialization, there are DDWs in OF, + * it means this is during kexec. DDW could be direct or dynamic. + * We will just mark DDWs as "dynamic" since this is kdump path, + * no need to worry about perforance. ddw_list_new_entry() will + * set window->direct = false. + */ window = ddw_list_new_entry(pdn, dma64); if (!window) { of_node_put(pdn); @@ -1524,8 +1566,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; struct iommu_table *tbl; - const __be32 *dma_window = NULL; struct pci_dn *pci; + struct dynamic_dma_window_prop prop; pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); @@ -1538,7 +1580,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %pOF\n", dn); - pdn = pci_dma_find(dn, &dma_window); + pdn = pci_dma_find(dn, &prop); if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%pOF\n", @@ -1551,8 +1593,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pci->table_group) { pci->table_group = iommu_pseries_alloc_group(pci->phb->node); tbl = pci->table_group->tables[0]; - iommu_table_setparms_lpar(pci->phb, pdn, tbl, - pci->table_group, dma_window); + + iommu_table_setparms_common(tbl, pci->phb->bus->number, + be32_to_cpu(prop.liobn), + be64_to_cpu(prop.dma_base), + 1ULL << be32_to_cpu(prop.window_shift), + be32_to_cpu(prop.tce_shift), NULL, + &iommu_table_lpar_multi_ops); + + /* Only for normal boot with default window. Doesn't matter even + * if we set these with DDW which is 64bit during kdump, since + * these will not be used during kdump. + */ + pci->table_group->tce32_start = be64_to_cpu(prop.dma_base); + pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift); iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..e3142ce531a0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -315,7 +315,6 @@ config AS_HAS_OPTION_ARCH # https://reviews.llvm.org/D123515 def_bool y depends on $(as-instr, .option arch$(comma) +m) - depends on !$(as-instr, .option arch$(comma) -i) source "arch/riscv/Kconfig.socs" source "arch/riscv/Kconfig.errata" diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 510014051f5d..2468c55933cd 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -424,6 +424,7 @@ # define CSR_STATUS CSR_MSTATUS # define CSR_IE CSR_MIE # define CSR_TVEC CSR_MTVEC +# define CSR_ENVCFG CSR_MENVCFG # define CSR_SCRATCH CSR_MSCRATCH # define CSR_EPC CSR_MEPC # define CSR_CAUSE CSR_MCAUSE @@ -448,6 +449,7 @@ # define CSR_STATUS CSR_SSTATUS # define CSR_IE CSR_SIE # define CSR_TVEC CSR_STVEC +# define CSR_ENVCFG CSR_SENVCFG # define CSR_SCRATCH CSR_SSCRATCH # define CSR_EPC CSR_SEPC # define CSR_CAUSE CSR_SCAUSE diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 329172122952..15055f9df4da 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -25,6 +25,11 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ + +extern void *return_address(unsigned int level); + +#define ftrace_return_address(n) return_address(n) + void MCOUNT_NAME(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 20f9c3ba2341..22deb7a2a6ec 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -11,8 +11,10 @@ static inline void arch_clear_hugepage_flags(struct page *page) } #define arch_clear_hugepage_flags arch_clear_hugepage_flags +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h); #define arch_hugetlb_migration_supported arch_hugetlb_migration_supported +#endif #ifdef CONFIG_RISCV_ISA_SVNAPOT #define __HAVE_ARCH_HUGE_PTE_CLEAR diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 5340f818746b..1f2d2599c655 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -81,6 +81,8 @@ #define RISCV_ISA_EXT_ZTSO 72 #define RISCV_ISA_EXT_ZACAS 73 +#define RISCV_ISA_EXT_XLINUXENVCFG 127 + #define RISCV_ISA_EXT_MAX 128 #define RISCV_ISA_EXT_INVALID U32_MAX diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index d169a4f41a2e..c80bb9990d32 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -95,7 +95,13 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) __pud_free(mm, pud); } -#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) +#define __pud_free_tlb(tlb, pud, addr) \ +do { \ + if (pgtable_l4_enabled) { \ + pagetable_pud_dtor(virt_to_ptdesc(pud)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ + } \ +} while (0) #define p4d_alloc_one p4d_alloc_one static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -124,7 +130,11 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) __p4d_free(mm, p4d); } -#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d) +#define __p4d_free_tlb(tlb, p4d, addr) \ +do { \ + if (pgtable_l5_enabled) \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(p4d)); \ +} while (0) #endif /* __PAGETABLE_PMD_FOLDED */ static inline void sync_kernel_mappings(pgd_t *pgd) @@ -149,7 +159,11 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #ifndef __PAGETABLE_PMD_FOLDED -#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) +#define __pmd_free_tlb(tlb, pmd, addr) \ +do { \ + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ +} while (0) #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index b42017d76924..b99bd66107a6 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -136,7 +136,7 @@ enum napot_cont_order { * 10010 - IO Strongly-ordered, Non-cacheable, Non-bufferable, Shareable, Non-trustable */ #define _PAGE_PMA_THEAD ((1UL << 62) | (1UL << 61) | (1UL << 60)) -#define _PAGE_NOCACHE_THEAD ((1UL < 61) | (1UL << 60)) +#define _PAGE_NOCACHE_THEAD ((1UL << 61) | (1UL << 60)) #define _PAGE_IO_THEAD ((1UL << 63) | (1UL << 60)) #define _PAGE_MTMASK_THEAD (_PAGE_PMA_THEAD | _PAGE_IO_THEAD | (1UL << 59)) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 0c94260b5d0c..6066822e7396 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -84,7 +84,7 @@ * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. */ -#define vmemmap ((struct page *)VMEMMAP_START) +#define vmemmap ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)) #define PCI_IO_SIZE SZ_16M #define PCI_IO_END VMEMMAP_START @@ -439,6 +439,10 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +#define pte_leaf_size(pte) (pte_napot(pte) ? \ + napot_cont_size(napot_cont_order(pte)) :\ + PAGE_SIZE) + #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/asm-generic/pgtable.h diff --git a/arch/riscv/include/asm/suspend.h b/arch/riscv/include/asm/suspend.h index 02f87867389a..491296a335d0 100644 --- a/arch/riscv/include/asm/suspend.h +++ b/arch/riscv/include/asm/suspend.h @@ -14,6 +14,7 @@ struct suspend_context { struct pt_regs regs; /* Saved and restored by high-level functions */ unsigned long scratch; + unsigned long envcfg; unsigned long tvec; unsigned long ie; #ifdef CONFIG_MMU diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h index 924d01b56c9a..51f6dfe19745 100644 --- a/arch/riscv/include/asm/vmalloc.h +++ b/arch/riscv/include/asm/vmalloc.h @@ -19,65 +19,6 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) return true; } -#ifdef CONFIG_RISCV_ISA_SVNAPOT -#include +#endif -#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size -static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, - u64 pfn, unsigned int max_page_shift) -{ - unsigned long map_size = PAGE_SIZE; - unsigned long size, order; - - if (!has_svnapot()) - return map_size; - - for_each_napot_order_rev(order) { - if (napot_cont_shift(order) > max_page_shift) - continue; - - size = napot_cont_size(order); - if (end - addr < size) - continue; - - if (!IS_ALIGNED(addr, size)) - continue; - - if (!IS_ALIGNED(PFN_PHYS(pfn), size)) - continue; - - map_size = size; - break; - } - - return map_size; -} - -#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift -static inline int arch_vmap_pte_supported_shift(unsigned long size) -{ - int shift = PAGE_SHIFT; - unsigned long order; - - if (!has_svnapot()) - return shift; - - WARN_ON_ONCE(size >= PMD_SIZE); - - for_each_napot_order_rev(order) { - if (napot_cont_size(order) > size) - continue; - - if (!IS_ALIGNED(size, napot_cont_size(order))) - continue; - - shift = napot_cont_shift(order); - break; - } - - return shift; -} - -#endif /* CONFIG_RISCV_ISA_SVNAPOT */ -#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #endif /* _ASM_RISCV_VMALLOC_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f71910718053..604d6bf7e476 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -7,6 +7,7 @@ ifdef CONFIG_FTRACE CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) endif CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) CFLAGS_compat_syscall_table.o += $(call cc-option,-Wno-override-init,) @@ -46,6 +47,7 @@ obj-y += irq.o obj-y += process.o obj-y += ptrace.o obj-y += reset.o +obj-y += return_address.o obj-y += setup.o obj-y += signal.o obj-y += syscall_table.o diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 89920f84d0a3..79a5a35fab96 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "copy-unaligned.h" @@ -201,6 +202,16 @@ static const unsigned int riscv_zvbb_exts[] = { RISCV_ISA_EXT_ZVKB }; +/* + * While the [ms]envcfg CSRs were not defined until version 1.12 of the RISC-V + * privileged ISA, the existence of the CSRs is implied by any extension which + * specifies [ms]envcfg bit(s). Hence, we define a custom ISA extension for the + * existence of the CSR, and treat it as a subset of those other extensions. + */ +static const unsigned int riscv_xlinuxenvcfg_exts[] = { + RISCV_ISA_EXT_XLINUXENVCFG +}; + /* * The canonical order of ISA extension names in the ISA string is defined in * chapter 27 of the unprivileged specification. @@ -250,8 +261,8 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(c, RISCV_ISA_EXT_c), __RISCV_ISA_EXT_DATA(v, RISCV_ISA_EXT_v), __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h), - __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), - __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), + __RISCV_ISA_EXT_SUPERSET(zicbom, RISCV_ISA_EXT_ZICBOM, riscv_xlinuxenvcfg_exts), + __RISCV_ISA_EXT_SUPERSET(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND), __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), @@ -538,6 +549,20 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) set_bit(RISCV_ISA_EXT_ZIHPM, isainfo->isa); } + /* + * "V" in ISA strings is ambiguous in practice: it should mean + * just the standard V-1.0 but vendors aren't well behaved. + * Many vendors with T-Head CPU cores which implement the 0.7.1 + * version of the vector specification put "v" into their DTs. + * CPU cores with the ratified spec will contain non-zero + * marchid. + */ + if (acpi_disabled && riscv_cached_mvendorid(cpu) == THEAD_VENDOR_ID && + riscv_cached_marchid(cpu) == 0x0) { + this_hwcap &= ~isa2hwcap[RISCV_ISA_EXT_v]; + clear_bit(RISCV_ISA_EXT_v, isainfo->isa); + } + /* * All "okay" hart should have same isa. Set HWCAP based on * common capabilities of every "okay" hart, in case they don't @@ -950,7 +975,7 @@ arch_initcall(check_unaligned_access_all_cpus); void riscv_user_isa_enable(void) { if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ)) - csr_set(CSR_SENVCFG, ENVCFG_CBZE); + csr_set(CSR_ENVCFG, ENVCFG_CBZE); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/return_address.c b/arch/riscv/kernel/return_address.c new file mode 100644 index 000000000000..c8115ec8fb30 --- /dev/null +++ b/arch/riscv/kernel/return_address.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code come from arch/arm64/kernel/return_address.c + * + * Copyright (C) 2023 SiFive. + */ + +#include +#include +#include + +struct return_address_data { + unsigned int level; + void *addr; +}; + +static bool save_return_addr(void *d, unsigned long pc) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)pc; + return false; + } + + --data->level; + + return true; +} +NOKPROBE_SYMBOL(save_return_addr); + +noinline void *return_address(unsigned int level) +{ + struct return_address_data data; + + data.level = level + 3; + data.addr = NULL; + + arch_stack_walk(save_return_addr, &data, current, NULL); + + if (!data.level) + return data.addr; + else + return NULL; + +} +EXPORT_SYMBOL_GPL(return_address); +NOKPROBE_SYMBOL(return_address); diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 519b6bd946e5..c4ed7d977f57 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -42,10 +42,6 @@ static DECLARE_COMPLETION(cpu_running); -void __init smp_prepare_boot_cpu(void) -{ -} - void __init smp_prepare_cpus(unsigned int max_cpus) { int cpuid; diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index 239509367e42..299795341e8a 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -15,6 +15,8 @@ void suspend_save_csrs(struct suspend_context *context) { context->scratch = csr_read(CSR_SCRATCH); + if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + context->envcfg = csr_read(CSR_ENVCFG); context->tvec = csr_read(CSR_TVEC); context->ie = csr_read(CSR_IE); @@ -36,6 +38,8 @@ void suspend_save_csrs(struct suspend_context *context) void suspend_restore_csrs(struct suspend_context *context) { csr_write(CSR_SCRATCH, context->scratch); + if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + csr_write(CSR_ENVCFG, context->envcfg); csr_write(CSR_TVEC, context->tvec); csr_write(CSR_IE, context->ie); diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 2cf76218a5bd..98315b98256d 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -30,14 +30,8 @@ enum rv_vdso_map { #define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) -/* - * The vDSO data page. - */ -static union { - struct vdso_data data; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = &vdso_data_store.data; +static union vdso_data_store vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = vdso_data_store.data; struct __vdso_info { const char *name; diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 29c7606414d2..5ef2a6891158 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -426,10 +426,12 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return __hugetlb_valid_size(size); } +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h) { return __hugetlb_valid_size(huge_page_size(h)); } +#endif #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index c924be0d7ed8..06756bad5e30 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -880,4 +880,3 @@ CONFIG_ATOMIC64_SELFTEST=y CONFIG_STRING_SELFTEST=y CONFIG_TEST_BITOPS=m CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c8f0c9fe40d7..d33f814f78b2 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -808,4 +808,3 @@ CONFIG_KPROBES_SANITY_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_TEST_LIVEPATCH=m diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h index 73ee89142666..0e2b40ef69b0 100644 --- a/arch/s390/include/asm/vdso/data.h +++ b/arch/s390/include/asm/vdso/data.h @@ -3,7 +3,6 @@ #define __S390_ASM_VDSO_DATA_H #include -#include struct arch_vdso_data { __s64 tod_steering_delta; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index bbaefd84f15e..a45b3a4c91db 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -25,10 +25,7 @@ extern char vdso32_start[], vdso32_end[]; static struct vm_special_mapping vvar_mapping; -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; +static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index f3969a3600db..a0cc9bb41a92 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1206,10 +1206,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) { } -void smp_prepare_boot_cpu(void) -{ -} - void __init smp_setup_processor_id(void) { if (tlb_type == spitfire) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 92ee2697ff39..63fc062add70 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ -static int ubd_open(struct gendisk *disk, blk_mode_t mode); -static void ubd_release(struct gendisk *disk); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); static const struct block_device_operations ubd_blops = { .owner = THIS_MODULE, - .open = ubd_open, - .release = ubd_release, .ioctl = ubd_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, .getgeo = ubd_getgeo, }; -/* Protected by ubd_lock */ -static struct gendisk *ubd_gendisk[MAX_DEV]; - #ifdef CONFIG_BLK_DEV_UBD_SYNC #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ .cl = 1 }) @@ -155,7 +148,6 @@ struct ubd { * backing or the cow file. */ char *file; char *serial; - int count; int fd; __u64 size; struct openflags boot_openflags; @@ -165,7 +157,7 @@ struct ubd { unsigned no_trim:1; struct cow cow; struct platform_device pdev; - struct request_queue *queue; + struct gendisk *disk; struct blk_mq_tag_set tag_set; spinlock_t lock; }; @@ -181,7 +173,6 @@ struct ubd { #define DEFAULT_UBD { \ .file = NULL, \ .serial = NULL, \ - .count = 0, \ .fd = -1, \ .size = -1, \ .boot_openflags = OPEN_FLAGS, \ @@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) ubd_dev->fd = fd; if(ubd_dev->cow.file != NULL){ - blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long)); - err = -ENOMEM; ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); if(ubd_dev->cow.bitmap == NULL){ @@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) if(err < 0) goto error; ubd_dev->cow.fd = err; } - if (ubd_dev->no_trim == 0) { - blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - } - blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); return 0; error: os_close_file(ubd_dev->fd); @@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = { NULL, }; -static int ubd_disk_register(int major, u64 size, int unit, - struct gendisk *disk) -{ - disk->major = major; - disk->first_minor = unit << UBD_SHIFT; - disk->minors = 1 << UBD_SHIFT; - disk->fops = &ubd_blops; - set_capacity(disk, size / 512); - sprintf(disk->disk_name, "ubd%c", 'a' + unit); - - ubd_devs[unit].pdev.id = unit; - ubd_devs[unit].pdev.name = DRIVER_NAME; - ubd_devs[unit].pdev.dev.release = ubd_device_release; - dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]); - platform_device_register(&ubd_devs[unit].pdev); - - disk->private_data = &ubd_devs[unit]; - disk->queue = ubd_devs[unit].queue; - return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups); -} - #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) static const struct blk_mq_ops ubd_mq_ops = { @@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = { static int ubd_add(int n, char **error_out) { struct ubd *ubd_dev = &ubd_devs[n]; + struct queue_limits lim = { + .max_segments = MAX_SG, + .seg_boundary_mask = PAGE_SIZE - 1, + }; struct gendisk *disk; int err = 0; if(ubd_dev->file == NULL) goto out; + if (ubd_dev->cow.file) + lim.max_hw_sectors = 8 * sizeof(long); + if (!ubd_dev->no_trim) { + lim.max_hw_discard_sectors = UBD_MAX_REQUEST; + lim.max_write_zeroes_sectors = UBD_MAX_REQUEST; + } + err = ubd_file_size(ubd_dev, &ubd_dev->size); if(err < 0){ *error_out = "Couldn't determine size of device's file"; goto out; } + err = ubd_open_dev(ubd_dev); + if (err) { + pr_err("ubd%c: Can't open \"%s\": errno = %d\n", + 'a' + n, ubd_dev->file, -err); + goto out; + } + ubd_dev->size = ROUND_BLOCK(ubd_dev->size); ubd_dev->tag_set.ops = &ubd_mq_ops; @@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out) err = blk_mq_alloc_tag_set(&ubd_dev->tag_set); if (err) - goto out; + goto out_close; - disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } - ubd_dev->queue = disk->queue; - blk_queue_write_cache(ubd_dev->queue, true, false); - blk_queue_max_segments(ubd_dev->queue, MAX_SG); - blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); - err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_write_cache(disk->queue, true, false); + disk->major = UBD_MAJOR; + disk->first_minor = n << UBD_SHIFT; + disk->minors = 1 << UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, ubd_dev->size / 512); + sprintf(disk->disk_name, "ubd%c", 'a' + n); + disk->private_data = ubd_dev; + set_disk_ro(disk, !ubd_dev->openflags.w); + + ubd_dev->pdev.id = n; + ubd_dev->pdev.name = DRIVER_NAME; + ubd_dev->pdev.dev.release = ubd_device_release; + dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev); + platform_device_register(&ubd_dev->pdev); + + err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups); if (err) goto out_cleanup_disk; - ubd_gendisk[n] = disk; return 0; out_cleanup_disk: put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&ubd_dev->tag_set); +out_close: + ubd_close_dev(ubd_dev); out: return err; } @@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out) static int ubd_remove(int n, char **error_out) { - struct gendisk *disk = ubd_gendisk[n]; struct ubd *ubd_dev; int err = -ENODEV; @@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out) if(ubd_dev->file == NULL) goto out; - /* you cannot remove a open disk */ - err = -EBUSY; - if(ubd_dev->count > 0) - goto out; + if (ubd_dev->disk) { + /* you cannot remove a open disk */ + err = -EBUSY; + if (disk_openers(ubd_dev->disk)) + goto out; - ubd_gendisk[n] = NULL; - if(disk != NULL){ - del_gendisk(disk); - put_disk(disk); + del_gendisk(ubd_dev->disk); + ubd_close_dev(ubd_dev); + put_disk(ubd_dev->disk); } err = 0; @@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){ device_initcall(ubd_driver_init); -static int ubd_open(struct gendisk *disk, blk_mode_t mode) -{ - struct ubd *ubd_dev = disk->private_data; - int err = 0; - - mutex_lock(&ubd_mutex); - if(ubd_dev->count == 0){ - err = ubd_open_dev(ubd_dev); - if(err){ - printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", - disk->disk_name, ubd_dev->file, -err); - goto out; - } - } - ubd_dev->count++; - set_disk_ro(disk, !ubd_dev->openflags.w); -out: - mutex_unlock(&ubd_mutex); - return err; -} - -static void ubd_release(struct gendisk *disk) -{ - struct ubd *ubd_dev = disk->private_data; - - mutex_lock(&ubd_mutex); - if(--ubd_dev->count == 0) - ubd_close_dev(ubd_dev); - mutex_unlock(&ubd_mutex); -} - static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, __u64 *cow_offset, unsigned long *bitmap, __u64 bitmap_offset, unsigned long *bitmap_words, diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 63535c8c8b2a..720b96388191 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -147,6 +147,7 @@ config X86 select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_CLOCKEVENTS_BROADCAST_IDLE if GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE @@ -496,6 +497,15 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_FRED + bool "Flexible Return and Event Delivery" + depends on X86_64 + help + When enabled, try to use Flexible Return and Event Delivery + instead of the legacy SYSCALL/SYSENTER/IDT architecture for + ring transitions and exception/interrupt handling if the + system supports. + if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" @@ -2421,6 +2431,18 @@ source "kernel/livepatch/Kconfig" endmenu +config CC_HAS_NAMED_AS + def_bool CC_IS_GCC && GCC_VERSION >= 120100 + +config USE_X86_SEG_SUPPORT + def_bool y + depends on CC_HAS_NAMED_AS + # + # -fsanitize=kernel-address (KASAN) is at the moment incompatible + # with named address spaces - see GCC PR sanitizer/111736. + # + depends on !KASAN + config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) @@ -2452,12 +2474,12 @@ config CALL_PADDING config FINEIBT def_bool y - depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE + depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE select CALL_PADDING config HAVE_CALL_THUNKS def_bool y - depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL + depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL config CALL_THUNKS def_bool n @@ -2479,7 +2501,7 @@ menuconfig SPECULATION_MITIGATIONS if SPECULATION_MITIGATIONS -config PAGE_TABLE_ISOLATION +config MITIGATION_PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y depends on (X86_64 || X86_PAE) @@ -2490,7 +2512,7 @@ config PAGE_TABLE_ISOLATION See Documentation/arch/x86/pti.rst for more details. -config RETPOLINE +config MITIGATION_RETPOLINE bool "Avoid speculative indirect branches in kernel" select OBJTOOL if HAVE_OBJTOOL default y @@ -2500,9 +2522,9 @@ config RETPOLINE branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower. -config RETHUNK +config MITIGATION_RETHUNK bool "Enable return-thunks" - depends on RETPOLINE && CC_HAS_RETURN_THUNK + depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK select OBJTOOL if HAVE_OBJTOOL default y if X86_64 help @@ -2511,14 +2533,14 @@ config RETHUNK Requires a compiler with -mfunction-return=thunk-extern support for full protection. The kernel may run slower. -config CPU_UNRET_ENTRY +config MITIGATION_UNRET_ENTRY bool "Enable UNRET on kernel entry" - depends on CPU_SUP_AMD && RETHUNK && X86_64 + depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64 default y help Compile the kernel with support for the retbleed=unret mitigation. -config CALL_DEPTH_TRACKING +config MITIGATION_CALL_DEPTH_TRACKING bool "Mitigate RSB underflow with call depth tracking" depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE @@ -2538,7 +2560,7 @@ config CALL_DEPTH_TRACKING config CALL_THUNKS_DEBUG bool "Enable call thunks and call depth tracking debugging" - depends on CALL_DEPTH_TRACKING + depends on MITIGATION_CALL_DEPTH_TRACKING select FUNCTION_ALIGNMENT_32B default n help @@ -2549,14 +2571,14 @@ config CALL_THUNKS_DEBUG Only enable this when you are debugging call thunks as this creates a noticeable runtime overhead. If unsure say N. -config CPU_IBPB_ENTRY +config MITIGATION_IBPB_ENTRY bool "Enable IBPB on kernel entry" depends on CPU_SUP_AMD && X86_64 default y help Compile the kernel with support for the retbleed=ibpb mitigation. -config CPU_IBRS_ENTRY +config MITIGATION_IBRS_ENTRY bool "Enable IBRS on kernel entry" depends on CPU_SUP_INTEL && X86_64 default y @@ -2565,14 +2587,14 @@ config CPU_IBRS_ENTRY This mitigates both spectre_v2 and retbleed at great cost to performance. -config CPU_SRSO +config MITIGATION_SRSO bool "Mitigate speculative RAS overflow on AMD" - depends on CPU_SUP_AMD && X86_64 && RETHUNK + depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK default y help Enable the SRSO mitigation needed on AMD Zen1-4 machines. -config SLS +config MITIGATION_SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 select OBJTOOL if HAVE_OBJTOOL @@ -2582,7 +2604,7 @@ config SLS against straight line speculation. The kernel image might be slightly larger. -config GDS_FORCE_MITIGATION +config MITIGATION_GDS_FORCE bool "Force GDS Mitigation" depends on CPU_SUP_INTEL default n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index da8f3caf2781..1eccc2ee45fb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -22,7 +22,7 @@ RETPOLINE_VDSO_CFLAGS := -mretpoline endif RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) -ifdef CONFIG_RETHUNK +ifdef CONFIG_MITIGATION_RETHUNK RETHUNK_CFLAGS := -mfunction-return=thunk-extern RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS) endif @@ -53,6 +53,9 @@ REALMODE_CFLAGS += -fno-stack-protector REALMODE_CFLAGS += -Wno-address-of-packed-member REALMODE_CFLAGS += $(cc_stack_align4) REALMODE_CFLAGS += $(CLANG_FLAGS) +ifdef CONFIG_CC_IS_CLANG +REALMODE_CFLAGS += -Wno-gnu +endif export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -192,7 +195,7 @@ KBUILD_CFLAGS += -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) # Additionally, avoid generating expensive indirect jumps which # are subject to retpolines for small number of switch cases. @@ -205,7 +208,7 @@ ifdef CONFIG_RETPOLINE endif endif -ifdef CONFIG_SLS +ifdef CONFIG_MITIGATION_SLS KBUILD_CFLAGS += -mharden-sls=all endif @@ -296,12 +299,11 @@ install: vdso-install-$(CONFIG_X86_64) += arch/x86/entry/vdso/vdso64.so.dbg vdso-install-$(CONFIG_X86_X32_ABI) += arch/x86/entry/vdso/vdsox32.so.dbg -vdso-install-$(CONFIG_X86_32) += arch/x86/entry/vdso/vdso32.so.dbg -vdso-install-$(CONFIG_IA32_EMULATION) += arch/x86/entry/vdso/vdso32.so.dbg +vdso-install-$(CONFIG_COMPAT_32) += arch/x86/entry/vdso/vdso32.so.dbg archprepare: checkbin checkbin: -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifeq ($(RETPOLINE_CFLAGS),) @echo "You are building kernel with non-retpoline compiler." >&2 @echo "Please update your compiler." >&2 diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 18d15d1ce87d..f196b1d1ddf8 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -5,6 +5,8 @@ #include "../string.h" #include "efi.h" +#include + #include /* diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index c1bb180973ea..e162d7f59cc5 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include + static unsigned long fs; static inline void set_fs(unsigned long seg) { diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 6edd034b0b30..f2e50f9758e6 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -7,6 +7,8 @@ #include "misc.h" +#include + /** * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. * diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h index 866c0af8b5b9..b22300970f97 100644 --- a/arch/x86/boot/compressed/efi.h +++ b/arch/x86/boot/compressed/efi.h @@ -97,15 +97,6 @@ typedef struct { u32 tables; } efi_system_table_32_t; -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; -}; - struct efi_unaccepted_memory { u32 version; u32 unit_size; diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 4a029baa5147..909f2a35b60c 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -8,8 +8,8 @@ * Copyright (C) 2016 Kees Cook */ -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION +/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #include "error.h" #include "misc.h" diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 51f957b24ba7..c882e1f67af0 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include #include #include #include "pgtable.h" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index bea0719d70f2..ec71846d28c9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -12,6 +12,7 @@ */ #include "misc.h" +#include #include #include #include @@ -372,7 +373,7 @@ static void enforce_vmpl0(void) MSR_AMD64_SNP_VMPL_SSS | \ MSR_AMD64_SNP_SECURE_TSC | \ MSR_AMD64_SNP_VMGEXIT_PARAM | \ - MSR_AMD64_SNP_VMSA_REG_PROTECTION | \ + MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ MSR_AMD64_SNP_RESERVED_MASK) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 73abbbdd26f8..91801138b10b 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -42,7 +42,7 @@ CONFIG_EFI_STUB=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y -# CONFIG_RETHUNK is not set +# CONFIG_MITIGATION_RETHUNK is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b..c93e7f5c2a06 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -18,6 +18,9 @@ obj-y += vdso/ obj-y += vsyscall/ obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o + obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o - diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1d94790a54..ea81770629ee 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -87,14 +87,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi /* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_bp=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -109,7 +112,9 @@ For 32-bit we have the following conventions - kernel is built with xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ xorl %ebx, %ebx /* nospec rbx */ + .if \clear_bp xorl %ebp, %ebp /* nospec rbp */ + .endif xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ @@ -117,9 +122,9 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_bp=\clear_bp .endm .macro POP_REGS pop_rdi=1 @@ -142,10 +147,10 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two * halves: */ #define PTI_USER_PGTABLE_BIT PAGE_SHIFT @@ -160,7 +165,7 @@ For 32-bit we have the following conventions - kernel is built with .macro ADJUST_KERNEL_CR3 reg:req ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID - /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg .endm @@ -173,7 +178,7 @@ For 32-bit we have the following conventions - kernel is built with .endm #define THIS_CPU_user_pcid_flush_mask \ - PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask) .macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req mov %cr3, \scratch_reg @@ -239,17 +244,19 @@ For 32-bit we have the following conventions - kernel is built with .Ldone_\@: .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +/* Restore CR3 from a kernel context. May restore a user CR3 value. */ +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID - /* - * KERNEL pages can always resume with NOFLUSH as we do - * explicit flushes. + * If CR3 contained the kernel page tables at the paranoid exception + * entry, then there is nothing to restore as CR3 is not modified while + * handling the exception. */ bt $PTI_USER_PGTABLE_BIT, \save_reg - jnc .Lnoflush_\@ + jnc .Lend_\@ + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID /* * Check if there's a pending flush for the user ASID we're @@ -257,25 +264,17 @@ For 32-bit we have the following conventions - kernel is built with */ movq \save_reg, \scratch_reg andq $(0x7FF), \scratch_reg - bt \scratch_reg, THIS_CPU_user_pcid_flush_mask - jnc .Lnoflush_\@ - btr \scratch_reg, THIS_CPU_user_pcid_flush_mask - jmp .Lwrcr3_\@ + jc .Lwrcr3_\@ -.Lnoflush_\@: SET_NOFLUSH_BIT \save_reg .Lwrcr3_\@: - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. - */ movq \save_reg, %cr3 .Lend_\@: .endm -#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req .endm @@ -285,7 +284,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif @@ -303,7 +302,7 @@ For 32-bit we have the following conventions - kernel is built with * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. */ .macro IBRS_ENTER save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -332,7 +331,7 @@ For 32-bit we have the following conventions - kernel is built with * regs. Must be called after the last RET. */ .macro IBRS_EXIT save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -426,3 +425,63 @@ For 32-bit we have the following conventions - kernel is built with .endm #endif /* CONFIG_SMP */ + +#ifdef CONFIG_X86_64 + +/* rdi: arg1 ... normal C conventions. rax is saved/restored. */ +.macro THUNK name, func +SYM_FUNC_START(\name) + pushq %rbp + movq %rsp, %rbp + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + call \func + + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rbp + RET +SYM_FUNC_END(\name) + _ASM_NOKPROBE(\name) +.endm + +#else /* CONFIG_X86_32 */ + +/* put return address in eax (arg1) */ +.macro THUNK name, func, put_ret_addr_in_eax=0 +SYM_CODE_START_NOALIGN(\name) + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + RET + _ASM_NOKPROBE(\name) +SYM_CODE_END(\name) + .endm + +#endif diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 003379049924..d9feadffa972 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -10,6 +10,8 @@ #include #include +#include "calling.h" + .pushsection .noinstr.text, "ax" SYM_FUNC_START(entry_ibpb) @@ -43,3 +45,4 @@ EXPORT_SYMBOL_GPL(mds_verw_sel); .popsection +THUNK warn_thunk_thunk, __warn_thunk diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index fba427646805..d3a814efbff6 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -305,7 +305,7 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET) ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which emit the idt entries which are shared * shared between 32 and 64 bit and emit the __irqentry_text_* markers diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9bb485977629..8af2a26b24f6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,7 +191,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary + movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) #endif /* @@ -248,7 +248,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection @@ -371,14 +377,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -563,7 +561,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI #endif @@ -580,7 +578,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) jnz .Lnative_iret ud2 -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION .Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 @@ -972,14 +970,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1100,7 +1098,7 @@ SYM_CODE_END(error_return) * * Registers: * %r14: Used to save/restore the CR3 of the interrupted context - * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. */ SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_ENTRY @@ -1408,8 +1406,7 @@ end_repeat_nmi: /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 000000000000..a02bc6f3d2e6 --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include + +#include +#include +#include + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. + */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) + FRED_EXIT +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. + */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + mov $__KERNEL_CS, %rax + push %rax + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. + */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0 + movq %rsp, %rdi /* %rdi -> pt_regs */ + call __fred_entry_from_kvm /* Call the C entry point */ + POP_REGS + ERETS +1: + /* + * Objtool doesn't understand what ERETS does, this hint tells it that + * yes, we'll reach here and with what stack state. A save/restore pair + * isn't strictly needed, but it's the simplest form. + */ + UNWIND_HINT_RESTORE + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 000000000000..ac120cbdaaf2 --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, regs->orig_ax); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +}; + +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index 0103e103a657..da37f42f4549 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -4,33 +4,15 @@ * Copyright 2008 by Steven Rostedt, Red Hat, Inc * (inspired by Andi Kleen's thunk_64.S) */ - #include - #include - #include - /* put return address in eax (arg1) */ - .macro THUNK name, func, put_ret_addr_in_eax=0 -SYM_CODE_START_NOALIGN(\name) - pushl %eax - pushl %ecx - pushl %edx +#include +#include +#include - .if \put_ret_addr_in_eax - /* Place EIP in the arg1 */ - movl 3*4(%esp), %eax - .endif +#include "calling.h" - call \func - popl %edx - popl %ecx - popl %eax - RET - _ASM_NOKPROBE(\name) -SYM_CODE_END(\name) - .endm - - THUNK preempt_schedule_thunk, preempt_schedule - THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace - EXPORT_SYMBOL(preempt_schedule_thunk) - EXPORT_SYMBOL(preempt_schedule_notrace_thunk) +THUNK preempt_schedule_thunk, preempt_schedule +THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace +EXPORT_SYMBOL(preempt_schedule_thunk) +EXPORT_SYMBOL(preempt_schedule_notrace_thunk) diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 416b400f39db..119ebdc3d362 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -9,39 +9,6 @@ #include "calling.h" #include - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func -SYM_FUNC_START(\name) - pushq %rbp - movq %rsp, %rbp - - pushq %rdi - pushq %rsi - pushq %rdx - pushq %rcx - pushq %rax - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - - call \func - - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - popq %rbp - RET -SYM_FUNC_END(\name) - _ASM_NOKPROBE(\name) - .endm - THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace EXPORT_SYMBOL(preempt_schedule_thunk) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index b1b8dd1608f7..620f6257bbe9 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,7 +3,7 @@ # Building vDSO images for x86. # -# Include the generic Makefile to check the built vdso. +# Include the generic Makefile to check the built vDSO: include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. @@ -18,48 +18,39 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n -VDSO64-$(CONFIG_X86_64) := y -VDSOX32-$(CONFIG_X86_X32_ABI) := y -VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_IA32_EMULATION) := y - -# files to link into the vdso +# Files to link into the vDSO: vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o vobjs-$(CONFIG_X86_SGX) += vsgx.o -# files to link into kernel -obj-y += vma.o extable.o -KASAN_SANITIZE_vma.o := y -UBSAN_SANITIZE_vma.o := y -KCSAN_SANITIZE_vma.o := y -OBJECT_FILES_NON_STANDARD_vma.o := n -OBJECT_FILES_NON_STANDARD_extable.o := n +# Files to link into the kernel: +obj-y += vma.o extable.o +KASAN_SANITIZE_vma.o := y +UBSAN_SANITIZE_vma.o := y +KCSAN_SANITIZE_vma.o := y -# vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32 +OBJECT_FILES_NON_STANDARD_vma.o := n +OBJECT_FILES_NON_STANDARD_extable.o := n -obj-$(VDSO32-y) += vdso32-setup.o -OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n +# vDSO images to build: +obj-$(CONFIG_X86_64) += vdso-image-64.o +obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o +obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) -vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) +OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n +OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n +OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n + +vobjs := $(addprefix $(obj)/, $(vobjs-y)) +vobjs32 := $(addprefix $(obj)/, $(vobjs32-y)) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) targets += vdso32/vdso32.lds $(vobjs32-y) -# Build the vDSO image C files and link them in. -vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) -obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) +targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C @@ -87,7 +78,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) CFL += $(RETPOLINE_VDSO_CFLAGS) endif @@ -123,7 +114,7 @@ VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ vobjx32s-y := $(vobjs-y:.o=-x32.o) # same thing, but in the output directory -vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) +vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y)) # Convert 64bit object file to x32 for x32 vDSO. quiet_cmd_x32 = X32 $@ @@ -164,7 +155,7 @@ KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) endif @@ -190,5 +181,3 @@ GCOV_PROFILE := n quiet_cmd_vdso_and_check = VDSO $@ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check) - -clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7645730dc228..6d83ceb7f1ba 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -274,59 +274,6 @@ up_fail: return ret; } -#ifdef CONFIG_X86_64 -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= DEFAULT_MAP_WINDOW) - end = DEFAULT_MAP_WINDOW; - end -= len; - - if (end > start) { - offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. - */ - addr = align_vdso_addr(addr); - - return addr; -} - -static int map_vdso_randomized(const struct vdso_image *image) -{ - unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); - - return map_vdso(image, addr); -} -#endif - int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; @@ -369,7 +316,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_64); + return map_vdso(&vdso_image_64, 0); } #ifdef CONFIG_COMPAT @@ -380,7 +327,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, if (x32) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_x32); + return map_vdso(&vdso_image_x32, 0); } #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index e0ca8120aea8..a3c0df11d0e6 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 81f6d8275b6b..69a3b02e50bb 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -579,7 +579,7 @@ static void amd_pmu_cpu_starting(int cpu) if (!x86_pmu.amd_nb_constraints) return; - nb_id = topology_die_id(cpu); + nb_id = topology_amd_node_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); for_each_online_cpu(i) { diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 5bf03c575812..4ccb8fa483e6 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -71,7 +71,7 @@ union amd_uncore_info { }; struct amd_uncore { - union amd_uncore_info * __percpu info; + union amd_uncore_info __percpu *info; struct amd_uncore_pmu *pmus; unsigned int num_pmus; bool init_done; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3804f21ab049..768d1414897f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4b50a3a9818a..326c8cd5aa2d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -834,7 +834,7 @@ static int __init cstate_init(void) } if (has_cstate_pkg) { - if (topology_max_die_per_package() > 1) { + if (topology_max_dies_per_package() > 1) { err = perf_pmu_register(&cstate_pkg_pmu, "cstate_die", -1); } else { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index d49d661ec0a7..2641ba620f12 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7927c0b832fa..258e2cdf28fa 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1893,7 +1893,7 @@ static int __init intel_uncore_init(void) return -ENODEV; __uncore_max_dies = - topology_max_packages() * topology_max_die_per_package(); + topology_max_packages() * topology_max_dies_per_package(); id = x86_match_cpu(intel_uncore_match); if (!id) { diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 56eea2c66cfb..92da8aaa5966 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1221,8 +1221,8 @@ void nhmex_uncore_cpu_init(void) uncore_nhmex = true; else nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (nhmex_uncore_cbox.num_boxes > topology_num_cores_per_package()) + nhmex_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = nhmex_msr_uncores; } /* end of Nehalem-EX uncore support */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 7fd4334e12a1..9462fd9f3b7a 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -364,8 +364,8 @@ static struct intel_uncore_type *snb_msr_uncores[] = { void snb_uncore_cpu_init(void) { uncore_msr_uncores = snb_msr_uncores; - if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (snb_uncore_cbox.num_boxes > topology_num_cores_per_package()) + snb_uncore_cbox.num_boxes = topology_num_cores_per_package(); } static void skl_uncore_msr_init_box(struct intel_uncore_box *box) @@ -428,8 +428,8 @@ static struct intel_uncore_type *skl_msr_uncores[] = { void skl_uncore_cpu_init(void) { uncore_msr_uncores = skl_msr_uncores; - if (skl_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - skl_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (skl_uncore_cbox.num_boxes > topology_num_cores_per_package()) + skl_uncore_cbox.num_boxes = topology_num_cores_per_package(); snb_uncore_arb.ops = &skl_uncore_msr_ops; } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index a96496bef678..2eaf0f339849 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1172,8 +1172,8 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { void snbep_uncore_cpu_init(void) { - if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (snbep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + snbep_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = snbep_msr_uncores; } @@ -1406,7 +1406,7 @@ static int topology_gidnid_map(int nodeid, u32 gidnid) */ for (i = 0; i < 8; i++) { if (nodeid == GIDNIDMAP(gidnid, i)) { - if (topology_max_die_per_package() > 1) + if (topology_max_dies_per_package() > 1) die_id = i; else die_id = topology_phys_to_logical_pkg(i); @@ -1845,8 +1845,8 @@ static struct intel_uncore_type *ivbep_msr_uncores[] = { void ivbep_uncore_cpu_init(void) { - if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (ivbep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + ivbep_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = ivbep_msr_uncores; } @@ -2917,8 +2917,8 @@ static bool hswep_has_limit_sbox(unsigned int device) void hswep_uncore_cpu_init(void) { - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (hswep_uncore_cbox.num_boxes > topology_num_cores_per_package()) + hswep_uncore_cbox.num_boxes = topology_num_cores_per_package(); /* Detect 6-8 core systems with only two SBOXes */ if (hswep_has_limit_sbox(HSWEP_PCU_DID)) @@ -3280,8 +3280,8 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + if (bdx_uncore_cbox.num_boxes > topology_num_cores_per_package()) + bdx_uncore_cbox.num_boxes = topology_num_cores_per_package(); uncore_msr_uncores = bdx_msr_uncores; /* Detect systems with no SBOXes */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8d98d468b976..fb2b1961e5a3 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -674,7 +674,7 @@ static const struct attribute_group *rapl_attr_update[] = { static int __init init_rapl_pmus(void) { - int maxdie = topology_max_packages() * topology_max_die_per_package(); + int maxdie = topology_max_packages() * topology_max_dies_per_package(); size_t size; size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 96e6c51515f5..edd2f35b2a5e 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -16,6 +16,11 @@ extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; +static bool __init hv_vtl_msi_ext_dest_id(void) +{ + return true; +} + void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); @@ -26,8 +31,9 @@ void __init hv_vtl_init_platform(void) x86_init.timers.timer_init = x86_init_noop; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = x86_init_noop; x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; @@ -38,6 +44,8 @@ void __init hv_vtl_init_platform(void) x86_platform.legacy.warm_reset = 0; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 0; + + x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id; } static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 7dcbf153ad72..768d73de0d09 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -502,6 +503,31 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], return -EFAULT; } +/* + * When transitioning memory between encrypted and decrypted, the caller + * of set_memory_encrypted() or set_memory_decrypted() is responsible for + * ensuring that the memory isn't in use and isn't referenced while the + * transition is in progress. The transition has multiple steps, and the + * memory is in an inconsistent state until all steps are complete. A + * reference while the state is inconsistent could result in an exception + * that can't be cleanly fixed up. + * + * But the Linux kernel load_unaligned_zeropad() mechanism could cause a + * stray reference that can't be prevented by the caller, so Linux has + * specific code to handle this case. But when the #VC and #VE exceptions + * routed to a paravisor, the specific code doesn't work. To avoid this + * problem, mark the pages as "not present" while the transition is in + * progress. If load_unaligned_zeropad() causes a stray reference, a normal + * page fault is generated instead of #VC or #VE, and the page-fault-based + * handlers for load_unaligned_zeropad() resolve the reference. When the + * transition is complete, hv_vtom_set_host_visibility() marks the pages + * as "present" again. + */ +static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc) +{ + return !set_memory_np(kbuffer, pagecount); +} + /* * hv_vtom_set_host_visibility - Set specified memory visible to host. * @@ -515,16 +541,28 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo enum hv_mem_host_visibility visibility = enc ? VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; u64 *pfn_array; + phys_addr_t paddr; + void *vaddr; int ret = 0; bool result = true; int i, pfn; pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); - if (!pfn_array) - return false; + if (!pfn_array) { + result = false; + goto err_set_memory_p; + } for (i = 0, pfn = 0; i < pagecount; i++) { - pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); + /* + * Use slow_virt_to_phys() because the PRESENT bit has been + * temporarily cleared in the PTEs. slow_virt_to_phys() works + * without the PRESENT bit while virt_to_hvpfn() or similar + * does not. + */ + vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE); + paddr = slow_virt_to_phys(vaddr); + pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT; pfn++; if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { @@ -538,14 +576,30 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo } } - err_free_pfn_array: +err_free_pfn_array: kfree(pfn_array); + +err_set_memory_p: + /* + * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present() + * did. Do this even if there is an error earlier in this function in + * order to avoid leaving the memory range in a "broken" state. Setting + * the PRESENT bits shouldn't fail, but return an error if it does. + */ + if (set_memory_p(kbuffer, pagecount)) + result = false; + return result; } static bool hv_vtom_tlb_flush_required(bool private) { - return true; + /* + * Since hv_vtom_clear_present() marks the PTEs as "not present" + * and flushes the TLB, they can't be in the TLB. That makes the + * flush controlled by this function redundant, so return "false". + */ + return false; } static bool hv_vtom_cache_flush_required(void) @@ -608,6 +662,7 @@ void __init hv_vtom_init(void) x86_platform.hyper.is_private_mmio = hv_is_private_mmio; x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; /* Set WB as the default cache mode. */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9d159b771dc8..94ce0f7c9d3a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -46,6 +46,10 @@ extern void x86_32_probe_apic(void); static inline void x86_32_probe_apic(void) { } #endif +extern u32 cpuid_to_apicid[]; + +#define CPU_ACPIID_INVALID U32_MAX + #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; @@ -54,8 +58,6 @@ extern int local_apic_timer_c2_ok; extern bool apic_is_disabled; extern unsigned int lapic_timer_period; -extern u32 cpuid_to_apicid[]; - extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, @@ -169,6 +171,14 @@ extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); +extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); +extern void topology_register_boot_apic(u32 apic_id); +extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); +extern void topology_hotunplug_apic(unsigned int cpu); +extern void topology_apply_cmdline_limits_early(void); +extern void topology_init_possible_cpus(void); +extern void topology_reset_possible_cpus_up(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 @@ -183,6 +193,8 @@ static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } +static inline void topology_apply_cmdline_limits_early(void) { } +static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC @@ -289,16 +301,11 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - bool (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, u32 apicid); void (*init_apic_ldr)(void); - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); u32 (*cpu_present_to_apicid)(int mps_cpu); - u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb); u32 (*get_apic_id)(u32 id); - u32 (*set_apic_id)(u32 apicid); /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); @@ -527,7 +534,6 @@ extern int default_apic_id_valid(u32 apicid); extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern u32 default_cpu_present_to_apicid(int mps_cpu); void apic_send_nmi_to_offline_cpu(unsigned int cpu); diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b1a98fa38828..076bf8dee702 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #ifndef CONFIG_X86_CMPXCHG64 diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0216f63a366b..fe1e7e3cc844 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -33,7 +33,7 @@ * Returns: * 0 - (index < size) */ -static inline unsigned long array_index_mask_nospec(unsigned long index, +static __always_inline unsigned long array_index_mask_nospec(unsigned long index, unsigned long size) { unsigned long mask; diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 8a1cd359248f..fb7388bbc212 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -11,9 +11,10 @@ enum cc_vendor { CC_VENDOR_INTEL, }; -extern u64 cc_mask; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM extern enum cc_vendor cc_vendor; +extern u64 cc_mask; + static inline void cc_set_mask(u64 mask) { RIP_REL_REF(cc_mask) = mask; diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index f8f9a9b79395..aa30fd8cad7f 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -9,18 +9,10 @@ #include #include -#ifdef CONFIG_SMP - -extern void prefill_possible_map(void); - -#else /* CONFIG_SMP */ - -static inline void prefill_possible_map(void) {} - +#ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 #define safe_smp_processor_id() 0 - #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index e35842f7aed0..0343caa016a9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -324,7 +324,9 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 9bee3e7bf973..6b122a31da06 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -127,6 +127,42 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } +static inline void __cpuid_read(unsigned int leaf, unsigned int subleaf, u32 *regs) +{ + regs[CPUID_EAX] = leaf; + regs[CPUID_ECX] = subleaf; + __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); +} + +#define cpuid_subleaf(leaf, subleaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, subleaf, (u32 *)(regs)); \ +} + +#define cpuid_leaf(leaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 0, (u32 *)(regs)); \ +} + +static inline void __cpuid_read_reg(unsigned int leaf, unsigned int subleaf, + enum cpuid_regs_idx regidx, u32 *reg) +{ + u32 regs[4]; + + __cpuid_read(leaf, subleaf, regs); + *reg = regs[regidx]; +} + +#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ +} + +#define cpuid_leaf_reg(leaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ +} + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index dd4b67101bb7..bf5953883ec3 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -18,7 +18,7 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING u64 call_depth; #endif unsigned long top_of_stack; @@ -37,8 +37,15 @@ static_assert(sizeof(struct pcpu_hot) == 64); DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); +/* const-qualified alias to pcpu_hot, aliased by linker. */ +DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, + const_pcpu_hot); + static __always_inline struct task_struct *get_current(void) { + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_pcpu_hot.current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 0cec92c430cc..fdbbbfec745a 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -5,7 +5,9 @@ #include #include #include + #include +#include DECLARE_PER_CPU(unsigned long, cpu_dr7); @@ -159,4 +161,26 @@ static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) } #endif +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 52c015017990..62dc9f59ea76 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -403,8 +403,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 3332d2940020..da4054fbf533 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,32 +44,32 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define DISABLE_PTI 0 #else # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE # define DISABLE_RETPOLINE 0 #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK # define DISABLE_RETHUNK 0 #else # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY # define DISABLE_UNRET 0 #else # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING # define DISABLE_CALL_DEPTH_TRACKING 0 #else # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) @@ -117,6 +117,12 @@ #define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + #ifdef CONFIG_KVM_AMD_SEV #define DISABLE_SEV_SNP 0 #else @@ -139,7 +145,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c4555b269a1b..1dc600fa3ba5 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -143,15 +143,6 @@ extern void efi_free_boot_services(void); void arch_efi_call_virt_setup(void); void arch_efi_call_virt_teardown(void); -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; -}; - extern u64 efi_setup; #ifdef CONFIG_EFI @@ -418,8 +409,9 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); -#define arch_ima_efi_boot_mode \ - ({ extern struct boot_params boot_params; boot_params.secure_boot; }) +extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void); + +#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode() #ifdef CONFIG_EFI_RUNTIME_MAP int efi_get_runtime_map_size(void); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1e16bd5ac781..1fb83d47711f 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -392,5 +392,4 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index fe6312045042..7acf0383be80 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,6 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) -#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 #endif diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index ca6e5e5f16b2..c485f1944c5f 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -37,10 +37,12 @@ extern void fpu_flush_thread(void); * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct task_struct *old, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct fpu *old_fpu = &old->thread.fpu; + save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -60,10 +62,10 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. */ -static inline void switch_fpu_finish(void) +static inline void switch_fpu_finish(struct task_struct *new) { if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); + set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD); } #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 000000000000..e86c7ba32435 --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include + +#include +#include + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. + */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_FRED +#include + +#include + +struct fred_info { + /* Event data: CR2, DR6, ... */ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} + +void cpu_init_fred_exceptions(void); +void fred_complete_exception_setup(void); + +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void fred_complete_exception_setup(void) { } +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLY__ */ + +#endif /* ASM_X86_FRED_H */ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 35cff5f2becf..9e7e8ca8e299 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -6,7 +6,7 @@ #ifdef CONFIG_X86_64 -#include +#include /* * Read/write a task's FSBASE or GSBASE. This returns the value that diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b02c3cd3c0f6..edebf1020e04 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,8 +16,6 @@ #include -#define IRQ_MATRIX_BITS NR_VECTORS - #ifndef __ASSEMBLY__ #include diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index c7ef6ea2fa99..4212c00c9708 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -69,7 +69,7 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); extern bool __ia32_enabled; -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } @@ -81,7 +81,7 @@ static inline void ia32_disable(void) #else /* !CONFIG_IA32_EMULATION */ -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 13639e57e1f8..47d4c04d103d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,15 +13,18 @@ #include +typedef void (*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -137,6 +141,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_RAW(func) \ __visible noinstr void func(struct pt_regs *regs) +/** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. + */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + /** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs *regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + __irq_enter_raw(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + __##func (regs); \ + __irq_exit_raw(); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - __irq_enter_raw(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - __##func (regs); \ - __irq_exit_raw(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,17 +445,35 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW #endif +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } +#endif + +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + idt_install_sysvec(vector, asm_##function); \ +} + #else /* !__ASSEMBLY__ */ /* @@ -447,7 +500,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ @@ -655,23 +708,36 @@ DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif @@ -679,12 +745,16 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else +# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 3814a9263d64..294cd2a40818 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -379,7 +379,7 @@ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, const u8 *end = from + count * 64; while (from < end) { - movdir64b(dst, from); + movdir64b_io(dst, from); from += 64; } } diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 51c782600e02..0d806513c4b3 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -140,7 +140,6 @@ extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); -extern void setup_ioapic_ids_from_mpc_nocheck(void); extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index c9f6a6c5de3c..91ca9a9ee3a2 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -25,7 +25,6 @@ #include #include -#include struct kimage; diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index 511b35069187..f163176d6f7f 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -4,8 +4,6 @@ #include -extern struct clocksource kvm_clock; - DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 571fe4d2d232..dc31b13b87a0 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -40,27 +40,27 @@ #ifdef __ASSEMBLY__ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define RET ret; int3 #else #define RET ret #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #else /* __ASSEMBLY__ */ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 73dba8b94443..59aa966dc212 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -131,8 +131,20 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) (typeof(l->a.counter) *) old, new); } -/* Always has a lock prefix */ -#define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) +/* + * Implement local_xchg using CMPXCHG instruction without the LOCK prefix. + * XCHG is expensive due to the implied LOCK prefix. The processor + * cannot prefetch cachelines if XCHG is used. + */ +static __always_inline long +local_xchg(local_t *l, long n) +{ + long c = local_read(l); + + do { } while (!local_try_cmpxchg(l, &c, n)); + + return c; +} /** * local_add_unless - add unless the number is already a given value diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 4b0f98a8d338..c72c7ff78fcd 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H +#include #include #include @@ -46,70 +47,31 @@ extern int smp_found_config; # define smp_found_config 0 #endif -static inline void get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(0); -} - -static inline void early_get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(1); -} - -static inline void find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(); -} - #ifdef CONFIG_X86_MPPARSE extern void e820__memblock_alloc_reserved_mpc_new(void); extern int enable_update_mptable; -extern void default_find_smp_config(void); -extern void default_get_smp_config(unsigned int early); +extern void mpparse_find_mptable(void); +extern void mpparse_parse_early_smp_config(void); +extern void mpparse_parse_smp_config(void); #else static inline void e820__memblock_alloc_reserved_mpc_new(void) { } -#define enable_update_mptable 0 -#define default_find_smp_config x86_init_noop -#define default_get_smp_config x86_init_uint_noop +#define enable_update_mptable 0 +#define mpparse_find_mptable x86_init_noop +#define mpparse_parse_early_smp_config x86_init_noop +#define mpparse_parse_smp_config x86_init_noop #endif -int generic_processor_info(int apicid); +extern DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC); -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) - -struct physid_mask { - unsigned long mask[PHYSID_ARRAY_SIZE]; -}; - -typedef struct physid_mask physid_mask_t; - -#define physid_set(physid, map) set_bit(physid, (map).mask) -#define physid_isset(physid, map) test_bit(physid, (map).mask) - -#define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_clear(map) \ - bitmap_zero((map).mask, MAX_LOCAL_APIC) - -#define physids_empty(map) \ - bitmap_empty((map).mask, MAX_LOCAL_APIC) - -static inline void physids_promote(unsigned long physids, physid_mask_t *map) +static inline void reset_phys_cpu_present_map(u32 apicid) { - physids_clear(*map); - map->mask[0] = physids; + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + set_bit(apicid, phys_cpu_present_map); } -static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) +static inline void copy_phys_cpu_present_map(unsigned long *dst) { - physids_clear(*map); - physid_set(physid, *map); + bitmap_copy(dst, phys_cpu_present_map, MAX_LOCAL_APIC); } -#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } -#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } - -extern physid_mask_t phys_cpu_present_map; - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f482bc6a5ae7..24c575cdd6b9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,8 +36,19 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) @@ -594,36 +605,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) -#define MSR_AMD64_RMP_BASE 0xc0010132 -#define MSR_AMD64_RMP_END 0xc0010133 - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. */ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd28..d642037f9ed5 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -12,11 +12,13 @@ #include #include +#include + struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int err; + u32 msr_no; + struct msr reg; + struct msr __percpu *msrs; + int err; }; struct msr_regs_info { @@ -97,6 +99,19 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } +/* + * WRMSRNS behaves exactly like WRMSR with the only difference being + * that it is not a serializing instruction by default. + */ +static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) +{ + /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ + asm volatile("1: .byte 0x0f,0x01,0xc6\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a"(low), "d" (high)); +} + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ @@ -297,6 +312,11 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + __wrmsrns(msr, val, val >> 32); +} + /* * 64-bit version of wrmsr_safe(): */ @@ -305,8 +325,8 @@ static inline int wrmsrl_safe(u32 msr, u64 val) return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } -struct msr *msrs_alloc(void); -void msrs_free(struct msr *msrs); +struct msr __percpu *msrs_alloc(void); +void msrs_free(struct msr __percpu *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); @@ -315,8 +335,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); @@ -345,14 +365,14 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h)); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5c5f1e56c404..41a0ebb699ec 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -14,9 +14,6 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 2aa52cab1e46..d0b8bb762838 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -59,13 +59,13 @@ #ifdef CONFIG_CALL_THUNKS_DEBUG # define CALL_THUNKS_DEBUG_INC_CALLS \ - incq %gs:__x86_call_count; + incq PER_CPU_VAR(__x86_call_count); # define CALL_THUNKS_DEBUG_INC_RETS \ - incq %gs:__x86_ret_count; + incq PER_CPU_VAR(__x86_ret_count); # define CALL_THUNKS_DEBUG_INC_STUFFS \ - incq %gs:__x86_stuffs_count; + incq PER_CPU_VAR(__x86_stuffs_count); # define CALL_THUNKS_DEBUG_INC_CTXSW \ - incq %gs:__x86_ctxsw_count; + incq PER_CPU_VAR(__x86_ctxsw_count); #else # define CALL_THUNKS_DEBUG_INC_CALLS # define CALL_THUNKS_DEBUG_INC_RETS @@ -73,16 +73,13 @@ # define CALL_THUNKS_DEBUG_INC_CTXSW #endif -#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) +#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include #define CREDIT_CALL_DEPTH \ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); -#define ASM_CREDIT_CALL_DEPTH \ - movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); - #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ @@ -95,20 +92,14 @@ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; \ - CALL_THUNKS_DEBUG_INC_CALLS - -#define ASM_INCREMENT_CALL_DEPTH \ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH -#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH -#define INCREMENT_CALL_DEPTH -#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL +#define INCREMENT_CALL_DEPTH #endif /* @@ -158,7 +149,7 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH \ + CREDIT_CALL_DEPTH \ CALL_THUNKS_DEBUG_INC_CTXSW #else /* @@ -212,7 +203,7 @@ */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -241,7 +232,7 @@ * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg jmp __x86_indirect_thunk_\reg #else @@ -251,7 +242,7 @@ .endm .macro CALL_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg call __x86_indirect_thunk_\reg #else @@ -271,7 +262,7 @@ .Lskip_rsb_\@: .endm -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) #define CALL_UNTRAIN_RET "call entry_untrain_ret" #else #define CALL_UNTRAIN_RET "" @@ -289,7 +280,7 @@ * where we have a stack but before any RET instruction. */ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns -#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY) +#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ @@ -309,9 +300,9 @@ .macro CALL_DEPTH_ACCOUNT -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING ALTERNATIVE "", \ - __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm @@ -323,7 +314,7 @@ * Note: Only the memory operand variant of VERW clears the CPU buffers. */ .macro CLEAR_CPU_BUFFERS - ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF + ALTERNATIVE "", __stringify(verw mds_verw_sel), X86_FEATURE_CLEAR_CPU_BUF .endm #else /* __ASSEMBLY__ */ @@ -339,19 +330,19 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); #else static inline void __x86_return_thunk(void) {} #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY extern void retbleed_return_thunk(void); #else static inline void retbleed_return_thunk(void) {} #endif -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); #else @@ -368,7 +359,9 @@ extern void entry_ibpb(void); extern void (*x86_return_thunk)(void); -#ifdef CONFIG_CALL_DEPTH_TRACKING +extern void __warn_thunk(void); + +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ @@ -382,14 +375,14 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif -#else /* !CONFIG_CALL_DEPTH_TRACKING */ +#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif /* CONFIG_CALL_DEPTH_TRACKING */ +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; @@ -410,7 +403,7 @@ static inline void call_depth_return_thunk(void) {} /* * Inline asm uses the %V modifier which is only in newer GCC - * which is ensured when CONFIG_RETPOLINE is defined. + * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ # define CALL_NOSPEC \ ALTERNATIVE_2( \ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index d18e5c332cb9..1b93ff80b43b 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b40c462b4af3..b3ab80a03365 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -10,7 +10,6 @@ #include #include #include -#include struct pci_sysdata { int domain; /* PCI domain */ @@ -124,16 +123,4 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif -struct pci_setup_rom { - struct setup_data data; - uint16_t vendor; - uint16_t devid; - uint64_t pcilen; - unsigned long segment; - unsigned long bus; - unsigned long device; - unsigned long function; - uint8_t romdata[]; -}; - #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 5e01883eb51e..44958ebaf626 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -4,17 +4,21 @@ #ifdef CONFIG_X86_64 #define __percpu_seg gs +#define __percpu_rel (%rip) #else #define __percpu_seg fs +#define __percpu_rel #endif #ifdef __ASSEMBLY__ #ifdef CONFIG_SMP -#define PER_CPU_VAR(var) %__percpu_seg:var -#else /* ! SMP */ -#define PER_CPU_VAR(var) var -#endif /* SMP */ +#define __percpu %__percpu_seg: +#else +#define __percpu +#endif + +#define PER_CPU_VAR(var) __percpu(var)__percpu_rel #ifdef CONFIG_X86_64_SMP #define INIT_PER_CPU_VAR(var) init_per_cpu__##var @@ -24,30 +28,84 @@ #else /* ...!ASSEMBLY */ +#include #include #include #ifdef CONFIG_SMP + +#ifdef CONFIG_CC_HAS_NAMED_AS + +#ifdef __CHECKER__ +#define __seg_gs __attribute__((address_space(__seg_gs))) +#define __seg_fs __attribute__((address_space(__seg_fs))) +#endif + +#ifdef CONFIG_X86_64 +#define __percpu_seg_override __seg_gs +#else +#define __percpu_seg_override __seg_fs +#endif + +#define __percpu_prefix "" + +#else /* CONFIG_CC_HAS_NAMED_AS */ + +#define __percpu_seg_override #define __percpu_prefix "%%"__stringify(__percpu_seg)":" + +#endif /* CONFIG_CC_HAS_NAMED_AS */ + +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset this_cpu_read(this_cpu_off) +#ifdef CONFIG_USE_X86_SEG_SUPPORT +/* + * Efficient implementation for cases in which the compiler supports + * named address spaces. Allows the compiler to perform additional + * optimizations that can save more instructions. + */ +#define arch_raw_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \ + \ + tcp_ptr__ += (unsigned long)(ptr); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +}) +#else /* CONFIG_USE_X86_SEG_SUPPORT */ /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. */ -#define arch_raw_cpu_ptr(ptr) \ -({ \ - unsigned long tcp_ptr__; \ - asm ("add " __percpu_arg(1) ", %0" \ - : "=r" (tcp_ptr__) \ - : "m" (this_cpu_off), "0" (ptr)); \ - (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +#define arch_raw_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + asm ("mov " __percpu_arg(1) ", %0" \ + : "=r" (tcp_ptr__) \ + : "m" (__my_cpu_var(this_cpu_off))); \ + \ + tcp_ptr__ += (unsigned long)(ptr); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ }) -#else -#define __percpu_prefix "" -#endif +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ +#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel + +#else /* CONFIG_SMP */ +#define __percpu_seg_override +#define __percpu_prefix "" +#define __force_percpu_prefix "" + +#define PER_CPU_VAR(var) (var)__percpu_rel + +#endif /* CONFIG_SMP */ + +#define __my_cpu_type(var) typeof(var) __percpu_seg_override +#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr) +#define __my_cpu_var(var) (*__my_cpu_ptr(&var)) #define __percpu_arg(x) __percpu_prefix "%" #x +#define __force_percpu_arg(x) __force_percpu_prefix "%" #x /* * Initialized pointers to per-cpu variables needed for the boot @@ -107,14 +165,14 @@ do { \ (void)pto_tmp__; \ } \ asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ - : [var] "+m" (_var) \ + : [var] "+m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) #define percpu_unary_op(size, qual, op, _var) \ ({ \ asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ - : [var] "+m" (_var)); \ + : [var] "+m" (__my_cpu_var(_var))); \ }) /* @@ -144,16 +202,16 @@ do { \ __pcpu_type_##size pfo_val__; \ asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "m" (_var)); \ + : [var] "m" (__my_cpu_var(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ }) #define percpu_stable_op(size, op, _var) \ ({ \ __pcpu_type_##size pfo_val__; \ - asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \ + asm(__pcpu_op2_##size(op, __force_percpu_arg(a[var]), "%[val]") \ : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "p" (&(_var))); \ + : [var] "i" (&(_var))); \ (typeof(_var))(unsigned long) pfo_val__; \ }) @@ -166,7 +224,7 @@ do { \ asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ __percpu_arg([var])) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ (typeof(_var))(unsigned long) (paro_tmp__ + _val); \ }) @@ -187,7 +245,7 @@ do { \ __percpu_arg([var])) \ "\n\tjnz 1b" \ : [oval] "=&a" (pxo_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pxo_new__) \ : "memory"); \ (typeof(_var))(unsigned long) pxo_old__; \ @@ -204,7 +262,7 @@ do { \ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ __percpu_arg([var])) \ : [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ (typeof(_var))(unsigned long) pco_old__; \ @@ -221,7 +279,7 @@ do { \ CC_SET(z) \ : CC_OUT(z) (success), \ [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ if (unlikely(!success)) \ @@ -244,7 +302,7 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -276,7 +334,7 @@ do { \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ CC_SET(z) \ : CC_OUT(z) (success), \ - [var] "+m" (_var), \ + [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -313,7 +371,7 @@ do { \ \ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -345,7 +403,7 @@ do { \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ CC_SET(z) \ : CC_OUT(z) (success), \ - [var] "+m" (_var), \ + [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ @@ -366,9 +424,9 @@ do { \ * accessed while this_cpu_read_stable() allows the value to be cached. * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across cpus. The current users include - * get_current() and get_thread_info() both of which are actually - * per-thread variables implemented as per-cpu variables and thus - * stable for the duration of the respective task. + * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are + * actually per-thread variables implemented as per-CPU variables and + * thus stable for the duration of the respective task. */ #define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp) #define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp) @@ -376,13 +434,72 @@ do { \ #define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp) #define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) +#ifdef CONFIG_USE_X86_SEG_SUPPORT + +#define __raw_cpu_read(qual, pcp) \ +({ \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \ +}) + +#define __raw_cpu_write(qual, pcp, val) \ +do { \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \ +} while (0) + +#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val) +#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val) +#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val) + +#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_read_4(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val) +#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val) +#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val) + +#ifdef CONFIG_X86_64 +#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp) +#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val) + +#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp) +#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val) +#endif + +#define this_cpu_read_const(pcp) __raw_cpu_read(, pcp) +#else /* CONFIG_USE_X86_SEG_SUPPORT */ + #define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp) #define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp) #define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp) - #define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val) #define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val) #define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val) + +#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) + +#ifdef CONFIG_X86_64 +#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) + +#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) +#endif + +/* + * The generic per-cpu infrastrucutre is not suitable for + * reading const-qualified variables. + */ +#define this_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; }) +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + #define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) #define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) #define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) @@ -408,12 +525,6 @@ do { \ #define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) -#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) #define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) #define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) #define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) @@ -452,8 +563,6 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) #define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) #define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val) #define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val) @@ -462,8 +571,6 @@ do { \ #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) #define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) -#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) #define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val) #define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val) @@ -494,7 +601,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, asm volatile("btl "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); + : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr)); return oldbit; } diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 94de1a05aeba..d65e338b6a5f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -181,7 +181,7 @@ static inline u64 p4_clear_ht_bit(u64 config) static inline int p4_ht_active(void) { #ifdef CONFIG_SMP - return smp_num_siblings > 1; + return __max_threads_per_core > 1; #endif return 0; } @@ -189,7 +189,7 @@ static inline int p4_ht_active(void) static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2) + if (__max_threads_per_core == 2) return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334..dcd836b59beb 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 9e7c0b719c3c..dabafba957ea 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -52,7 +52,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif pxx_xchg64(pud, pudp, native_pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a10..df0f7d4a96f3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -909,7 +909,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* @@ -923,12 +923,12 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } -#else /* CONFIG_PAGE_TABLE_ISOLATION */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLY__ */ @@ -1131,7 +1131,7 @@ static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; @@ -1177,7 +1177,7 @@ static inline int pgd_bad(pgd_t pgd) if (!pgtable_l5_enabled()) return 0; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -1422,9 +1422,9 @@ static inline bool pgdp_maps_userspace(void *__ptr) #define pgd_leaf pgd_large static inline int pgd_large(pgd_t pgd) { return 0; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. @@ -1469,7 +1469,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); @@ -1484,7 +1484,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 24af25b1551a..7e9db77231ac 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,7 +143,8 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; - if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + if (pgtable_l5_enabled() || + !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index af77235fded6..919909d8cb77 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -91,7 +91,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e, __percpu_arg([var])); } diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index d8cccadc83a6..e5f204b9b33d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -51,7 +51,7 @@ #define CR3_NOFLUSH 0 #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define X86_CR3_PTI_PCID_USER_BIT 11 #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 26620d7642a9..811548f131f4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -20,7 +20,6 @@ struct vm86; #include #include #include -#include #include #include #include @@ -100,6 +99,9 @@ struct cpuinfo_topology { u32 logical_pkg_id; u32 logical_die_id; + // AMD Node ID and Nodes per Package info + u32 amd_node_id; + // Cache level topology IDs u32 llc_id; u32 l2c_id; @@ -119,8 +121,6 @@ struct cpuinfo_x86 { #endif __u8 x86_virt_bits; __u8 x86_phys_bits; - /* CPUID returned core id bits: */ - __u8 x86_coreid_bits; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -148,8 +148,6 @@ struct cpuinfo_x86 { unsigned long loops_per_jiffy; /* protected processor identification number */ u64 ppin; - /* cpuid returned max cores value: */ - u16 x86_max_cores; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; @@ -186,13 +184,8 @@ extern struct cpuinfo_x86 new_cpu_data; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; -#ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) -#else -#define cpu_info boot_cpu_data -#define cpu_data(cpu) boot_cpu_data -#endif extern const struct seq_operations cpuinfo_op; @@ -533,6 +526,9 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_pcpu_hot.top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } @@ -555,7 +551,7 @@ static inline void load_sp0(unsigned long sp0) unsigned long __get_wchan(struct task_struct *p); -extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void select_idle_routine(void); extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; @@ -576,28 +572,6 @@ extern void cpu_init(void); extern void cpu_init_exception_handling(void); extern void cr4_init(void); -static inline unsigned long get_debugctlmsr(void) -{ - unsigned long debugctlmsr = 0; - -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return 0; -#endif - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); - - return debugctlmsr; -} - -static inline void update_debugctlmsr(unsigned long debugctlmsr) -{ -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return; -#endif - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); -} - extern void set_task_blockstep(struct task_struct *task, bool on); /* Boot loader type from the setup header: */ @@ -664,8 +638,10 @@ static __always_inline void prefetchw(const void *x) #else extern unsigned long __end_init_task[]; -#define INIT_THREAD { \ - .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \ +#define INIT_THREAD { \ + .sp = (unsigned long)&__end_init_task - \ + TOP_OF_KERNEL_STACK_PADDING - \ + sizeof(struct pt_regs), \ } extern unsigned long KSTK_ESP(struct task_struct *task); @@ -704,12 +680,10 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu) } #ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); extern void amd_clear_divider(void); extern void amd_check_microcode(void); #else -static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 65dee2420624..043758a2e627 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -23,11 +23,11 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); void x86_of_pci_init(void); -void x86_dtb_init(void); +void x86_dtb_parse_smp_config(void); #else static inline void add_dtb(u64 data) { } static inline void x86_of_pci_init(void) { } -static inline void x86_dtb_init(void) { } +static inline void x86_dtb_parse_smp_config(void) { } #define of_ioapic 0 #endif diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 07375b476c4f..ab167c96b9ab 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -3,7 +3,7 @@ #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); extern void pti_finalize(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index f4db78b09c8f..5a83fbd9bc0b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -56,18 +56,64 @@ struct pt_regs { #else /* __i386__ */ +struct fred_cs { + /* CS selector */ + u64 cs : 16, + /* Stack level at event time */ + sl : 2, + /* IBT in WAIT_FOR_ENDBRANCH state */ + wfe : 1, + : 45; +}; + +struct fred_ss { + /* SS selector */ + u64 ss : 16, + /* STI state */ + sti : 1, + /* Set if syscall, sysenter or INT n */ + swevent : 1, + /* Event is NMI type */ + nmi : 1, + : 13, + /* Event vector */ + vector : 8, + : 8, + /* Event type */ + type : 4, + : 4, + /* Event was incident to enclave execution */ + enclave : 1, + /* CPU was in long mode */ + lm : 1, + /* + * Nested exception during FRED delivery, not set + * for #DF. + */ + nested : 1, + : 1, + /* + * The length of the instruction causing the event. + * Only set for INTO, INT1, INT3, INT n, SYSCALL + * and SYSENTER. 0 otherwise. + */ + insnlen : 4; +}; + struct pt_regs { -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ + /* + * C ABI says these regs are callee-preserved. They aren't saved on + * kernel entry unless syscall needs a complete, fully filled + * "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* These regs are callee-clobbered. Always saved on kernel entry. */ + + /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -77,18 +123,50 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ + + /* + * orig_ax is used on entry for: + * - the syscall number (syscall, sysenter, int80) + * - error_code stored by the CPU on traps and exceptions + * - the interrupt number for device interrupts + * + * A FRED stack frame starts here: + * 1) It _always_ includes an error code; + * + * 2) The return frame for ERET[US] starts here, but + * the content of orig_ax is ignored. + */ unsigned long orig_ax; -/* Return frame for iretq */ + + /* The IRETQ return frame starts here */ unsigned long ip; - unsigned long cs; + + union { + /* CS selector */ + u16 cs; + /* The extended 64-bit data slot containing CS */ + u64 csx; + /* The FRED CS extension */ + struct fred_cs fred_cs; + }; + unsigned long flags; unsigned long sp; - unsigned long ss; -/* top of stack page */ + + union { + /* SS selector */ + u16 ss; + /* The extended 64-bit data slot containing SS */ + u64 ssx; + /* The FRED SS extension */ + struct fred_ss fred_ss; + }; + + /* + * Top of stack on IDT systems, while FRED systems have extra fields + * defined above for storing exception related information, e.g. CR2 or + * DR6. + */ }; #endif /* !__i386__ */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 255a78d9d906..12dbd2588ca7 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,6 +7,13 @@ #include #include +/* + * This value can never be a valid CLOSID, and is used when mapping a + * (closid, rmid) pair to an index and back. On x86 only the RMID is + * needed. The index is a software defined value. + */ +#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0) + /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID @@ -31,10 +38,47 @@ struct resctrl_pqr_state { DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline bool resctrl_arch_alloc_capable(void) +{ + return rdt_alloc_capable; +} + +static inline void resctrl_arch_enable_alloc(void) +{ + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_alloc(void) +{ + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + +static inline bool resctrl_arch_mon_capable(void) +{ + return rdt_mon_capable; +} + +static inline void resctrl_arch_enable_mon(void) +{ + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_mon(void) +{ + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * @@ -88,12 +132,58 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } +static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, + u32 closid, u32 rmid) +{ + WRITE_ONCE(tsk->closid, closid); + WRITE_ONCE(tsk->rmid, rmid); +} + +static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return READ_ONCE(tsk->closid) == closid; +} + +static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, + u32 rmid) +{ + return READ_ONCE(tsk->rmid) == rmid; +} + static inline void resctrl_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) __resctrl_sched_in(tsk); } +static inline u32 resctrl_arch_system_num_rmid_idx(void) +{ + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return boot_cpu_data.x86_cache_max_rmid + 1; +} + +static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *rmid = idx; + *closid = X86_RESCTRL_EMPTY_CLOSID; +} + +static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) +{ + return rmid; +} + +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +{ + might_sleep(); + return NULL; +}; + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, + void *ctx) { }; + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index a5e89641bd2d..9aee31862b4a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -47,6 +47,7 @@ int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); +int set_memory_p(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h new file mode 100644 index 000000000000..77c51111a893 --- /dev/null +++ b/arch/x86/include/asm/setup_data.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SETUP_DATA_H +#define _ASM_X86_SETUP_DATA_H + +#include + +#ifndef __ASSEMBLY__ + +struct pci_setup_rom { + struct setup_data data; + uint16_t vendor; + uint16_t devid; + uint64_t pcilen; + unsigned long segment; + unsigned long bus; + unsigned long device; + unsigned long function; + uint8_t romdata[]; +}; + +/* kexec external ABI */ +struct efi_setup_data { + u64 fw_vendor; + u64 __unused; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index cf671138feef..9477b4053bce 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -13,7 +13,6 @@ #include #include -#include #include #define GHCB_PROTOCOL_MIN 1ULL @@ -22,6 +21,8 @@ #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } +struct boot_params; + enum es_result { ES_OK, /* All good */ ES_UNSUPPORTED, /* Requested operation not supported */ @@ -228,6 +229,7 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void kdump_sev_callback(void); +void sev_show_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -257,6 +259,7 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void kdump_sev_callback(void) { } +static inline void sev_show_status(void) { } #endif #ifdef CONFIG_KVM_AMD_SEV diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4fab2ed454f3..a35936b512fe 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -8,9 +8,6 @@ #include #include -extern int smp_num_siblings; -extern unsigned int num_processors; - DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); @@ -59,11 +56,6 @@ static inline void stop_other_cpus(void) smp_ops.stop_other_cpus(1); } -static inline void smp_prepare_boot_cpu(void) -{ - smp_ops.smp_prepare_boot_cpu(); -} - static inline void smp_prepare_cpus(unsigned int max_cpus) { smp_ops.smp_prepare_cpus(max_cpus); @@ -110,7 +102,6 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); -void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); @@ -174,8 +165,6 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) } #endif /* CONFIG_SMP */ -extern unsigned disabled_cpus; - #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); #else diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index c648502e4535..658b690b2ccb 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -96,4 +96,6 @@ static inline void speculative_store_bypass_ht_init(void) { } extern void speculation_ctrl_update(unsigned long tif); extern void speculation_ctrl_update_current(void); +extern bool itlb_multihit_kvm_mitigation; + #endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 48f8dd47cf68..2e9fc5c400cd 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -2,11 +2,11 @@ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H - #ifdef __KERNEL__ - #include #include + +#include #include #include @@ -224,10 +224,10 @@ static inline void serialize(void) } /* The dst parameter must be 64-bytes aligned */ -static inline void movdir64b(void __iomem *dst, const void *src) +static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } __iomem *__dst = dst; + struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. @@ -245,6 +245,11 @@ static inline void movdir64b(void __iomem *dst, const void *src) : "m" (*__src), "a" (__dst), "d" (__src)); } +static inline void movdir64b_io(void __iomem *dst, const void *src) +{ + movdir64b((void __force *)dst, src); +} + /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 343b722ccaf2..125c407e2abe 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -46,7 +46,7 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)") -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") #else diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index f42dbf17f52b..c3bd0c0758c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,9 +70,13 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* WRMSRNS is a baseline feature for FRED. */ + wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); + } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); + } #endif } diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 0b70653a98c1..345aafbc1964 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -15,6 +15,8 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); +extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d63b02940747..12da7dfd5ef1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. */ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 5f87f6b9b09e..abe3a8f22cbd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -102,6 +102,35 @@ static inline void setup_node_to_cpumask_map(void) { } #include +/* Topology information */ +enum x86_topology_domains { + TOPO_SMT_DOMAIN, + TOPO_CORE_DOMAIN, + TOPO_MODULE_DOMAIN, + TOPO_TILE_DOMAIN, + TOPO_DIE_DOMAIN, + TOPO_DIEGRP_DOMAIN, + TOPO_PKG_DOMAIN, + TOPO_MAX_DOMAIN, +}; + +struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +}; + +extern struct x86_topology_system x86_topo_system; + +static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom) +{ + return x86_topo_system.dom_size[dom]; +} + +static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom) +{ + return dom == TOPO_SMT_DOMAIN ? 0 : x86_topo_system.dom_shifts[dom - 1]; +} + extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); @@ -112,7 +141,42 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) -extern unsigned int __max_die_per_package; +#define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id) + +extern unsigned int __max_dies_per_package; +extern unsigned int __max_logical_packages; +extern unsigned int __max_threads_per_core; +extern unsigned int __num_threads_per_package; +extern unsigned int __num_cores_per_package; + +static inline unsigned int topology_max_packages(void) +{ + return __max_logical_packages; +} + +static inline unsigned int topology_max_dies_per_package(void) +{ + return __max_dies_per_package; +} + +static inline unsigned int topology_num_cores_per_package(void) +{ + return __num_cores_per_package; +} + +static inline unsigned int topology_num_threads_per_package(void) +{ + return __num_threads_per_package; +} + +#ifdef CONFIG_X86_LOCAL_APIC +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); +#else +static inline int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + return 0; +} +#endif #ifdef CONFIG_SMP #define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) @@ -121,12 +185,11 @@ extern unsigned int __max_die_per_package; #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -extern unsigned int __max_logical_packages; -#define topology_max_packages() (__max_logical_packages) -static inline int topology_max_die_per_package(void) +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { - return __max_die_per_package; + return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN], + TOPO_PKG_DOMAIN); } extern int __max_smt_threads; @@ -138,9 +201,12 @@ static inline int topology_max_smt_threads(void) #include -int topology_update_package_map(unsigned int apicid, unsigned int cpu); -int topology_update_die_map(unsigned int dieid, unsigned int cpu); -int topology_phys_to_logical_pkg(unsigned int pkg); +extern unsigned int __amd_nodes_per_pkg; + +static inline unsigned int topology_amd_nodes_per_pkg(void) +{ + return __amd_nodes_per_pkg; +} extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -153,16 +219,12 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } + #else /* CONFIG_SMP */ -#define topology_max_packages() (1) -static inline int -topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } -static inline int -topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b7..8d1154cdf787 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions +#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 594fce0ca744..405efb3e4996 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -5,8 +5,9 @@ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H -#include #include +#include +#include /* * Standard way to access the cycle counter. diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2c02e4469cc..04789f45ab2b 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_ADDRESS_MASKING /* @@ -18,14 +19,10 @@ */ static inline unsigned long __untagged_addr(unsigned long addr) { - /* - * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation - * in alternative instructions. The relocation gets wrong when gets - * copied to the target place. - */ asm (ALTERNATIVE("", - "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM) - : [addr] "+r" (addr) : "m" (tlbstate_untag_mask)); + "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) + : [addr] "+r" (addr) + : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); return addr; } @@ -54,7 +51,7 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, * half and a user half. When cast to a signed type, user pointers * are positive and kernel pointers are negative. */ -#define valid_user_address(x) ((long)(x) >= 0) +#define valid_user_address(x) ((__force long)(x) >= 0) /* * User pointers can have tag bits on x86-64. This scheme tolerates @@ -87,8 +84,9 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size) if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) { return valid_user_address(ptr); } else { - unsigned long sum = size + (unsigned long)ptr; - return valid_user_address(sum) && sum >= (unsigned long)ptr; + unsigned long sum = size + (__force unsigned long)ptr; + + return valid_user_address(sum) && sum >= (__force unsigned long)ptr; } } #define __access_ok __access_ok diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0e73616b82f3..4dba17363008 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -17,6 +17,7 @@ #include #include +#include #include #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -374,14 +375,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c878616a18b8..b89b40f250e6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include - struct ghcb; struct mpc_bus; struct mpc_cpu; @@ -15,13 +13,15 @@ struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops * @setup_ioapic_ids: platform specific ioapic id override - * @find_smp_config: find the smp configuration - * @get_smp_config: get the smp configuration + * @find_mptable: Find MPTABLE early to reserve the memory region + * @early_parse_smp_cfg: Parse the SMP configuration data early before initmem_init() + * @parse_smp_cfg: Parse the SMP configuration data */ struct x86_init_mpparse { void (*setup_ioapic_ids)(void); - void (*find_smp_config)(void); - void (*get_smp_config)(unsigned int early); + void (*find_mptable)(void); + void (*early_parse_smp_cfg)(void); + void (*parse_smp_cfg)(void); }; /** diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index eeea058cf602..9b82eebd7add 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -2,21 +2,7 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H -/* setup_data/setup_indirect types */ -#define SETUP_NONE 0 -#define SETUP_E820_EXT 1 -#define SETUP_DTB 2 -#define SETUP_PCI 3 -#define SETUP_EFI 4 -#define SETUP_APPLE_PROPERTIES 5 -#define SETUP_JAILHOUSE 6 -#define SETUP_CC_BLOB 7 -#define SETUP_IMA 8 -#define SETUP_RNG_SEED 9 -#define SETUP_ENUM_MAX SETUP_RNG_SEED - -#define SETUP_INDIRECT (1<<31) -#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) +#include /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -49,22 +35,6 @@ #include #include