From a6fe043880820981f6e4918240f967ea79bb063e Mon Sep 17 00:00:00 2001 From: Kameron Carr Date: Fri, 23 Jun 2023 15:09:49 -0700 Subject: [PATCH 001/230] Drivers: hv: Change hv_free_hyperv_page() to take void * argument Currently hv_free_hyperv_page() takes an unsigned long argument, which is inconsistent with the void * return value from the corresponding hv_alloc_hyperv_page() function and variants. This creates unnecessary extra casting. Change the hv_free_hyperv_page() argument type to void *. Also remove redundant casts from invocations of hv_alloc_hyperv_page() and variants. Signed-off-by: Kameron Carr Reviewed-by: Nuno Das Neves Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1687558189-19734-1-git-send-email-kameroncarr@linux.microsoft.com Signed-off-by: Wei Liu --- drivers/hv/connection.c | 13 ++++++------- drivers/hv/hv_common.c | 10 +++++----- include/asm-generic/mshyperv.h | 2 +- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 5978e9dbc286..ebf15f31d97e 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -209,8 +209,7 @@ int vmbus_connect(void) * Setup the vmbus event connection for channel interrupt * abstraction stuff */ - vmbus_connection.int_page = - (void *)hv_alloc_hyperv_zeroed_page(); + vmbus_connection.int_page = hv_alloc_hyperv_zeroed_page(); if (vmbus_connection.int_page == NULL) { ret = -ENOMEM; goto cleanup; @@ -225,8 +224,8 @@ int vmbus_connect(void) * Setup the monitor notification facility. The 1st page for * parent->child and the 2nd page for child->parent */ - vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_page(); - vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_page(); + vmbus_connection.monitor_pages[0] = hv_alloc_hyperv_page(); + vmbus_connection.monitor_pages[1] = hv_alloc_hyperv_page(); if ((vmbus_connection.monitor_pages[0] == NULL) || (vmbus_connection.monitor_pages[1] == NULL)) { ret = -ENOMEM; @@ -333,15 +332,15 @@ void vmbus_disconnect(void) destroy_workqueue(vmbus_connection.work_queue); if (vmbus_connection.int_page) { - hv_free_hyperv_page((unsigned long)vmbus_connection.int_page); + hv_free_hyperv_page(vmbus_connection.int_page); vmbus_connection.int_page = NULL; } set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); - hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[0]); - hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[1]); + hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); vmbus_connection.monitor_pages[0] = NULL; vmbus_connection.monitor_pages[1] = NULL; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 542a1d53b303..6a2258fef1fe 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -115,12 +115,12 @@ void *hv_alloc_hyperv_zeroed_page(void) } EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); -void hv_free_hyperv_page(unsigned long addr) +void hv_free_hyperv_page(void *addr) { if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - free_page(addr); + free_page((unsigned long)addr); else - kfree((void *)addr); + kfree(addr); } EXPORT_SYMBOL_GPL(hv_free_hyperv_page); @@ -253,7 +253,7 @@ static void hv_kmsg_dump_unregister(void) atomic_notifier_chain_unregister(&panic_notifier_list, &hyperv_panic_report_block); - hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_free_hyperv_page(hv_panic_page); hv_panic_page = NULL; } @@ -270,7 +270,7 @@ static void hv_kmsg_dump_register(void) ret = kmsg_dump_register(&hv_kmsg_dumper); if (ret) { pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); - hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_free_hyperv_page(hv_panic_page); hv_panic_page = NULL; } } diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 402a8c1c202d..a8f4b653ef4e 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -190,7 +190,7 @@ int hv_common_cpu_die(unsigned int cpu); void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); -void hv_free_hyperv_page(unsigned long addr); +void hv_free_hyperv_page(void *addr); /** * hv_cpu_number_to_vp_number() - Map CPU to VP. From 55e544e1a922d272b62ec576a3de92329f838ce9 Mon Sep 17 00:00:00 2001 From: Nischala Yelchuri Date: Tue, 20 Jun 2023 11:40:38 -0700 Subject: [PATCH 002/230] x86/hyperv: Improve code for referencing hyperv_pcpu_input_arg Several places in code for Hyper-V reference the per-CPU variable hyperv_pcpu_input_arg. Older code uses a multi-line sequence to reference the variable, and usually includes a cast. Newer code does a much simpler direct assignment. The latter is preferable as the complexity of the older code is unnecessary. Update older code to use the simpler direct assignment. Signed-off-by: Nischala Yelchuri Link: https://lore.kernel.org/r/1687286438-9421-1-git-send-email-niyelchu@linux.microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_apic.c | 4 +--- arch/x86/hyperv/ivm.c | 7 +++---- arch/x86/hyperv/mmu.c | 12 ++---------- arch/x86/hyperv/nested.c | 11 ++--------- drivers/hv/hv_balloon.c | 2 +- 5 files changed, 9 insertions(+), 27 deletions(-) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 1fbda2f94184..b21335e6a210 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -107,7 +107,6 @@ static bool cpu_is_self(int cpu) static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, bool exclude_self) { - struct hv_send_ipi_ex **arg; struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; @@ -117,9 +116,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, return false; local_irq_save(flags); - arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + ipi_arg = *this_cpu_ptr(hyperv_pcpu_input_arg); - ipi_arg = *arg; if (unlikely(!ipi_arg)) goto ipi_mask_ex_done; diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index cc92388b7a99..9b63151bc6cd 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], enum hv_mem_host_visibility visibility) { - struct hv_gpa_range_for_visibility **input_pcpu, *input; + struct hv_gpa_range_for_visibility *input; u16 pages_processed; u64 hv_status; unsigned long flags; @@ -262,9 +262,8 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], } local_irq_save(flags); - input_pcpu = (struct hv_gpa_range_for_visibility **) - this_cpu_ptr(hyperv_pcpu_input_arg); - input = *input_pcpu; + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + if (unlikely(!input)) { local_irq_restore(flags); return -EINVAL; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 8460bd35e10c..1cc113200ff5 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -61,7 +61,6 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, const struct flush_tlb_info *info) { int cpu, vcpu, gva_n, max_gvas; - struct hv_tlb_flush **flush_pcpu; struct hv_tlb_flush *flush; u64 status; unsigned long flags; @@ -74,10 +73,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, local_irq_save(flags); - flush_pcpu = (struct hv_tlb_flush **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (unlikely(!flush)) { local_irq_restore(flags); @@ -178,17 +174,13 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, const struct flush_tlb_info *info) { int nr_bank = 0, max_gvas, gva_n; - struct hv_tlb_flush_ex **flush_pcpu; struct hv_tlb_flush_ex *flush; u64 status; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return HV_STATUS_INVALID_PARAMETER; - flush_pcpu = (struct hv_tlb_flush_ex **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (info->mm) { /* diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index 5d70968c8538..9dc259fa322e 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -19,7 +19,6 @@ int hyperv_flush_guest_mapping(u64 as) { - struct hv_guest_mapping_flush **flush_pcpu; struct hv_guest_mapping_flush *flush; u64 status; unsigned long flags; @@ -30,10 +29,7 @@ int hyperv_flush_guest_mapping(u64 as) local_irq_save(flags); - flush_pcpu = (struct hv_guest_mapping_flush **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (unlikely(!flush)) { local_irq_restore(flags); @@ -90,7 +86,6 @@ EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list); int hyperv_flush_guest_mapping_range(u64 as, hyperv_fill_flush_list_func fill_flush_list_func, void *data) { - struct hv_guest_mapping_flush_list **flush_pcpu; struct hv_guest_mapping_flush_list *flush; u64 status; unsigned long flags; @@ -102,10 +97,8 @@ int hyperv_flush_guest_mapping_range(u64 as, local_irq_save(flags); - flush_pcpu = (struct hv_guest_mapping_flush_list **) - this_cpu_ptr(hyperv_pcpu_input_arg); + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); - flush = *flush_pcpu; if (unlikely(!flush)) { local_irq_restore(flags); goto fault; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index dffcc894f117..0d7a3ba66396 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1628,7 +1628,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); WARN_ON_ONCE(sgl->length < (HV_HYP_PAGE_SIZE << page_reporting_order)); local_irq_save(flags); - hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg); + hint = *this_cpu_ptr(hyperv_pcpu_input_arg); if (!hint) { local_irq_restore(flags); return -ENOSPC; From 6e2acbfe59b83043bc7ae1bb39fac4fc9dcd5a18 Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Wed, 5 Jul 2023 00:54:04 +0300 Subject: [PATCH 003/230] clk: meson: change usleep_range() to udelay() for atomic context The function meson_clk_pll_enable() can be invoked under the enable_lock spinlock from the clk core logic, which risks a kernel panic during the usleep_range() call: BUG: scheduling while atomic: kworker/u4:2/36/0x00000002 Modules linked in: g_ffs usb_f_fs libcomposite CPU: 1 PID: 36 Comm: kworker/u4:2 Not tainted 6.4.0-rc5 #273 Workqueue: events_unbound async_run_entry_fn Call trace: dump_backtrace+0x9c/0x128 show_stack+0x20/0x38 dump_stack_lvl+0x48/0x60 dump_stack+0x18/0x28 __schedule_bug+0x58/0x78 __schedule+0x828/0xa88 schedule+0x64/0xd8 schedule_hrtimeout_range_clock+0xd0/0x208 schedule_hrtimeout_range+0x1c/0x30 usleep_range_state+0x6c/0xa8 meson_clk_pll_enable+0x1f4/0x310 clk_core_enable+0x78/0x200 clk_core_enable+0x58/0x200 clk_core_enable+0x58/0x200 clk_core_enable+0x58/0x200 clk_enable+0x34/0x60 So it is required to use the udelay() function instead of usleep_range() for the atomic context safety. Fixes: b6ec400aa153 ("clk: meson: introduce new pll power-on sequence for A1 SoC family") Reported-by: Jan Dakinevich Signed-off-by: Dmitry Rokosov Link: https://lore.kernel.org/r/20230704215404.11533-1-ddrokosov@sberdevices.ru Signed-off-by: Jerome Brunet --- drivers/clk/meson/clk-pll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clk/meson/clk-pll.c b/drivers/clk/meson/clk-pll.c index 8fef90bf962f..6fa7639a3050 100644 --- a/drivers/clk/meson/clk-pll.c +++ b/drivers/clk/meson/clk-pll.c @@ -367,9 +367,9 @@ static int meson_clk_pll_enable(struct clk_hw *hw) * 3. enable the lock detect module */ if (MESON_PARM_APPLICABLE(&pll->current_en)) { - usleep_range(10, 20); + udelay(10); meson_parm_write(clk->map, &pll->current_en, 1); - usleep_range(40, 50); + udelay(40); } if (MESON_PARM_APPLICABLE(&pll->l_detect)) { From b2ec116aad38aa9c8b67fad4314e50823adf6949 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Jul 2023 12:38:20 +0200 Subject: [PATCH 004/230] workqueue: Fix cpu_intensive_thresh_us name in help text There exists no parameter called "cpu_intensive_threshold_us". The actual parameter name is "cpu_intensive_thresh_us". Fixes: 6363845005202148 ("workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism") Signed-off-by: Geert Uytterhoeven Reviewed-by: Randy Dunlap Signed-off-by: Tejun Heo --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index fbc89baf7de6..d6798513a8c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1200,7 +1200,7 @@ config WQ_CPU_INTENSIVE_REPORT help Say Y here to enable reporting of concurrency-managed per-cpu work items that hog CPUs for longer than - workqueue.cpu_intensive_threshold_us. Workqueue automatically + workqueue.cpu_intensive_thresh_us. Workqueue automatically detects and excludes them from concurrency management to prevent them from stalling other per-cpu work items. Occassional triggering may not necessarily indicate a problem. Repeated From 8544cda94dae6be3f1359539079c68bb731428b1 Mon Sep 17 00:00:00 2001 From: Olivier Maignial Date: Fri, 23 Jun 2023 17:33:36 +0200 Subject: [PATCH 005/230] mtd: spinand: toshiba: Fix ecc_get_status Reading ECC status is failing. tx58cxgxsxraix_ecc_get_status() is using on-stack buffer for SPINAND_GET_FEATURE_OP() output. It is not suitable for DMA needs of spi-mem. Fix this by using the spi-mem operations dedicated buffer spinand->scratchbuf. See spinand->scratchbuf: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/mtd/spinand.h?h=v6.3#n418 spi_mem_check_op(): https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/spi/spi-mem.c?h=v6.3#n199 Fixes: 10949af1681d ("mtd: spinand: Add initial support for Toshiba TC58CVG2S0H") Cc: stable@vger.kernel.org Signed-off-by: Olivier Maignial Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/DB4P250MB1032553D05FBE36DEE0D311EFE23A@DB4P250MB1032.EURP250.PROD.OUTLOOK.COM --- drivers/mtd/nand/spi/toshiba.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/spi/toshiba.c b/drivers/mtd/nand/spi/toshiba.c index 7380b1ebaccd..a80427c13121 100644 --- a/drivers/mtd/nand/spi/toshiba.c +++ b/drivers/mtd/nand/spi/toshiba.c @@ -73,7 +73,7 @@ static int tx58cxgxsxraix_ecc_get_status(struct spinand_device *spinand, { struct nand_device *nand = spinand_to_nand(spinand); u8 mbf = 0; - struct spi_mem_op op = SPINAND_GET_FEATURE_OP(0x30, &mbf); + struct spi_mem_op op = SPINAND_GET_FEATURE_OP(0x30, spinand->scratchbuf); switch (status & STATUS_ECC_MASK) { case STATUS_ECC_NO_BITFLIPS: @@ -92,7 +92,7 @@ static int tx58cxgxsxraix_ecc_get_status(struct spinand_device *spinand, if (spi_mem_exec_op(spinand->spimem, &op)) return nanddev_get_ecc_conf(nand)->strength; - mbf >>= 4; + mbf = *(spinand->scratchbuf) >> 4; if (WARN_ON(mbf > nanddev_get_ecc_conf(nand)->strength || !mbf)) return nanddev_get_ecc_conf(nand)->strength; From f5a05060670a4d8d6523afc7963eb559c2e3615f Mon Sep 17 00:00:00 2001 From: Olivier Maignial Date: Fri, 23 Jun 2023 17:33:37 +0200 Subject: [PATCH 006/230] mtd: spinand: winbond: Fix ecc_get_status Reading ECC status is failing. w25n02kv_ecc_get_status() is using on-stack buffer for SPINAND_GET_FEATURE_OP() output. It is not suitable for DMA needs of spi-mem. Fix this by using the spi-mem operations dedicated buffer spinand->scratchbuf. See spinand->scratchbuf: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/mtd/spinand.h?h=v6.3#n418 spi_mem_check_op(): https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/spi/spi-mem.c?h=v6.3#n199 Fixes: 6154c7a58348 ("mtd: spinand: winbond: add Winbond W25N02KV flash support") Cc: stable@vger.kernel.org Signed-off-by: Olivier Maignial Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/DB4P250MB1032EDB9E36B764A33769039FE23A@DB4P250MB1032.EURP250.PROD.OUTLOOK.COM --- drivers/mtd/nand/spi/winbond.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 3ad58cd284d8..f507e3759301 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -108,7 +108,7 @@ static int w25n02kv_ecc_get_status(struct spinand_device *spinand, { struct nand_device *nand = spinand_to_nand(spinand); u8 mbf = 0; - struct spi_mem_op op = SPINAND_GET_FEATURE_OP(0x30, &mbf); + struct spi_mem_op op = SPINAND_GET_FEATURE_OP(0x30, spinand->scratchbuf); switch (status & STATUS_ECC_MASK) { case STATUS_ECC_NO_BITFLIPS: @@ -126,7 +126,7 @@ static int w25n02kv_ecc_get_status(struct spinand_device *spinand, if (spi_mem_exec_op(spinand->spimem, &op)) return nanddev_get_ecc_conf(nand)->strength; - mbf >>= 4; + mbf = *(spinand->scratchbuf) >> 4; if (WARN_ON(mbf > nanddev_get_ecc_conf(nand)->strength || !mbf)) return nanddev_get_ecc_conf(nand)->strength; From d8403b9eeee66d5dd81ecb9445800b108c267ce3 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Sun, 25 Jun 2023 00:10:21 +0530 Subject: [PATCH 007/230] mtd: rawnand: omap_elm: Fix incorrect type in assignment Once the ECC word endianness is converted to BE32, we force cast it to u32 so we can use elm_write_reg() which in turn uses writel(). Fixes below sparse warnings: drivers/mtd/nand/raw/omap_elm.c:180:37: sparse: expected unsigned int [usertype] val drivers/mtd/nand/raw/omap_elm.c:180:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:185:37: sparse: expected unsigned int [usertype] val drivers/mtd/nand/raw/omap_elm.c:185:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:190:37: sparse: expected unsigned int [usertype] val drivers/mtd/nand/raw/omap_elm.c:190:37: sparse: got restricted __be32 [usertype] >> drivers/mtd/nand/raw/omap_elm.c:200:40: sparse: sparse: restricted __be32 degrades to integer drivers/mtd/nand/raw/omap_elm.c:206:39: sparse: sparse: restricted __be32 degrades to integer drivers/mtd/nand/raw/omap_elm.c:210:37: sparse: expected unsigned int [assigned] [usertype] val drivers/mtd/nand/raw/omap_elm.c:210:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:213:37: sparse: expected unsigned int [assigned] [usertype] val drivers/mtd/nand/raw/omap_elm.c:213:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:216:37: sparse: expected unsigned int [assigned] [usertype] val drivers/mtd/nand/raw/omap_elm.c:216:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:219:37: sparse: expected unsigned int [assigned] [usertype] val drivers/mtd/nand/raw/omap_elm.c:219:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:222:37: sparse: expected unsigned int [assigned] [usertype] val drivers/mtd/nand/raw/omap_elm.c:222:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:225:37: sparse: expected unsigned int [assigned] [usertype] val drivers/mtd/nand/raw/omap_elm.c:225:37: sparse: got restricted __be32 [usertype] drivers/mtd/nand/raw/omap_elm.c:228:39: sparse: sparse: restricted __be32 degrades to integer Fixes: bf22433575ef ("mtd: devices: elm: Add support for ELM error correction") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306212211.WDXokuWh-lkp@intel.com/ Signed-off-by: Roger Quadros Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20230624184021.7740-1-rogerq@kernel.org --- drivers/mtd/nand/raw/omap_elm.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/mtd/nand/raw/omap_elm.c b/drivers/mtd/nand/raw/omap_elm.c index 6e1eac6644a6..4a97d4a76454 100644 --- a/drivers/mtd/nand/raw/omap_elm.c +++ b/drivers/mtd/nand/raw/omap_elm.c @@ -177,17 +177,17 @@ static void elm_load_syndrome(struct elm_info *info, switch (info->bch_type) { case BCH8_ECC: /* syndrome fragment 0 = ecc[9-12B] */ - val = cpu_to_be32(*(u32 *) &ecc[9]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[9]); elm_write_reg(info, offset, val); /* syndrome fragment 1 = ecc[5-8B] */ offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[5]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[5]); elm_write_reg(info, offset, val); /* syndrome fragment 2 = ecc[1-4B] */ offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[1]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[1]); elm_write_reg(info, offset, val); /* syndrome fragment 3 = ecc[0B] */ @@ -197,35 +197,35 @@ static void elm_load_syndrome(struct elm_info *info, break; case BCH4_ECC: /* syndrome fragment 0 = ecc[20-52b] bits */ - val = (cpu_to_be32(*(u32 *) &ecc[3]) >> 4) | + val = ((__force u32)cpu_to_be32(*(u32 *)&ecc[3]) >> 4) | ((ecc[2] & 0xf) << 28); elm_write_reg(info, offset, val); /* syndrome fragment 1 = ecc[0-20b] bits */ offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[0]) >> 12; + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[0]) >> 12; elm_write_reg(info, offset, val); break; case BCH16_ECC: - val = cpu_to_be32(*(u32 *) &ecc[22]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[22]); elm_write_reg(info, offset, val); offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[18]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[18]); elm_write_reg(info, offset, val); offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[14]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[14]); elm_write_reg(info, offset, val); offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[10]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[10]); elm_write_reg(info, offset, val); offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[6]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[6]); elm_write_reg(info, offset, val); offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[2]); + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[2]); elm_write_reg(info, offset, val); offset += 4; - val = cpu_to_be32(*(u32 *) &ecc[0]) >> 16; + val = (__force u32)cpu_to_be32(*(u32 *)&ecc[0]) >> 16; elm_write_reg(info, offset, val); break; default: From 7e6b04f9238eab0f684fafd158c1f32ea65b9eaa Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Wed, 5 Jul 2023 09:52:10 +0300 Subject: [PATCH 008/230] mtd: rawnand: meson: fix OOB available bytes for ECC It is incorrect to calculate number of OOB bytes for ECC engine using some "already known" ECC step size (1024 bytes here). Number of such bytes for ECC engine must be whole OOB except 2 bytes for bad block marker, while proper ECC step size and strength will be selected by ECC logic. Fixes: 8fae856c5350 ("mtd: rawnand: meson: add support for Amlogic NAND flash controller") Cc: Signed-off-by: Arseniy Krasnov Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20230705065211.293500-1-AVKrasnov@sberdevices.ru --- drivers/mtd/nand/raw/meson_nand.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index d3faf8086631..b10011dec1e6 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -1278,7 +1278,6 @@ static int meson_nand_attach_chip(struct nand_chip *nand) struct meson_nfc *nfc = nand_get_controller_data(nand); struct meson_nfc_nand_chip *meson_chip = to_meson_nand(nand); struct mtd_info *mtd = nand_to_mtd(nand); - int nsectors = mtd->writesize / 1024; int raw_writesize; int ret; @@ -1304,7 +1303,7 @@ static int meson_nand_attach_chip(struct nand_chip *nand) nand->options |= NAND_NO_SUBPAGE_WRITE; ret = nand_ecc_choose_conf(nand, nfc->data->ecc_caps, - mtd->oobsize - 2 * nsectors); + mtd->oobsize - 2); if (ret) { dev_err(nfc->dev, "failed to ECC init\n"); return -EINVAL; From b1e213a9e31c20206f111ec664afcf31cbfe0dbb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 7 Jul 2023 21:58:45 +0800 Subject: [PATCH 009/230] idmaengine: make FSL_EDMA and INTEL_IDMA64 depends on HAS_IOMEM On s390 systems (aka mainframes), it has classic channel devices for networking and permanent storage that are currently even more common than PCI devices. Hence it could have a fully functional s390 kernel with CONFIG_PCI=n, then the relevant iomem mapping functions [including ioremap(), devm_ioremap(), etc.] are not available. Here let FSL_EDMA and INTEL_IDMA64 depend on HAS_IOMEM so that it won't be built to cause below compiling error if PCI is unset. -------- ERROR: modpost: "devm_platform_ioremap_resource" [drivers/dma/fsl-edma.ko] undefined! ERROR: modpost: "devm_platform_ioremap_resource" [drivers/dma/idma64.ko] undefined! -------- Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306211329.ticOJCSv-lkp@intel.com/ Signed-off-by: Baoquan He Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20230707135852.24292-2-bhe@redhat.com Signed-off-by: Vinod Koul --- drivers/dma/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 644c188d6a11..08fdd0e2ed1b 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -211,6 +211,7 @@ config FSL_DMA config FSL_EDMA tristate "Freescale eDMA engine support" depends on OF + depends on HAS_IOMEM select DMA_ENGINE select DMA_VIRTUAL_CHANNELS help @@ -280,6 +281,7 @@ config IMX_SDMA config INTEL_IDMA64 tristate "Intel integrated DMA 64-bit support" + depends on HAS_IOMEM select DMA_ENGINE select DMA_VIRTUAL_CHANNELS help From a68b48afc050a9456ed4ed19d8755e0f925b44e6 Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Wed, 5 Jul 2023 19:39:12 +0800 Subject: [PATCH 010/230] dmaengine: xilinx: xdma: Fix Judgment of the return value Fix: make IS_ERR() judge the devm_ioremap_resource() function return. Fixes: 17ce252266c7 ("dmaengine: xilinx: xdma: Add xilinx xdma driver") Signed-off-by: Minjie Du Acked-by: Michal Simek Link: https://lore.kernel.org/r/20230705113912.16247-1-duminjie@vivo.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 93ee298d52b8..ad5ff63354cf 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -892,7 +892,7 @@ static int xdma_probe(struct platform_device *pdev) } reg_base = devm_ioremap_resource(&pdev->dev, res); - if (!reg_base) { + if (IS_ERR(reg_base)) { xdma_err(xdev, "ioremap failed"); goto failed; } From d44263222134b5635932974c6177a5cba65a07e8 Mon Sep 17 00:00:00 2001 From: Sergei Antonov Date: Tue, 27 Jun 2023 15:05:49 +0300 Subject: [PATCH 011/230] mmc: moxart: read scr register without changing byte order Conversion from big-endian to native is done in a common function mmc_app_send_scr(). Converting in moxart_transfer_pio() is extra. Double conversion on a LE system returns an incorrect SCR value, leads to errors: mmc0: unrecognised SCR structure version 8 Fixes: 1b66e94e6b99 ("mmc: moxart: Add MOXA ART SD/MMC driver") Signed-off-by: Sergei Antonov Cc: Jonas Jensen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230627120549.2400325-1-saproj@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/moxart-mmc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index 2d002c81dcf3..d0d6ffcf78d4 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -338,13 +338,7 @@ static void moxart_transfer_pio(struct moxart_host *host) return; } for (len = 0; len < remain && len < host->fifo_width;) { - /* SCR data must be read in big endian. */ - if (data->mrq->cmd->opcode == SD_APP_SEND_SCR) - *sgp = ioread32be(host->base + - REG_DATA_WINDOW); - else - *sgp = ioread32(host->base + - REG_DATA_WINDOW); + *sgp = ioread32(host->base + REG_DATA_WINDOW); sgp++; len += 4; } From 5def5c1c15bf22934ee227af85c1716762f3829f Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Fri, 30 Jun 2023 09:45:33 +0900 Subject: [PATCH 012/230] mmc: sdhci-f-sdh30: Replace with sdhci_pltfm Even if sdhci_pltfm_pmops is specified for PM, this driver doesn't apply sdhci_pltfm, so the structure is not correctly referenced in PM functions. This applies sdhci_pltfm to this driver to fix this issue. - Call sdhci_pltfm_init() instead of sdhci_alloc_host() and other functions that covered by sdhci_pltfm. - Move ops and quirks to sdhci_pltfm_data - Replace sdhci_priv() with own private function sdhci_f_sdh30_priv(). Fixes: 87a507459f49 ("mmc: sdhci: host: add new f_sdh30") Signed-off-by: Kunihiko Hayashi Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230630004533.26644-1-hayashi.kunihiko@socionext.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_f_sdh30.c | 60 ++++++++++++++------------------ 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/drivers/mmc/host/sdhci_f_sdh30.c b/drivers/mmc/host/sdhci_f_sdh30.c index a202a69a4b08..b01ffb4d0973 100644 --- a/drivers/mmc/host/sdhci_f_sdh30.c +++ b/drivers/mmc/host/sdhci_f_sdh30.c @@ -29,9 +29,16 @@ struct f_sdhost_priv { bool enable_cmd_dat_delay; }; +static void *sdhci_f_sdhost_priv(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + + return sdhci_pltfm_priv(pltfm_host); +} + static void sdhci_f_sdh30_soft_voltage_switch(struct sdhci_host *host) { - struct f_sdhost_priv *priv = sdhci_priv(host); + struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host); u32 ctrl = 0; usleep_range(2500, 3000); @@ -64,7 +71,7 @@ static unsigned int sdhci_f_sdh30_get_min_clock(struct sdhci_host *host) static void sdhci_f_sdh30_reset(struct sdhci_host *host, u8 mask) { - struct f_sdhost_priv *priv = sdhci_priv(host); + struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host); u32 ctl; if (sdhci_readw(host, SDHCI_CLOCK_CONTROL) == 0) @@ -95,30 +102,32 @@ static const struct sdhci_ops sdhci_f_sdh30_ops = { .set_uhs_signaling = sdhci_set_uhs_signaling, }; +static const struct sdhci_pltfm_data sdhci_f_sdh30_pltfm_data = { + .ops = &sdhci_f_sdh30_ops, + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC + | SDHCI_QUIRK_INVERTED_WRITE_PROTECT, + .quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE + | SDHCI_QUIRK2_TUNING_WORK_AROUND, +}; + static int sdhci_f_sdh30_probe(struct platform_device *pdev) { struct sdhci_host *host; struct device *dev = &pdev->dev; - int irq, ctrl = 0, ret = 0; + int ctrl = 0, ret = 0; struct f_sdhost_priv *priv; + struct sdhci_pltfm_host *pltfm_host; u32 reg = 0; - irq = platform_get_irq(pdev, 0); - if (irq < 0) - return irq; - - host = sdhci_alloc_host(dev, sizeof(struct f_sdhost_priv)); + host = sdhci_pltfm_init(pdev, &sdhci_f_sdh30_pltfm_data, + sizeof(struct f_sdhost_priv)); if (IS_ERR(host)) return PTR_ERR(host); - priv = sdhci_priv(host); + pltfm_host = sdhci_priv(host); + priv = sdhci_pltfm_priv(pltfm_host); priv->dev = dev; - host->quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC | - SDHCI_QUIRK_INVERTED_WRITE_PROTECT; - host->quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE | - SDHCI_QUIRK2_TUNING_WORK_AROUND; - priv->enable_cmd_dat_delay = device_property_read_bool(dev, "fujitsu,cmd-dat-delay-select"); @@ -126,18 +135,6 @@ static int sdhci_f_sdh30_probe(struct platform_device *pdev) if (ret) goto err; - platform_set_drvdata(pdev, host); - - host->hw_name = "f_sdh30"; - host->ops = &sdhci_f_sdh30_ops; - host->irq = irq; - - host->ioaddr = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(host->ioaddr)) { - ret = PTR_ERR(host->ioaddr); - goto err; - } - if (dev_of_node(dev)) { sdhci_get_of_property(pdev); @@ -204,24 +201,21 @@ err_rst: err_clk: clk_disable_unprepare(priv->clk_iface); err: - sdhci_free_host(host); + sdhci_pltfm_free(pdev); + return ret; } static int sdhci_f_sdh30_remove(struct platform_device *pdev) { struct sdhci_host *host = platform_get_drvdata(pdev); - struct f_sdhost_priv *priv = sdhci_priv(host); - - sdhci_remove_host(host, readl(host->ioaddr + SDHCI_INT_STATUS) == - 0xffffffff); + struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host); reset_control_assert(priv->rst); clk_disable_unprepare(priv->clk); clk_disable_unprepare(priv->clk_iface); - sdhci_free_host(host); - platform_set_drvdata(pdev, NULL); + sdhci_pltfm_unregister(pdev); return 0; } From d0ca3b92b7a6f42841ea9da8492aaf649db79780 Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Fri, 14 Jul 2023 17:21:01 +0200 Subject: [PATCH 013/230] mtd: rawnand: rockchip: fix oobfree offset and description Rockchip boot blocks are written per 4 x 512 byte sectors per page. Each page with boot blocks must have a page address (PA) pointer in OOB to the next page. The currently advertised free OOB area starts at offset 6, like if 4 PA bytes were located right after the BBM. This is wrong as the PA bytes are located right before the ECC bytes. Fix the layout by allowing access to all bytes between the BBM and the PA bytes instead of reserving 4 bytes right after the BBM. This change breaks existing jffs2 users. Fixes: 058e0e847d54 ("mtd: rawnand: rockchip: NFC driver for RK3308, RK2928 and others") Signed-off-by: Johan Jonker Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/d202f12d-188c-20e8-f2c2-9cc874ad4d22@gmail.com --- drivers/mtd/nand/raw/rockchip-nand-controller.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/mtd/nand/raw/rockchip-nand-controller.c b/drivers/mtd/nand/raw/rockchip-nand-controller.c index 2312e27362cb..37fc07ba57aa 100644 --- a/drivers/mtd/nand/raw/rockchip-nand-controller.c +++ b/drivers/mtd/nand/raw/rockchip-nand-controller.c @@ -562,9 +562,10 @@ static int rk_nfc_write_page_raw(struct nand_chip *chip, const u8 *buf, * BBM OOB1 OOB2 OOB3 |......| PA0 PA1 PA2 PA3 * * The rk_nfc_ooblayout_free() function already has reserved - * these 4 bytes with: + * these 4 bytes together with 2 bytes for BBM + * by reducing it's length: * - * oob_region->offset = NFC_SYS_DATA_SIZE + 2; + * oob_region->length = rknand->metadata_size - NFC_SYS_DATA_SIZE - 2; */ if (!i) memcpy(rk_nfc_oob_ptr(chip, i), @@ -933,12 +934,8 @@ static int rk_nfc_ooblayout_free(struct mtd_info *mtd, int section, if (section) return -ERANGE; - /* - * The beginning of the OOB area stores the reserved data for the NFC, - * the size of the reserved data is NFC_SYS_DATA_SIZE bytes. - */ oob_region->length = rknand->metadata_size - NFC_SYS_DATA_SIZE - 2; - oob_region->offset = NFC_SYS_DATA_SIZE + 2; + oob_region->offset = 2; return 0; } From ea690ad78dd611e3906df5b948a516000b05c1cb Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Fri, 14 Jul 2023 17:21:21 +0200 Subject: [PATCH 014/230] mtd: rawnand: rockchip: Align hwecc vs. raw page helper layouts Currently, read/write_page_hwecc() and read/write_page_raw() are not aligned: there is a mismatch in the OOB bytes which are not read/written at the same offset in both cases (raw vs. hwecc). This is a real problem when relying on the presence of the Page Addresses (PA) when using the NAND chip as a boot device, as the BootROM expects additional data in the OOB area at specific locations. Rockchip boot blocks are written per 4 x 512 byte sectors per page. Each page with boot blocks must have a page address (PA) pointer in OOB to the next page. Pages are written in a pattern depending on the NAND chip ID. Generate boot block page address and pattern for hwecc in user space and copy PA data to/from the already reserved last 4 bytes before ECC in the chip->oob_poi data layout. Align the different helpers. This change breaks existing jffs2 users. Fixes: 058e0e847d54 ("mtd: rawnand: rockchip: NFC driver for RK3308, RK2928 and others") Signed-off-by: Johan Jonker Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/5e782c08-862b-51ae-47ff-3299940928ca@gmail.com --- .../mtd/nand/raw/rockchip-nand-controller.c | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/mtd/nand/raw/rockchip-nand-controller.c b/drivers/mtd/nand/raw/rockchip-nand-controller.c index 37fc07ba57aa..5a04680342c3 100644 --- a/drivers/mtd/nand/raw/rockchip-nand-controller.c +++ b/drivers/mtd/nand/raw/rockchip-nand-controller.c @@ -598,7 +598,7 @@ static int rk_nfc_write_page_hwecc(struct nand_chip *chip, const u8 *buf, int pages_per_blk = mtd->erasesize / mtd->writesize; int ret = 0, i, boot_rom_mode = 0; dma_addr_t dma_data, dma_oob; - u32 reg; + u32 tmp; u8 *oob; nand_prog_page_begin_op(chip, page, 0, NULL, 0); @@ -625,6 +625,13 @@ static int rk_nfc_write_page_hwecc(struct nand_chip *chip, const u8 *buf, * * 0xFF 0xFF 0xFF 0xFF | BBM OOB1 OOB2 OOB3 | ... * + * The code here just swaps the first 4 bytes with the last + * 4 bytes without losing any data. + * + * The chip->oob_poi data layout: + * + * BBM OOB1 OOB2 OOB3 |......| PA0 PA1 PA2 PA3 + * * Configure the ECC algorithm supported by the boot ROM. */ if ((page < (pages_per_blk * rknand->boot_blks)) && @@ -635,21 +642,17 @@ static int rk_nfc_write_page_hwecc(struct nand_chip *chip, const u8 *buf, } for (i = 0; i < ecc->steps; i++) { - if (!i) { - reg = 0xFFFFFFFF; - } else { + if (!i) + oob = chip->oob_poi + (ecc->steps - 1) * NFC_SYS_DATA_SIZE; + else oob = chip->oob_poi + (i - 1) * NFC_SYS_DATA_SIZE; - reg = oob[0] | oob[1] << 8 | oob[2] << 16 | - oob[3] << 24; - } - if (!i && boot_rom_mode) - reg = (page & (pages_per_blk - 1)) * 4; + tmp = oob[0] | oob[1] << 8 | oob[2] << 16 | oob[3] << 24; if (nfc->cfg->type == NFC_V9) - nfc->oob_buf[i] = reg; + nfc->oob_buf[i] = tmp; else - nfc->oob_buf[i * (oob_step / 4)] = reg; + nfc->oob_buf[i * (oob_step / 4)] = tmp; } dma_data = dma_map_single(nfc->dev, (void *)nfc->page_buf, @@ -812,12 +815,17 @@ static int rk_nfc_read_page_hwecc(struct nand_chip *chip, u8 *buf, int oob_on, goto timeout_err; } - for (i = 1; i < ecc->steps; i++) { - oob = chip->oob_poi + (i - 1) * NFC_SYS_DATA_SIZE; + for (i = 0; i < ecc->steps; i++) { + if (!i) + oob = chip->oob_poi + (ecc->steps - 1) * NFC_SYS_DATA_SIZE; + else + oob = chip->oob_poi + (i - 1) * NFC_SYS_DATA_SIZE; + if (nfc->cfg->type == NFC_V9) tmp = nfc->oob_buf[i]; else tmp = nfc->oob_buf[i * (oob_step / 4)]; + *oob++ = (u8)tmp; *oob++ = (u8)(tmp >> 8); *oob++ = (u8)(tmp >> 16); From 0e52740ffd10c6c316837c6c128f460f1aaba1ea Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 8 Jul 2023 10:21:35 +0200 Subject: [PATCH 015/230] x86/bugs: Increase the x86 bugs vector size to two u32s There was never a doubt in my mind that they would not fit into a single u32 eventually. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/include/asm/cpufeatures.h | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cb8ca46213be..1f6d904c6481 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -14,7 +14,7 @@ * Defines x86 CPU feature bits */ #define NCAPINTS 21 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NBUGINTS 2 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index cb8ca46213be..1f6d904c6481 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -14,7 +14,7 @@ * Defines x86 CPU feature bits */ #define NCAPINTS 21 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NBUGINTS 2 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used From 84f68679032147dcdac9bb4d8eb8f4638e995dc6 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Jul 2023 18:45:37 +0000 Subject: [PATCH 016/230] KVM: arm64: Allow pKVM on v1.0 compatible FF-A implementations pKVM initialization fails on systems with v1.1+ FF-A implementations, as the hyp does a strict match on the returned version from FFA_VERSION. This is a stronger assertion than required by the specification, which requires minor revisions be backwards compatible with earlier revisions of the same major version. Relax the check in hyp_ffa_init() to only test the returned major version. Even though v1.1 broke ABI, the expectation is that firmware incapable of using the v1.0 ABI return NOT_SUPPORTED instead of a valid version. Acked-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20230718184537.3220867-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 58dcd92bf346..ab4f5d160c58 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -705,7 +705,20 @@ int hyp_ffa_init(void *pages) if (res.a0 == FFA_RET_NOT_SUPPORTED) return 0; - if (res.a0 != FFA_VERSION_1_0) + /* + * Firmware returns the maximum supported version of the FF-A + * implementation. Check that the returned version is + * backwards-compatible with the hyp according to the rules in DEN0077A + * v1.1 REL0 13.2.1. + * + * Of course, things are never simple when dealing with firmware. v1.1 + * broke ABI with v1.0 on several structures, which is itself + * incompatible with the aforementioned versioning scheme. The + * expectation is that v1.x implementations that do not support the v1.0 + * ABI return NOT_SUPPORTED rather than a version number, according to + * DEN0077A v1.1 REL0 18.6.4. + */ + if (FFA_MAJOR_VERSION(res.a0) != 1) return -EOPNOTSUPP; arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); From 1eb8d61ac5c9c7ec56bb96d433532807509b9288 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 19 Jul 2023 15:42:50 +0800 Subject: [PATCH 017/230] clk: mediatek: mt8183: Add back SSPM related clocks This reverts commit 860690a93ef23b567f781c1b631623e27190f101. On the MT8183, the SSPM related clocks were removed claiming a lack of usage. This however causes some issues when the driver was converted to the new simple-probe mechanism. This mechanism allocates enough space for all the clocks defined in the clock driver, not the highest index in the DT binding. This leads to out-of-bound writes if their are holes in the DT binding or the driver (due to deprecated or unimplemented clocks). These errors can go unnoticed and cause memory corruption, leading to crashes in unrelated areas, or nothing at all. KASAN will detect them. Add the SSPM related clocks back to the MT8183 clock driver to fully implement the DT binding. The SSPM clocks are for the power management co-processor, and should never be turned off. They are marked as such. Fixes: 3f37ba7cc385 ("clk: mediatek: mt8183: Convert all remaining clocks to common probe") Signed-off-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20230719074251.1219089-1-wenst@chromium.org Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mt8183.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/clk/mediatek/clk-mt8183.c b/drivers/clk/mediatek/clk-mt8183.c index 1ba421b38ec5..e31f94387d87 100644 --- a/drivers/clk/mediatek/clk-mt8183.c +++ b/drivers/clk/mediatek/clk-mt8183.c @@ -328,6 +328,14 @@ static const char * const atb_parents[] = { "syspll_d5" }; +static const char * const sspm_parents[] = { + "clk26m", + "univpll_d2_d4", + "syspll_d2_d2", + "univpll_d2_d2", + "syspll_d3" +}; + static const char * const dpi0_parents[] = { "clk26m", "tvdpll_d2", @@ -507,6 +515,9 @@ static const struct mtk_mux top_muxes[] = { /* CLK_CFG_6 */ MUX_GATE_CLR_SET_UPD(CLK_TOP_MUX_ATB, "atb_sel", atb_parents, 0xa0, 0xa4, 0xa8, 0, 2, 7, 0x004, 24), + MUX_GATE_CLR_SET_UPD_FLAGS(CLK_TOP_MUX_SSPM, "sspm_sel", + sspm_parents, 0xa0, 0xa4, 0xa8, 8, 3, 15, 0x004, 25, + CLK_IS_CRITICAL | CLK_SET_RATE_PARENT), MUX_GATE_CLR_SET_UPD(CLK_TOP_MUX_DPI0, "dpi0_sel", dpi0_parents, 0xa0, 0xa4, 0xa8, 16, 4, 23, 0x004, 26), MUX_GATE_CLR_SET_UPD(CLK_TOP_MUX_SCAM, "scam_sel", @@ -673,10 +684,18 @@ static const struct mtk_gate_regs infra3_cg_regs = { GATE_MTK(_id, _name, _parent, &infra2_cg_regs, _shift, \ &mtk_clk_gate_ops_setclr) +#define GATE_INFRA2_FLAGS(_id, _name, _parent, _shift, _flag) \ + GATE_MTK_FLAGS(_id, _name, _parent, &infra2_cg_regs, \ + _shift, &mtk_clk_gate_ops_setclr, _flag) + #define GATE_INFRA3(_id, _name, _parent, _shift) \ GATE_MTK(_id, _name, _parent, &infra3_cg_regs, _shift, \ &mtk_clk_gate_ops_setclr) +#define GATE_INFRA3_FLAGS(_id, _name, _parent, _shift, _flag) \ + GATE_MTK_FLAGS(_id, _name, _parent, &infra3_cg_regs, \ + _shift, &mtk_clk_gate_ops_setclr, _flag) + static const struct mtk_gate infra_clks[] = { /* INFRA0 */ GATE_INFRA0(CLK_INFRA_PMIC_TMR, "infra_pmic_tmr", "axi_sel", 0), @@ -748,7 +767,11 @@ static const struct mtk_gate infra_clks[] = { GATE_INFRA2(CLK_INFRA_UNIPRO_TICK, "infra_unipro_tick", "fufs_sel", 12), GATE_INFRA2(CLK_INFRA_UFS_MP_SAP_BCLK, "infra_ufs_mp_sap_bck", "fufs_sel", 13), GATE_INFRA2(CLK_INFRA_MD32_BCLK, "infra_md32_bclk", "axi_sel", 14), + /* infra_sspm is main clock in co-processor, should not be closed in Linux. */ + GATE_INFRA2_FLAGS(CLK_INFRA_SSPM, "infra_sspm", "sspm_sel", 15, CLK_IS_CRITICAL), GATE_INFRA2(CLK_INFRA_UNIPRO_MBIST, "infra_unipro_mbist", "axi_sel", 16), + /* infra_sspm_bus_hclk is main clock in co-processor, should not be closed in Linux. */ + GATE_INFRA2_FLAGS(CLK_INFRA_SSPM_BUS_HCLK, "infra_sspm_bus_hclk", "axi_sel", 17, CLK_IS_CRITICAL), GATE_INFRA2(CLK_INFRA_I2C5, "infra_i2c5", "i2c_sel", 18), GATE_INFRA2(CLK_INFRA_I2C5_ARBITER, "infra_i2c5_arbiter", "i2c_sel", 19), GATE_INFRA2(CLK_INFRA_I2C5_IMM, "infra_i2c5_imm", "i2c_sel", 20), @@ -766,6 +789,10 @@ static const struct mtk_gate infra_clks[] = { GATE_INFRA3(CLK_INFRA_MSDC0_SELF, "infra_msdc0_self", "msdc50_0_sel", 0), GATE_INFRA3(CLK_INFRA_MSDC1_SELF, "infra_msdc1_self", "msdc50_0_sel", 1), GATE_INFRA3(CLK_INFRA_MSDC2_SELF, "infra_msdc2_self", "msdc50_0_sel", 2), + /* infra_sspm_26m_self is main clock in co-processor, should not be closed in Linux. */ + GATE_INFRA3_FLAGS(CLK_INFRA_SSPM_26M_SELF, "infra_sspm_26m_self", "f_f26m_ck", 3, CLK_IS_CRITICAL), + /* infra_sspm_32k_self is main clock in co-processor, should not be closed in Linux. */ + GATE_INFRA3_FLAGS(CLK_INFRA_SSPM_32K_SELF, "infra_sspm_32k_self", "f_f26m_ck", 4, CLK_IS_CRITICAL), GATE_INFRA3(CLK_INFRA_UFS_AXI, "infra_ufs_axi", "axi_sel", 5), GATE_INFRA3(CLK_INFRA_I2C6, "infra_i2c6", "i2c_sel", 6), GATE_INFRA3(CLK_INFRA_AP_MSDC0, "infra_ap_msdc0", "msdc50_hclk_sel", 7), From a29b2fccf5f2689a9637be85ff1f51c834c6fb33 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Jul 2023 17:08:12 +0200 Subject: [PATCH 018/230] clk: imx93: Propagate correct error in imx93_clocks_probe() smatch reports: drivers/clk/imx/clk-imx93.c:294 imx93_clocks_probe() error: uninitialized symbol 'base'. Indeed, in case of an error, the wrong (yet uninitialized) variable is converted to an error code and returned. Fix this by propagating the error code in the correct variable. Fixes: e02ba11b45764705 ("clk: imx93: fix memory leak and missing unwind goto in imx93_clocks_probe") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/9c2acd81-3ad8-485d-819e-9e4201277831@kadam.mountain Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202306161533.4YDmL22b-lkp@intel.com/ Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230711150812.3562221-1-geert+renesas@glider.be Reviewed-by: Peng Fan Signed-off-by: Stephen Boyd --- drivers/clk/imx/clk-imx93.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/imx/clk-imx93.c b/drivers/clk/imx/clk-imx93.c index b6c7c2725906..44f435103c65 100644 --- a/drivers/clk/imx/clk-imx93.c +++ b/drivers/clk/imx/clk-imx93.c @@ -291,7 +291,7 @@ static int imx93_clocks_probe(struct platform_device *pdev) anatop_base = devm_of_iomap(dev, np, 0, NULL); of_node_put(np); if (WARN_ON(IS_ERR(anatop_base))) { - ret = PTR_ERR(base); + ret = PTR_ERR(anatop_base); goto unregister_hws; } From e7dd44f4f3166db45248414f5df8f615392de47a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 7 Jul 2023 21:58:51 +0800 Subject: [PATCH 019/230] clk: fixed-mmio: make COMMON_CLK_FIXED_MMIO depend on HAS_IOMEM On s390 systems (aka mainframes), it has classic channel devices for networking and permanent storage that are currently even more common than PCI devices. Hence it could have a fully functional s390 kernel with CONFIG_PCI=n, then the relevant iomem mapping functions [including ioremap(), devm_ioremap(), etc.] are not available. Here let COMMON_CLK_FIXED_MMIO depend on HAS_IOMEM so that it won't be built to cause below compiling error if PCI is unset: ------ ld: drivers/clk/clk-fixed-mmio.o: in function `fixed_mmio_clk_setup': clk-fixed-mmio.c:(.text+0x5e): undefined reference to `of_iomap' ld: clk-fixed-mmio.c:(.text+0xba): undefined reference to `iounmap' ------ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306211329.ticOJCSv-lkp@intel.com/ Signed-off-by: Baoquan He Cc: Michael Turquette Cc: Stephen Boyd Cc: linux-clk@vger.kernel.org Link: https://lore.kernel.org/r/20230707135852.24292-8-bhe@redhat.com Signed-off-by: Stephen Boyd --- drivers/clk/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 93f38a8178ba..6b3b424addab 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -444,6 +444,7 @@ config COMMON_CLK_BD718XX config COMMON_CLK_FIXED_MMIO bool "Clock driver for Memory Mapped Fixed values" depends on COMMON_CLK && OF + depends on HAS_IOMEM help Support for Memory Mapped IO Fixed clocks From 8974eb588283b7d44a7c91fa09fcbaf380339f3a Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 12 Jul 2023 19:43:11 -0700 Subject: [PATCH 020/230] x86/speculation: Add Gather Data Sampling mitigation Gather Data Sampling (GDS) is a hardware vulnerability which allows unprivileged speculative access to data which was previously stored in vector registers. Intel processors that support AVX2 and AVX512 have gather instructions that fetch non-contiguous data elements from memory. On vulnerable hardware, when a gather instruction is transiently executed and encounters a fault, stale data from architectural or internal vector registers may get transiently stored to the destination vector register allowing an attacker to infer the stale data using typical side channel techniques like cache timing attacks. This mitigation is different from many earlier ones for two reasons. First, it is enabled by default and a bit must be set to *DISABLE* it. This is the opposite of normal mitigation polarity. This means GDS can be mitigated simply by updating microcode and leaving the new control bit alone. Second, GDS has a "lock" bit. This lock bit is there because the mitigation affects the hardware security features KeyLocker and SGX. It needs to be enabled and *STAY* enabled for these features to be mitigated against GDS. The mitigation is enabled in the microcode by default. Disable it by setting gather_data_sampling=off or by disabling all mitigations with mitigations=off. The mitigation status can be checked by reading: /sys/devices/system/cpu/vulnerabilities/gather_data_sampling Signed-off-by: Daniel Sneddon Signed-off-by: Dave Hansen Acked-by: Josh Poimboeuf --- .../ABI/testing/sysfs-devices-system-cpu | 15 +- .../hw-vuln/gather_data_sampling.rst | 99 ++++++++++++++ Documentation/admin-guide/hw-vuln/index.rst | 1 + .../admin-guide/kernel-parameters.txt | 41 ++++-- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 11 ++ arch/x86/kernel/cpu/bugs.c | 129 ++++++++++++++++++ arch/x86/kernel/cpu/common.c | 34 +++-- arch/x86/kernel/cpu/cpu.h | 1 + drivers/base/cpu.c | 8 ++ 10 files changed, 311 insertions(+), 29 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/gather_data_sampling.rst diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index ecd585ca2d50..77942eedf4f6 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -513,17 +513,18 @@ Description: information about CPUs heterogeneity. cpu_capacity: capacity of cpuX. What: /sys/devices/system/cpu/vulnerabilities - /sys/devices/system/cpu/vulnerabilities/meltdown - /sys/devices/system/cpu/vulnerabilities/spectre_v1 - /sys/devices/system/cpu/vulnerabilities/spectre_v2 - /sys/devices/system/cpu/vulnerabilities/spec_store_bypass + /sys/devices/system/cpu/vulnerabilities/gather_data_sampling + /sys/devices/system/cpu/vulnerabilities/itlb_multihit /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds - /sys/devices/system/cpu/vulnerabilities/srbds - /sys/devices/system/cpu/vulnerabilities/tsx_async_abort - /sys/devices/system/cpu/vulnerabilities/itlb_multihit + /sys/devices/system/cpu/vulnerabilities/meltdown /sys/devices/system/cpu/vulnerabilities/mmio_stale_data /sys/devices/system/cpu/vulnerabilities/retbleed + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass + /sys/devices/system/cpu/vulnerabilities/spectre_v1 + /sys/devices/system/cpu/vulnerabilities/spectre_v2 + /sys/devices/system/cpu/vulnerabilities/srbds + /sys/devices/system/cpu/vulnerabilities/tsx_async_abort Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst new file mode 100644 index 000000000000..74dab6af7fe1 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst @@ -0,0 +1,99 @@ +.. SPDX-License-Identifier: GPL-2.0 + +GDS - Gather Data Sampling +========================== + +Gather Data Sampling is a hardware vulnerability which allows unprivileged +speculative access to data which was previously stored in vector registers. + +Problem +------- +When a gather instruction performs loads from memory, different data elements +are merged into the destination vector register. However, when a gather +instruction that is transiently executed encounters a fault, stale data from +architectural or internal vector registers may get transiently forwarded to the +destination vector register instead. This will allow a malicious attacker to +infer stale data using typical side channel techniques like cache timing +attacks. GDS is a purely sampling-based attack. + +The attacker uses gather instructions to infer the stale vector register data. +The victim does not need to do anything special other than use the vector +registers. The victim does not need to use gather instructions to be +vulnerable. + +Because the buffers are shared between Hyper-Threads cross Hyper-Thread attacks +are possible. + +Attack scenarios +---------------- +Without mitigation, GDS can infer stale data across virtually all +permission boundaries: + + Non-enclaves can infer SGX enclave data + Userspace can infer kernel data + Guests can infer data from hosts + Guest can infer guest from other guests + Users can infer data from other users + +Because of this, it is important to ensure that the mitigation stays enabled in +lower-privilege contexts like guests and when running outside SGX enclaves. + +The hardware enforces the mitigation for SGX. Likewise, VMMs should ensure +that guests are not allowed to disable the GDS mitigation. If a host erred and +allowed this, a guest could theoretically disable GDS mitigation, mount an +attack, and re-enable it. + +Mitigation mechanism +-------------------- +This issue is mitigated in microcode. The microcode defines the following new +bits: + + ================================ === ============================ + IA32_ARCH_CAPABILITIES[GDS_CTRL] R/O Enumerates GDS vulnerability + and mitigation support. + IA32_ARCH_CAPABILITIES[GDS_NO] R/O Processor is not vulnerable. + IA32_MCU_OPT_CTRL[GDS_MITG_DIS] R/W Disables the mitigation + 0 by default. + IA32_MCU_OPT_CTRL[GDS_MITG_LOCK] R/W Locks GDS_MITG_DIS=0. Writes + to GDS_MITG_DIS are ignored + Can't be cleared once set. + ================================ === ============================ + +GDS can also be mitigated on systems that don't have updated microcode by +disabling AVX. This can be done by setting "clearcpuid=avx" on the kernel +command-line. + +Mitigation control on the kernel command line +--------------------------------------------- +The mitigation can be disabled by setting "gather_data_sampling=off" or +"mitigations=off" on the kernel command line. Not specifying either will +default to the mitigation being enabled. + +GDS System Information +------------------------ +The kernel provides vulnerability status information through sysfs. For +GDS this can be accessed by the following sysfs file: + +/sys/devices/system/cpu/vulnerabilities/gather_data_sampling + +The possible values contained in this file are: + + ============================== ============================================= + Not affected Processor not vulnerable. + Vulnerable Processor vulnerable and mitigation disabled. + Vulnerable: No microcode Processor vulnerable and microcode is missing + mitigation. + Mitigation: Microcode Processor is vulnerable and mitigation is in + effect. + Mitigation: Microcode (locked) Processor is vulnerable and mitigation is in + effect and cannot be disabled. + Unknown: Dependent on + hypervisor status Running on a virtual guest processor that is + affected but with no way to know if host + processor is mitigated or vulnerable. + ============================== ============================================= + +GDS Default mitigation +---------------------- +The updated microcode will enable the mitigation by default. The kernel's +default action is to leave the mitigation enabled. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index e0614760a99e..436fac0bd9c3 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -19,3 +19,4 @@ are configurable at compile, boot or run time. l1d_flush.rst processor_mmio_stale_data.rst cross-thread-rsb.rst + gather_data_sampling.rst diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..c21d42140d6b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1623,6 +1623,20 @@ Format: off | on default: on + gather_data_sampling= + [X86,INTEL] Control the Gather Data Sampling (GDS) + mitigation. + + Gather Data Sampling is a hardware vulnerability which + allows unprivileged speculative access to data which was + previously stored in vector registers. + + This issue is mitigated by default in updated microcode. + The mitigation may have a performance impact but can be + disabled. + + off: Disable GDS mitigation. + gcov_persist= [GCOV] When non-zero (default), profiling data for kernel modules is saved and remains accessible via debugfs, even when the module is unloaded/reloaded. @@ -3273,24 +3287,25 @@ Disable all optional CPU mitigations. This improves system performance, but it may also expose users to several CPU vulnerabilities. - Equivalent to: nopti [X86,PPC] - if nokaslr then kpti=0 [ARM64] - nospectre_v1 [X86,PPC] - nobp=0 [S390] - nospectre_v2 [X86,PPC,S390,ARM64] - spectre_v2_user=off [X86] - spec_store_bypass_disable=off [X86,PPC] - ssbd=force-off [ARM64] - nospectre_bhb [ARM64] + Equivalent to: if nokaslr then kpti=0 [ARM64] + gather_data_sampling=off [X86] + kvm.nx_huge_pages=off [X86] l1tf=off [X86] mds=off [X86] - tsx_async_abort=off [X86] - kvm.nx_huge_pages=off [X86] - srbds=off [X86,INTEL] + mmio_stale_data=off [X86] no_entry_flush [PPC] no_uaccess_flush [PPC] - mmio_stale_data=off [X86] + nobp=0 [S390] + nopti [X86,PPC] + nospectre_bhb [ARM64] + nospectre_v1 [X86,PPC] + nospectre_v2 [X86,PPC,S390,ARM64] retbleed=off [X86] + spec_store_bypass_disable=off [X86,PPC] + spectre_v2_user=off [X86] + srbds=off [X86,INTEL] + ssbd=force-off [ARM64] + tsx_async_abort=off [X86] Exceptions: This does not have any effect on diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cb8ca46213be..8d6b34726033 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -483,5 +483,6 @@ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3aedae61af4f..348f3da8b0f6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -155,6 +155,15 @@ * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. */ +#define ARCH_CAP_GDS_CTRL BIT(25) /* + * CPU is vulnerable to Gather + * Data Sampling (GDS) and + * has controls for mitigation. + */ +#define ARCH_CAP_GDS_NO BIT(26) /* + * CPU is not vulnerable to Gather + * Data Sampling (GDS). + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR @@ -178,6 +187,8 @@ #define RNGDS_MITG_DIS BIT(0) /* SRBDS support */ #define RTM_ALLOW BIT(1) /* TSX development mode */ #define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */ +#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */ +#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */ #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9e2a91830f72..7824b101ddfe 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -47,6 +47,7 @@ static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); +static void __init gds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -160,6 +161,7 @@ void __init cpu_select_mitigations(void) md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + gds_select_mitigation(); } /* @@ -645,6 +647,120 @@ static int __init l1d_flush_parse_cmdline(char *str) } early_param("l1d_flush", l1d_flush_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "GDS: " fmt + +enum gds_mitigations { + GDS_MITIGATION_OFF, + GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FULL, + GDS_MITIGATION_FULL_LOCKED, + GDS_MITIGATION_HYPERVISOR, +}; + +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; + +static const char * const gds_strings[] = { + [GDS_MITIGATION_OFF] = "Vulnerable", + [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FULL] = "Mitigation: Microcode", + [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", + [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +void update_gds_msr(void) +{ + u64 mcu_ctrl_after; + u64 mcu_ctrl; + + switch (gds_mitigation) { + case GDS_MITIGATION_OFF: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl |= GDS_MITG_DIS; + break; + case GDS_MITIGATION_FULL_LOCKED: + /* + * The LOCKED state comes from the boot CPU. APs might not have + * the same state. Make sure the mitigation is enabled on all + * CPUs. + */ + case GDS_MITIGATION_FULL: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl &= ~GDS_MITG_DIS; + break; + case GDS_MITIGATION_UCODE_NEEDED: + case GDS_MITIGATION_HYPERVISOR: + return; + }; + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + /* + * Check to make sure that the WRMSR value was not ignored. Writes to + * GDS_MITG_DIS will be ignored if this processor is locked but the boot + * processor was not. + */ + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); +} + +static void __init gds_select_mitigation(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + gds_mitigation = GDS_MITIGATION_HYPERVISOR; + goto out; + } + + if (cpu_mitigations_off()) + gds_mitigation = GDS_MITIGATION_OFF; + /* Will verify below that mitigation _can_ be disabled */ + + /* No microcode */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + goto out; + } + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + if (mcu_ctrl & GDS_MITG_LOCKED) { + if (gds_mitigation == GDS_MITIGATION_OFF) + pr_warn("Mitigation locked. Disable failed.\n"); + + /* + * The mitigation is selected from the boot CPU. All other CPUs + * _should_ have the same state. If the boot CPU isn't locked + * but others are then update_gds_msr() will WARN() of the state + * mismatch. If the boot CPU is locked update_gds_msr() will + * ensure the other CPUs have the mitigation enabled. + */ + gds_mitigation = GDS_MITIGATION_FULL_LOCKED; + } + + update_gds_msr(); +out: + pr_info("%s\n", gds_strings[gds_mitigation]); +} + +static int __init gds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return 0; + + if (!strcmp(str, "off")) + gds_mitigation = GDS_MITIGATION_OFF; + + return 0; +} +early_param("gather_data_sampling", gds_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -2382,6 +2498,11 @@ static ssize_t retbleed_show_state(char *buf) return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } +static ssize_t gds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2431,6 +2552,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RETBLEED: return retbleed_show_state(buf); + case X86_BUG_GDS: + return gds_show_state(buf); + default: break; } @@ -2495,4 +2619,9 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha { return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); } + +ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_GDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52683fddafaf..53224fe3ca6f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1250,6 +1250,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RETBLEED BIT(3) /* CPU is affected by SMT (cross-thread) return predictions */ #define SMT_RSB BIT(4) +/* CPU is affected by GDS */ +#define GDS BIT(5) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1262,19 +1264,21 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), @@ -1406,6 +1410,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); + /* + * Check if CPU is vulnerable to GDS. If running in a virtual machine on + * an affected processor, the VMM may have disabled the use of GATHER by + * disabling AVX2. The only way to do this in HW is to clear XCR0[2], + * which means that AVX will be disabled. + */ + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + boot_cpu_has(X86_FEATURE_AVX)) + setup_force_cpu_bug(X86_BUG_GDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1962,6 +1976,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); + if (boot_cpu_has_bug(X86_BUG_GDS)) + update_gds_msr(); tsx_ap_init(); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1c44630d4789..1dcd7d4e38ef 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -83,6 +83,7 @@ void cpu_select_mitigations(void); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); +extern void update_gds_msr(void); extern enum spectre_v2_mitigation spectre_v2_enabled; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index c1815b9dae68..0469c09e8a8c 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -577,6 +577,12 @@ ssize_t __weak cpu_show_retbleed(struct device *dev, return sysfs_emit(buf, "Not affected\n"); } +ssize_t __weak cpu_show_gds(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -588,6 +594,7 @@ static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); +static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -601,6 +608,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_srbds.attr, &dev_attr_mmio_stale_data.attr, &dev_attr_retbleed.attr, + &dev_attr_gather_data_sampling.attr, NULL }; From c718ca0e99401d80d2480c08e1b02cf5f7cd7033 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Wed, 19 Jul 2023 17:54:00 +0000 Subject: [PATCH 021/230] KVM: arm64: Fix hardware enable/disable flows for pKVM When running in protected mode, the hyp stub is disabled after pKVM is initialized, meaning the host cannot enable/disable the hyp at runtime. As such, kvm_arm_hardware_enabled is always 1 after initialization, and kvm_arch_hardware_enable() never enables the vgic maintenance irq or timer irqs. Unconditionally enable/disable the vgic + timer irqs in the respective calls, instead relying on the percpu bookkeeping in the generic code to keep track of which cpus have the interrupts unmasked. Fixes: 466d27e48d7c ("KVM: arm64: Simplify the CPUHP logic") Reported-by: Oliver Upton Suggested-by: Oliver Upton Signed-off-by: Raghavendra Rao Ananta Link: https://lore.kernel.org/r/20230719175400.647154-1-rananta@google.com Acked-by: Marc Zyngier Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 72dc53a75d1c..d8540563a623 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1874,8 +1874,6 @@ static void _kvm_arch_hardware_enable(void *discard) int kvm_arch_hardware_enable(void) { - int was_enabled; - /* * Most calls to this function are made with migration * disabled, but not with preemption disabled. The former is @@ -1884,13 +1882,10 @@ int kvm_arch_hardware_enable(void) */ preempt_disable(); - was_enabled = __this_cpu_read(kvm_arm_hardware_enabled); _kvm_arch_hardware_enable(NULL); - if (!was_enabled) { - kvm_vgic_cpu_up(); - kvm_timer_cpu_up(); - } + kvm_vgic_cpu_up(); + kvm_timer_cpu_up(); preempt_enable(); @@ -1907,10 +1902,8 @@ static void _kvm_arch_hardware_disable(void *discard) void kvm_arch_hardware_disable(void) { - if (__this_cpu_read(kvm_arm_hardware_enabled)) { - kvm_timer_cpu_down(); - kvm_vgic_cpu_down(); - } + kvm_timer_cpu_down(); + kvm_vgic_cpu_down(); if (!is_protected_kvm_enabled()) _kvm_arch_hardware_disable(NULL); From 733c758e509b86a5d38b9af927817258b88ededd Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Jul 2023 23:18:55 +0000 Subject: [PATCH 022/230] KVM: arm64: Rephrase percpu enable/disable tracking in terms of hyp kvm_arm_hardware_enabled is rather misleading, since it doesn't track the state of all hardware resources needed for running a VM. What it actually tracks is whether or not the hyp cpu context has been initialized. Since we're now at the point where vgic + timer irq management has been separated from kvm_arm_hardware_enabled, rephrase it (and the associated helpers) to make it clear what state is being tracked. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230719231855.262973-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 46 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d8540563a623..d1cb298a58a0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -55,7 +55,7 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; -static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); +static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); bool is_kvm_arm_initialised(void) @@ -1864,11 +1864,19 @@ static void cpu_hyp_reinit(void) cpu_hyp_init_features(); } -static void _kvm_arch_hardware_enable(void *discard) +static void cpu_hyp_init(void *discard) { - if (!__this_cpu_read(kvm_arm_hardware_enabled)) { + if (!__this_cpu_read(kvm_hyp_initialized)) { cpu_hyp_reinit(); - __this_cpu_write(kvm_arm_hardware_enabled, 1); + __this_cpu_write(kvm_hyp_initialized, 1); + } +} + +static void cpu_hyp_uninit(void *discard) +{ + if (__this_cpu_read(kvm_hyp_initialized)) { + cpu_hyp_reset(); + __this_cpu_write(kvm_hyp_initialized, 0); } } @@ -1882,7 +1890,7 @@ int kvm_arch_hardware_enable(void) */ preempt_disable(); - _kvm_arch_hardware_enable(NULL); + cpu_hyp_init(NULL); kvm_vgic_cpu_up(); kvm_timer_cpu_up(); @@ -1892,21 +1900,13 @@ int kvm_arch_hardware_enable(void) return 0; } -static void _kvm_arch_hardware_disable(void *discard) -{ - if (__this_cpu_read(kvm_arm_hardware_enabled)) { - cpu_hyp_reset(); - __this_cpu_write(kvm_arm_hardware_enabled, 0); - } -} - void kvm_arch_hardware_disable(void) { kvm_timer_cpu_down(); kvm_vgic_cpu_down(); if (!is_protected_kvm_enabled()) - _kvm_arch_hardware_disable(NULL); + cpu_hyp_uninit(NULL); } #ifdef CONFIG_CPU_PM @@ -1915,16 +1915,16 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, void *v) { /* - * kvm_arm_hardware_enabled is left with its old value over + * kvm_hyp_initialized is left with its old value over * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should * re-enable hyp. */ switch (cmd) { case CPU_PM_ENTER: - if (__this_cpu_read(kvm_arm_hardware_enabled)) + if (__this_cpu_read(kvm_hyp_initialized)) /* - * don't update kvm_arm_hardware_enabled here - * so that the hardware will be re-enabled + * don't update kvm_hyp_initialized here + * so that the hyp will be re-enabled * when we resume. See below. */ cpu_hyp_reset(); @@ -1932,8 +1932,8 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, return NOTIFY_OK; case CPU_PM_ENTER_FAILED: case CPU_PM_EXIT: - if (__this_cpu_read(kvm_arm_hardware_enabled)) - /* The hardware was enabled before suspend. */ + if (__this_cpu_read(kvm_hyp_initialized)) + /* The hyp was enabled before suspend. */ cpu_hyp_reinit(); return NOTIFY_OK; @@ -2014,7 +2014,7 @@ static int __init init_subsystems(void) /* * Enable hardware so that subsystem initialisation can access EL2. */ - on_each_cpu(_kvm_arch_hardware_enable, NULL, 1); + on_each_cpu(cpu_hyp_init, NULL, 1); /* * Register CPU lower-power notifier @@ -2052,7 +2052,7 @@ out: hyp_cpu_pm_exit(); if (err || !is_protected_kvm_enabled()) - on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); + on_each_cpu(cpu_hyp_uninit, NULL, 1); return err; } @@ -2090,7 +2090,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits) * The stub hypercalls are now disabled, so set our local flag to * prevent a later re-init attempt in kvm_arch_hardware_enable(). */ - __this_cpu_write(kvm_arm_hardware_enabled, 1); + __this_cpu_write(kvm_hyp_initialized, 1); preempt_enable(); return ret; From 553a5c03e90a6087e88f8ff878335ef0621536fb Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 12 Jul 2023 19:43:12 -0700 Subject: [PATCH 023/230] x86/speculation: Add force option to GDS mitigation The Gather Data Sampling (GDS) vulnerability allows malicious software to infer stale data previously stored in vector registers. This may include sensitive data such as cryptographic keys. GDS is mitigated in microcode, and systems with up-to-date microcode are protected by default. However, any affected system that is running with older microcode will still be vulnerable to GDS attacks. Since the gather instructions used by the attacker are part of the AVX2 and AVX512 extensions, disabling these extensions prevents gather instructions from being executed, thereby mitigating the system from GDS. Disabling AVX2 is sufficient, but we don't have the granularity to do this. The XCR0[2] disables AVX, with no option to just disable AVX2. Add a kernel parameter gather_data_sampling=force that will enable the microcode mitigation if available, otherwise it will disable AVX on affected systems. This option will be ignored if cmdline mitigations=off. This is a *big* hammer. It is known to break buggy userspace that uses incomplete, buggy AVX enumeration. Unfortunately, such userspace does exist in the wild: https://www.mail-archive.com/bug-coreutils@gnu.org/msg33046.html [ dhansen: add some more ominous warnings about disabling AVX ] Signed-off-by: Daniel Sneddon Signed-off-by: Dave Hansen Acked-by: Josh Poimboeuf --- .../hw-vuln/gather_data_sampling.rst | 18 +++++++++++++---- .../admin-guide/kernel-parameters.txt | 8 +++++++- arch/x86/kernel/cpu/bugs.c | 20 ++++++++++++++++++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst index 74dab6af7fe1..40b7a6260010 100644 --- a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst +++ b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst @@ -60,14 +60,21 @@ bits: ================================ === ============================ GDS can also be mitigated on systems that don't have updated microcode by -disabling AVX. This can be done by setting "clearcpuid=avx" on the kernel -command-line. +disabling AVX. This can be done by setting gather_data_sampling="force" or +"clearcpuid=avx" on the kernel command-line. + +If used, these options will disable AVX use by turning on XSAVE YMM support. +However, the processor will still enumerate AVX support. Userspace that +does not follow proper AVX enumeration to check both AVX *and* XSAVE YMM +support will break. Mitigation control on the kernel command line --------------------------------------------- The mitigation can be disabled by setting "gather_data_sampling=off" or -"mitigations=off" on the kernel command line. Not specifying either will -default to the mitigation being enabled. +"mitigations=off" on the kernel command line. Not specifying either will default +to the mitigation being enabled. Specifying "gather_data_sampling=force" will +use the microcode mitigation when available or disable AVX on affected systems +where the microcode hasn't been updated to include the mitigation. GDS System Information ------------------------ @@ -83,6 +90,9 @@ The possible values contained in this file are: Vulnerable Processor vulnerable and mitigation disabled. Vulnerable: No microcode Processor vulnerable and microcode is missing mitigation. + Mitigation: AVX disabled, + no microcode Processor is vulnerable and microcode is missing + mitigation. AVX disabled as mitigation. Mitigation: Microcode Processor is vulnerable and mitigation is in effect. Mitigation: Microcode (locked) Processor is vulnerable and mitigation is in diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c21d42140d6b..816b966bed0f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1633,7 +1633,13 @@ This issue is mitigated by default in updated microcode. The mitigation may have a performance impact but can be - disabled. + disabled. On systems without the microcode mitigation + disabling AVX serves as a mitigation. + + force: Disable AVX to mitigate systems without + microcode mitigation. No effect if the microcode + mitigation is present. Known to cause crashes in + userspace with buggy AVX enumeration. off: Disable GDS mitigation. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7824b101ddfe..155e8d1c325e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -653,6 +653,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum gds_mitigations { GDS_MITIGATION_OFF, GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, GDS_MITIGATION_FULL_LOCKED, GDS_MITIGATION_HYPERVISOR, @@ -663,6 +664,7 @@ static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", [GDS_MITIGATION_FULL] = "Mitigation: Microcode", [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", @@ -688,6 +690,7 @@ void update_gds_msr(void) rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; + case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; @@ -722,10 +725,23 @@ static void __init gds_select_mitigation(void) /* No microcode */ if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { - gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } else { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + } goto out; } + /* Microcode has mitigation, use it */ + if (gds_mitigation == GDS_MITIGATION_FORCE) + gds_mitigation = GDS_MITIGATION_FULL; + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) @@ -756,6 +772,8 @@ static int __init gds_parse_cmdline(char *str) if (!strcmp(str, "off")) gds_mitigation = GDS_MITIGATION_OFF; + else if (!strcmp(str, "force")) + gds_mitigation = GDS_MITIGATION_FORCE; return 0; } From 53cf5797f114ba2bd86d23a862302119848eff19 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 12 Jul 2023 19:43:13 -0700 Subject: [PATCH 024/230] x86/speculation: Add Kconfig option for GDS Gather Data Sampling (GDS) is mitigated in microcode. However, on systems that haven't received the updated microcode, disabling AVX can act as a mitigation. Add a Kconfig option that uses the microcode mitigation if available and disables AVX otherwise. Setting this option has no effect on systems not affected by GDS. This is the equivalent of setting gather_data_sampling=force. Signed-off-by: Daniel Sneddon Signed-off-by: Dave Hansen Acked-by: Josh Poimboeuf --- arch/x86/Kconfig | 19 +++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 4 ++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..36d3f61c4ed9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2603,6 +2603,25 @@ config SLS against straight line speculation. The kernel image might be slightly larger. +config GDS_FORCE_MITIGATION + bool "Force GDS Mitigation" + depends on CPU_SUP_INTEL + default n + help + Gather Data Sampling (GDS) is a hardware vulnerability which allows + unprivileged speculative access to data which was previously stored in + vector registers. + + This option is equivalent to setting gather_data_sampling=force on the + command line. The microcode mitigation is used if present, otherwise + AVX is disabled as a mitigation. On affected systems that are missing + the microcode any userspace code that unconditionally uses AVX will + break with this option set. + + Setting this option on systems not vulnerable to GDS has no effect. + + If in doubt, say N. + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 155e8d1c325e..41821a4551a2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -659,7 +659,11 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; +#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; +#else static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; +#endif static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", From 81ac7e5d741742d650b4ed6186c4826c1a0631a7 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 12 Jul 2023 19:43:14 -0700 Subject: [PATCH 025/230] KVM: Add GDS_NO support to KVM Gather Data Sampling (GDS) is a transient execution attack using gather instructions from the AVX2 and AVX512 extensions. This attack allows malicious code to infer data that was previously stored in vector registers. Systems that are not vulnerable to GDS will set the GDS_NO bit of the IA32_ARCH_CAPABILITIES MSR. This is useful for VM guests that may think they are on vulnerable systems that are, in fact, not affected. Guests that are running on affected hosts where the mitigation is enabled are protected as if they were running on an unaffected system. On all hosts that are not affected or that are mitigated, set the GDS_NO bit. Signed-off-by: Daniel Sneddon Signed-off-by: Dave Hansen Acked-by: Josh Poimboeuf --- arch/x86/kernel/cpu/bugs.c | 7 +++++++ arch/x86/kvm/x86.c | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 41821a4551a2..7985c658d129 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -674,6 +674,13 @@ static const char * const gds_strings[] = { [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", }; +bool gds_ucode_mitigated(void) +{ + return (gds_mitigation == GDS_MITIGATION_FULL || + gds_mitigation == GDS_MITIGATION_FULL_LOCKED); +} +EXPORT_SYMBOL_GPL(gds_ucode_mitigated); + void update_gds_msr(void) { u64 mcu_ctrl_after; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6b9bea62fb8..ad09c8540189 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -314,6 +314,8 @@ u64 __read_mostly host_xcr0; static struct kmem_cache *x86_emulator_cache; +extern bool gds_ucode_mitigated(void); + /* * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. @@ -1607,7 +1609,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ - ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1664,6 +1666,9 @@ static u64 kvm_get_arch_capabilities(void) */ } + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) + data |= ARCH_CAP_GDS_NO; + return data; } From d5ace2a776442d80674eff9ed42e737f7dd95056 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 21 Jul 2023 21:51:16 -0700 Subject: [PATCH 026/230] x86/hyperv: Disable IBT when hypercall page lacks ENDBR instruction On hardware that supports Indirect Branch Tracking (IBT), Hyper-V VMs with ConfigVersion 9.3 or later support IBT in the guest. However, current versions of Hyper-V have a bug in that there's not an ENDBR64 instruction at the beginning of the hypercall page. Since hypercalls are made with an indirect call to the hypercall page, all hypercall attempts fail with an exception and Linux panics. A Hyper-V fix is in progress to add ENDBR64. But guard against the Linux panic by clearing X86_FEATURE_IBT if the hypercall page doesn't start with ENDBR. The VM will boot and run without IBT. If future Linux 32-bit kernels were to support IBT, additional hypercall page hackery would be needed to make IBT work for such kernels in a Hyper-V VM. Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1690001476-98594-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6c04b52f139b..953e280c07c3 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -471,6 +472,26 @@ void __init hyperv_init(void) wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); } + /* + * Some versions of Hyper-V that provide IBT in guest VMs have a bug + * in that there's no ENDBR64 instruction at the entry to the + * hypercall page. Because hypercalls are invoked via an indirect call + * to the hypercall page, all hypercall attempts fail when IBT is + * enabled, and Linux panics. For such buggy versions, disable IBT. + * + * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall + * page, so if future Linux kernel versions enable IBT for 32-bit + * builds, additional hypercall page hackery will be required here + * to provide an ENDBR32. + */ +#ifdef CONFIG_X86_KERNEL_IBT + if (cpu_feature_enabled(X86_FEATURE_IBT) && + *(u32 *)hv_hypercall_pg != gen_endbr()) { + setup_clear_cpu_cap(X86_FEATURE_IBT); + pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n"); + } +#endif + /* * hyperv_init() is called before LAPIC is initialized: see * apic_intr_mode_init() -> x86_platform.apic_post_init() and From 060f2b979c4e0e894c381c76a4dcad24376feddd Mon Sep 17 00:00:00 2001 From: ZhiHu Date: Sun, 23 Jul 2023 23:12:47 +0000 Subject: [PATCH 027/230] x86/hyperv: fix a warning in mshyperv.h The following checkpatch warning is removed: WARNING: Use #include instead of Signed-off-by: ZhiHu Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/include/asm/mshyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 49bb4f2bd300..cd17e706f9ca 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include From ed0cf84e9cc42e6310961c87709621f1825c2bb8 Mon Sep 17 00:00:00 2001 From: Ani Sinha Date: Wed, 5 Jul 2023 19:14:07 +0530 Subject: [PATCH 028/230] vmbus_testing: fix wrong python syntax for integer value comparison It is incorrect in python to compare integer values using the "is" keyword. The "is" keyword in python is used to compare references to two objects, not their values. Newer version of python3 (version 3.8) throws a warning when such incorrect comparison is made. For value comparison, "==" should be used. Fix this in the code and suppress the following warning: /usr/sbin/vmbus_testing:167: SyntaxWarning: "is" with a literal. Did you mean "=="? Signed-off-by: Ani Sinha Link: https://lore.kernel.org/r/20230705134408.6302-1-anisinha@redhat.com Signed-off-by: Wei Liu --- tools/hv/vmbus_testing | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hv/vmbus_testing b/tools/hv/vmbus_testing index e7212903dd1d..4467979d8f69 100755 --- a/tools/hv/vmbus_testing +++ b/tools/hv/vmbus_testing @@ -164,7 +164,7 @@ def recursive_file_lookup(path, file_map): def get_all_devices_test_status(file_map): for device in file_map: - if (get_test_state(locate_state(device, file_map)) is 1): + if (get_test_state(locate_state(device, file_map)) == 1): print("Testing = ON for: {}" .format(device.split("/")[5])) else: @@ -203,7 +203,7 @@ def write_test_files(path, value): def set_test_state(state_path, state_value, quiet): write_test_files(state_path, state_value) - if (get_test_state(state_path) is 1): + if (get_test_state(state_path) == 1): if (not quiet): print("Testing = ON for device: {}" .format(state_path.split("/")[5])) From 9754353d0ab123d71bf572a483ecc8b330ef36a3 Mon Sep 17 00:00:00 2001 From: Haixin Yu Date: Mon, 24 Jul 2023 13:06:54 +0800 Subject: [PATCH 029/230] perf pmu arm64: Fix reading the PMU cpu slots in sysfs Commit f8ad6018ce3c065a ("perf pmu: Remove duplication around EVENT_SOURCE_DEVICE_PATH") uses sysfs__read_ull() to read a full sysfs path, which will never succeeds as it already comes with the sysfs mount point in it, which sysfs__read_ull() will add again. Fix it by reading the file using filename__read_ull(), that will not add the sysfs mount point. Fixes: f8ad6018ce3c065a ("perf pmu: Remove duplication around EVENT_SOURCE_DEVICE_PATH") Signed-off-by: Haixin Yu Tested-by: Jing Zhang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/ZL4G7rWXkfv-Ectq@B-Q60VQ05P-2326.local Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/pmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c index 561de0cb6b95..512a8f13c4de 100644 --- a/tools/perf/arch/arm64/util/pmu.c +++ b/tools/perf/arch/arm64/util/pmu.c @@ -54,10 +54,11 @@ double perf_pmu__cpu_slots_per_cycle(void) perf_pmu__pathname_scnprintf(path, sizeof(path), pmu->name, "caps/slots"); /* - * The value of slots is not greater than 32 bits, but sysfs__read_int - * can't read value with 0x prefix, so use sysfs__read_ull instead. + * The value of slots is not greater than 32 bits, but + * filename__read_int can't read value with 0x prefix, + * so use filename__read_ull instead. */ - sysfs__read_ull(path, &slots); + filename__read_ull(path, &slots); } return slots ? (double)slots : NAN; From 341e0e9f59e26676c88d0aa9a5a9b2d3c44bf21c Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 24 Jul 2023 22:28:15 +0530 Subject: [PATCH 030/230] perf callchain powerpc: Fix addr location init during arch_skip_callchain_idx function 'perf record; with callchain recording fails as below in powerpc: ./perf record -a -gR sleep 10 ./perf report perf: Segmentation fault gdb trace points to thread__find_map 0 0x00000000101df314 in atomic_cmpxchg (newval=1818846826, oldval=1818846827, v=0x1001a8f3) at /home/athira/linux/tools/include/asm-generic/atomic-gcc.h:70 1 refcount_sub_and_test (i=1, r=0x1001a8f3) at /home/athira/linux/tools/include/linux/refcount.h:135 2 refcount_dec_and_test (r=0x1001a8f3) at /home/athira/linux/tools/include/linux/refcount.h:148 3 map__put (map=0x1001a8b3) at util/map.c:311 4 0x000000001016842c in __map__zput (map=0x7fffffffa368) at util/map.h:190 5 thread__find_map (thread=0x105b92f0, cpumode=, addr=13835058055283572736, al=al@entry=0x7fffffffa358) at util/event.c:582 6 0x000000001016882c in thread__find_symbol (thread=, cpumode=, addr=, al=0x7fffffffa358) at util/event.c:656 7 0x00000000102e12b4 in arch_skip_callchain_idx (thread=, chain=) at arch/powerpc/util/skip-callchain-idx.c:255 8 0x00000000101d3bf4 in thread__resolve_callchain_sample (thread=0x105b92f0, cursor=0x1053d160, evsel=, sample=0x7fffffffa908, parent=0x7fffffffa778, root_al=0x7fffffffa710, max_stack=) at util/machine.c:2940 9 0x00000000101cd210 in sample__resolve_callchain (sample=, cursor=, parent=, evsel=, al=, max_stack=) at util/callchain.c:1112 10 0x000000001022a9d8 in hist_entry_iter__add (iter=0x7fffffffa750, al=0x7fffffffa710, max_stack_depth=, arg=0x7fffffffbbd0) at util/hist.c:1232 11 0x0000000010056d98 in process_sample_event (tool=0x7fffffffbbd0, event=0x7ffff6223c38, sample=0x7fffffffa908, evsel=, machine=0x10524ef8) at builtin-report.c:332 Here arch_skip_callchain_idx calls thread__find_symbol and which invokes thread__find_map with uninitialised "addr_location". Snippet: thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al); Recent change with commit 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy functions") , introduced "maps__zput" in the function thread__find_map. This could result in segfault while accessing uninitialised map from "struct addr_location". Fix this by adding addr_location__init and addr_location__exit in arch_skip_callchain_idx. Fixes: 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy functions") Reported-by: Aneesh Kumar K.V Signed-off-by: Athira Rajeev Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20230724165815.17810-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/skip-callchain-idx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index b7223feec770..5f3edb3004d8 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -250,6 +250,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain) if (!chain || chain->nr < 3) return skip_slot; + addr_location__init(&al); ip = chain->ips[1]; thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al); @@ -259,6 +260,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain) if (!dso) { pr_debug("%" PRIx64 " dso is NULL\n", ip); + addr_location__exit(&al); return skip_slot; } @@ -279,5 +281,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain) */ skip_slot = 3; } + + addr_location__exit(&al); return skip_slot; } From aa6fde93f3a49e42c0fe0490d7f3711bac0d162e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 17 Jul 2023 12:50:02 -1000 Subject: [PATCH 031/230] workqueue: Scale up wq_cpu_intensive_thresh_us if BogoMIPS is below 4000 wq_cpu_intensive_thresh_us is used to detect CPU-hogging per-cpu work items. Once detected, they're excluded from concurrency management to prevent them from blocking other per-cpu work items. If CONFIG_WQ_CPU_INTENSIVE_REPORT is enabled, repeat offenders are also reported so that the code can be updated. The default threshold is 10ms which is long enough to do fair bit of work on modern CPUs while short enough to be usually not noticeable. This unfortunately leads to a lot of, arguable spurious, detections on very slow CPUs. Using the same threshold across CPUs whose performance levels may be apart by multiple levels of magnitude doesn't make whole lot of sense. This patch scales up wq_cpu_intensive_thresh_us upto 1 second when BogoMIPS is below 4000. This is obviously very inaccurate but it doesn't have to be accurate to be useful. The mechanism is still useful when the threshold is fully scaled up and the benefits of reports are usually shared with everyone regardless of who's reporting, so as long as there are sufficient number of fast machines reporting, we don't lose much. Some (or is it all?) ARM CPUs systemtically report significantly lower BogoMIPS. While this doesn't break anything, given how widespread ARM CPUs are, it's at least a missed opportunity and it probably would be a good idea to teach workqueue about it. Signed-off-by: Tejun Heo Reported-and-Tested-by: Geert Uytterhoeven --- kernel/workqueue.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..800b4208dba9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -338,8 +339,10 @@ static cpumask_var_t *wq_numa_possible_cpumask; * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. + * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. + * The actual value is initialized in wq_cpu_intensive_thresh_init(). */ -static unsigned long wq_cpu_intensive_thresh_us = 10000; +static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); static bool wq_disable_numa; @@ -6513,6 +6516,42 @@ void __init workqueue_init_early(void) !system_freezable_power_efficient_wq); } +static void __init wq_cpu_intensive_thresh_init(void) +{ + unsigned long thresh; + unsigned long bogo; + + /* if the user set it to a specific value, keep it */ + if (wq_cpu_intensive_thresh_us != ULONG_MAX) + return; + + /* + * The default of 10ms is derived from the fact that most modern (as of + * 2023) processors can do a lot in 10ms and that it's just below what + * most consider human-perceivable. However, the kernel also runs on a + * lot slower CPUs including microcontrollers where the threshold is way + * too low. + * + * Let's scale up the threshold upto 1 second if BogoMips is below 4000. + * This is by no means accurate but it doesn't have to be. The mechanism + * is still useful even when the threshold is fully scaled up. Also, as + * the reports would usually be applicable to everyone, some machines + * operating on longer thresholds won't significantly diminish their + * usefulness. + */ + thresh = 10 * USEC_PER_MSEC; + + /* see init/calibrate.c for lpj -> BogoMIPS calculation */ + bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); + if (bogo < 4000) + thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); + + pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", + loops_per_jiffy, bogo, thresh); + + wq_cpu_intensive_thresh_us = thresh; +} + /** * workqueue_init - bring workqueue subsystem fully online * @@ -6528,6 +6567,8 @@ void __init workqueue_init(void) struct worker_pool *pool; int cpu, bkt; + wq_cpu_intensive_thresh_init(); + /* * It'd be simpler to initialize NUMA in workqueue_init_early() but * CPU to node mapping may not be available that early on some From ce92232614a5fb16992de8eb85bba7cb90772a1f Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:23 +0100 Subject: [PATCH 032/230] KVM: arm64: Factor out code for checking (h)VHE mode into a macro The code for checking whether the kernel is in (h)VHE mode is repeated, and will be needed again in future patches. Factor it out in a macro. No functional change intended. No change in emitted assembly code intended. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/kvmarm/20230724123829.2929609-3-tabba@google.com/ Reviewed-by: Marc Zyngier Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 8e5ffb58f83e..16d3bafa715d 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -31,6 +31,13 @@ .Lskip_hcrx_\@: .endm +/* Check if running in host at EL2 mode, i.e., (h)VHE. Jump to fail if not. */ +.macro __check_hvhe fail, tmp + mrs \tmp, hcr_el2 + and \tmp, \tmp, #HCR_E2H + cbz \tmp, \fail +.endm + /* * Allow Non-secure EL1 and EL0 to access physical timer and counter. * This is not necessary for VHE, since the host kernel runs in EL2, @@ -43,9 +50,7 @@ */ .macro __init_el2_timers mov x0, #3 // Enable EL1 physical timers - mrs x1, hcr_el2 - and x1, x1, #HCR_E2H - cbz x1, .LnVHE_\@ + __check_hvhe .LnVHE_\@, x1 lsl x0, x0, #10 .LnVHE_\@: msr cnthctl_el2, x0 @@ -139,9 +144,7 @@ /* Coprocessor traps */ .macro __init_el2_cptr - mrs x1, hcr_el2 - and x1, x1, #HCR_E2H - cbz x1, .LnVHE_\@ + __check_hvhe .LnVHE_\@, x1 mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) b .Lset_cptr_\@ .LnVHE_\@: @@ -269,9 +272,7 @@ .Linit_sve_\@: /* SVE register access */ mrs x0, cptr_el2 // Disable SVE traps - mrs x1, hcr_el2 - and x1, x1, #HCR_E2H - cbz x1, .Lcptr_nvhe_\@ + __check_hvhe .Lcptr_nvhe_\@, x1 // VHE case orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) From 45a3681a10ff1732fcd7a177fbf1f9ceeaffd7c9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:24 +0100 Subject: [PATCH 033/230] KVM: arm64: Use the appropriate feature trap register for SVE at EL2 setup Use the architectural feature trap/control register that corresponds to the current KVM mode, i.e., CPTR_EL2 or CPACR_EL1, when setting up SVE feature traps. Signed-off-by: Fuad Tabba Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724123829.2929609-4-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 16d3bafa715d..41c5b02f38c5 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -146,11 +146,12 @@ .macro __init_el2_cptr __check_hvhe .LnVHE_\@, x1 mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) - b .Lset_cptr_\@ + msr cpacr_el1, x0 + b .Lskip_set_cptr_\@ .LnVHE_\@: mov x0, #0x33ff -.Lset_cptr_\@: msr cptr_el2, x0 // Disable copro. traps to EL2 +.Lskip_set_cptr_\@: .endm /* Disable any fine grained traps */ @@ -271,17 +272,19 @@ check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 .Linit_sve_\@: /* SVE register access */ - mrs x0, cptr_el2 // Disable SVE traps __check_hvhe .Lcptr_nvhe_\@, x1 - // VHE case + // (h)VHE case + mrs x0, cpacr_el1 // Disable SVE traps orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) - b .Lset_cptr_\@ + msr cpacr_el1, x0 + b .Lskip_set_cptr_\@ .Lcptr_nvhe_\@: // nVHE case + mrs x0, cptr_el2 // Disable SVE traps bic x0, x0, #CPTR_EL2_TZ -.Lset_cptr_\@: msr cptr_el2, x0 +.Lskip_set_cptr_\@: isb mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector msr_s SYS_ZCR_EL2, x1 // length for EL1. From 380624d4358b0150804d279c20632555e453bc1f Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:25 +0100 Subject: [PATCH 034/230] KVM: arm64: Disable SME traps for (h)VHE at setup Ensure that SME traps are disabled for (h)VHE when setting up EL2, as they are for nVHE. Signed-off-by: Fuad Tabba Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724123829.2929609-5-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 41c5b02f38c5..b7afaa026842 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -293,9 +293,19 @@ check_override id_aa64pfr1, ID_AA64PFR1_EL1_SME_SHIFT, .Linit_sme_\@, .Lskip_sme_\@, x1, x2 .Linit_sme_\@: /* SME register access and priority mapping */ + __check_hvhe .Lcptr_nvhe_sme_\@, x1 + + // (h)VHE case + mrs x0, cpacr_el1 // Disable SME traps + orr x0, x0, #(CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN) + msr cpacr_el1, x0 + b .Lskip_set_cptr_sme_\@ + +.Lcptr_nvhe_sme_\@: // nVHE case mrs x0, cptr_el2 // Disable SME traps bic x0, x0, #CPTR_EL2_TSM msr cptr_el2, x0 +.Lskip_set_cptr_sme_\@: isb mrs x1, sctlr_el2 From 90ae31c65d5afdd0864017c9354247ddb601917f Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:26 +0100 Subject: [PATCH 035/230] KVM: arm64: Helper to write to appropriate feature trap register based on mode Factor out the code that decides whether to write to the feature trap registers, CPTR_EL2 or CPACR_EL1, based on the KVM mode, i.e., (h)VHE or nVHE. This function will be used in the subsequent patch. No functional change intended. Signed-off-by: Fuad Tabba Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724123829.2929609-6-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index efc0b45d79c3..f5941f6dce49 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -571,6 +571,14 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) return test_bit(feature, vcpu->arch.features); } +static __always_inline void kvm_write_cptr_el2(u64 val) +{ + if (has_vhe() || has_hvhe()) + write_sysreg(val, cpacr_el1); + else + write_sysreg(val, cptr_el2); +} + static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) { u64 val; @@ -597,9 +605,6 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) { u64 val = kvm_get_reset_cptr_el2(vcpu); - if (has_vhe() || has_hvhe()) - write_sysreg(val, cpacr_el1); - else - write_sysreg(val, cptr_el2); + kvm_write_cptr_el2(val); } #endif /* __ARM64_KVM_EMULATE_H__ */ From a9626099a51f697939d35983b92a9384a4c4a676 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:27 +0100 Subject: [PATCH 036/230] KVM: arm64: Use the appropriate feature trap register when activating traps Instead of writing directly to cptr_el2, use the helper that selects which feature trap register to write to based on the KVM mode. Fixes: 75c76ab5a641 ("KVM: arm64: Rework CPTR_EL2 programming for HVHE configuration") Signed-off-by: Fuad Tabba Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724123829.2929609-7-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 0a6271052def..e89a23153e85 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -63,7 +63,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) __activate_traps_fpsimd32(vcpu); } - write_sysreg(val, cptr_el2); + kvm_write_cptr_el2(val); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { From 7af0d5e50006614cbf313373df708df79d9f4657 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:28 +0100 Subject: [PATCH 037/230] KVM: arm64: Fix resetting SVE trap values on reset for hVHE Ensure that SVE traps are disabled for hVHE, if the FPSIMD state isn't owned by the guest, when getting the reset value for the architectural feature control register. Fixes: 75c76ab5a641 ("KVM: arm64: Rework CPTR_EL2 programming for HVHE configuration") Signed-off-by: Fuad Tabba Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724123829.2929609-8-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f5941f6dce49..adfb7d0ac55b 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -588,6 +588,10 @@ static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) CPACR_EL1_ZEN_EL1EN); } else if (has_hvhe()) { val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + + if (!vcpu_has_sve(vcpu) || + (vcpu->arch.fp_state != FP_STATE_GUEST_OWNED)) + val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN; } else { val = CPTR_NVHE_EL2_RES1; From 375110ab51dec5dcd077b8fa95075b2c67499119 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 24 Jul 2023 13:38:29 +0100 Subject: [PATCH 038/230] KVM: arm64: Fix resetting SME trap values on reset for (h)VHE Ensure that SME traps are disabled for (h)VHE when getting the reset value for the architectural feature control register. Fixes: 75c76ab5a641 ("KVM: arm64: Rework CPTR_EL2 programming for HVHE configuration") Signed-off-by: Fuad Tabba Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724123829.2929609-9-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index adfb7d0ac55b..3d6725ff0bf6 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -586,12 +586,16 @@ static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) if (has_vhe()) { val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | CPACR_EL1_ZEN_EL1EN); + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN; } else if (has_hvhe()) { val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); if (!vcpu_has_sve(vcpu) || (vcpu->arch.fp_state != FP_STATE_GUEST_OWNED)) val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN; } else { val = CPTR_NVHE_EL2_RES1; From 01b94b0f3922039f7d3e0d1eeb33b8891746b65f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 24 Jul 2023 14:18:42 +0200 Subject: [PATCH 039/230] KVM: arm64: fix __kvm_host_psci_cpu_entry() prototype The kvm_host_psci_cpu_entry() function was renamed in order to add a wrapper around it, but the prototype did not change, so now the missing-prototype warning came back in W=1 builds: arch/arm64/kvm/hyp/nvhe/psci-relay.c:203:28: error: no previous prototype for function '__kvm_host_psci_cpu_entry' [-Werror,-Wmissing-prototypes] asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) Fixes: dcf89d1111995 ("KVM: arm64: Add missing BTI instructions") Signed-off-by: Arnd Bergmann Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230724121850.1386668-1-arnd@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7d170aaa2db4..24e28bb2d95b 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -278,7 +278,7 @@ asmlinkage void __noreturn hyp_panic_bad_stack(void); asmlinkage void kvm_unexpected_el2_exception(void); struct kvm_cpu_context; void handle_trap(struct kvm_cpu_context *host_ctxt); -asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on); +asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on); void __noreturn __pkvm_init_finalise(void); void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); void kvm_patch_vector_branch(struct alt_instr *alt, From a8f014ec6a214c94ed6a9ff5ba8904a5cadd42a6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 20 Jul 2023 08:15:06 -0700 Subject: [PATCH 040/230] vboxsf: Use flexible arrays for trailing string member The declaration of struct shfl_string used trailing fake flexible arrays for the string member. This was tripping FORTIFY_SOURCE since commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3"). Replace the utf8 and utf16 members with actual flexible arrays, drop the unused ucs2 member, and retriain a 2 byte padding to keep the structure size the same. Reported-by: Larry Finger Closes: https://lore.kernel.org/lkml/ab3a70e9-60ed-0f13-e3d4-8866eaccc8c1@lwfinger.net/ Tested-by: Larry Finger Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20230720151458.never.673-kees@kernel.org Signed-off-by: Kees Cook --- fs/vboxsf/shfl_hostintf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/vboxsf/shfl_hostintf.h b/fs/vboxsf/shfl_hostintf.h index aca829062c12..069a019c9247 100644 --- a/fs/vboxsf/shfl_hostintf.h +++ b/fs/vboxsf/shfl_hostintf.h @@ -68,9 +68,9 @@ struct shfl_string { /** UTF-8 or UTF-16 string. Nul terminated. */ union { - u8 utf8[2]; - u16 utf16[1]; - u16 ucs2[1]; /* misnomer, use utf16. */ + u8 legacy_padding[2]; + DECLARE_FLEX_ARRAY(u8, utf8); + DECLARE_FLEX_ARRAY(u16, utf16); } string; }; VMMDEV_ASSERT_SIZE(shfl_string, 6); From 2dedcf414bb01b8d966eb445db1d181d92304fb2 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 24 Jul 2023 10:42:29 +0800 Subject: [PATCH 041/230] drm/ttm: check null pointer before accessing when swapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a check to avoid null pointer dereference as below: [ 90.002283] general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 90.002292] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] [ 90.002346] ? exc_general_protection+0x159/0x240 [ 90.002352] ? asm_exc_general_protection+0x26/0x30 [ 90.002357] ? ttm_bo_evict_swapout_allowable+0x322/0x5e0 [ttm] [ 90.002365] ? ttm_bo_evict_swapout_allowable+0x42e/0x5e0 [ttm] [ 90.002373] ttm_bo_swapout+0x134/0x7f0 [ttm] [ 90.002383] ? __pfx_ttm_bo_swapout+0x10/0x10 [ttm] [ 90.002391] ? lock_acquire+0x44d/0x4f0 [ 90.002398] ? ttm_device_swapout+0xa5/0x260 [ttm] [ 90.002412] ? lock_acquired+0x355/0xa00 [ 90.002416] ? do_raw_spin_trylock+0xb6/0x190 [ 90.002421] ? __pfx_lock_acquired+0x10/0x10 [ 90.002426] ? ttm_global_swapout+0x25/0x210 [ttm] [ 90.002442] ttm_device_swapout+0x198/0x260 [ttm] [ 90.002456] ? __pfx_ttm_device_swapout+0x10/0x10 [ttm] [ 90.002472] ttm_global_swapout+0x75/0x210 [ttm] [ 90.002486] ttm_tt_populate+0x187/0x3f0 [ttm] [ 90.002501] ttm_bo_handle_move_mem+0x437/0x590 [ttm] [ 90.002517] ttm_bo_validate+0x275/0x430 [ttm] [ 90.002530] ? __pfx_ttm_bo_validate+0x10/0x10 [ttm] [ 90.002544] ? kasan_save_stack+0x33/0x60 [ 90.002550] ? kasan_set_track+0x25/0x30 [ 90.002554] ? __kasan_kmalloc+0x8f/0xa0 [ 90.002558] ? amdgpu_gtt_mgr_new+0x81/0x420 [amdgpu] [ 90.003023] ? ttm_resource_alloc+0xf6/0x220 [ttm] [ 90.003038] amdgpu_bo_pin_restricted+0x2dd/0x8b0 [amdgpu] [ 90.003210] ? __x64_sys_ioctl+0x131/0x1a0 [ 90.003210] ? do_syscall_64+0x60/0x90 Fixes: a2848d08742c ("drm/ttm: never consider pinned BOs for eviction&swap") Tested-by: Mikhail Gavrilov Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Reviewed-by: Christian König Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20230724024229.1118444-1-guchun.chen@amd.com Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_bo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 7139a522b2f3..54e3083076b7 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -519,7 +519,8 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo, if (bo->pin_count) { *locked = false; - *busy = false; + if (busy) + *busy = false; return false; } From fb3bd914b3ec28f5fb697ac55c4846ac2d542855 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 28 Jun 2023 11:02:39 +0200 Subject: [PATCH 042/230] x86/srso: Add a Speculative RAS Overflow mitigation Add a mitigation for the speculative return address stack overflow vulnerability found on AMD processors. The mitigation works by ensuring all RET instructions speculate to a controlled location, similar to how speculation is controlled in the retpoline sequence. To accomplish this, the __x86_return_thunk forces the CPU to mispredict every function return using a 'safe return' sequence. To ensure the safety of this mitigation, the kernel must ensure that the safe return sequence is itself free from attacker interference. In Zen3 and Zen4, this is accomplished by creating a BTB alias between the untraining function srso_untrain_ret_alias() and the safe return function srso_safe_ret_alias() which results in evicting a potentially poisoned BTB entry and using that safe one for all function returns. In older Zen1 and Zen2, this is accomplished using a reinterpretation technique similar to Retbleed one: srso_untrain_ret() and srso_safe_ret(). Signed-off-by: Borislav Petkov (AMD) --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/srso.rst | 133 ++++++++++++++++++ .../admin-guide/kernel-parameters.txt | 11 ++ arch/x86/Kconfig | 7 + arch/x86/include/asm/cpufeatures.h | 5 + arch/x86/include/asm/nospec-branch.h | 15 +- arch/x86/include/asm/processor.h | 2 + arch/x86/kernel/alternative.c | 4 +- arch/x86/kernel/cpu/amd.c | 14 ++ arch/x86/kernel/cpu/bugs.c | 106 ++++++++++++++ arch/x86/kernel/cpu/common.c | 8 +- arch/x86/kernel/vmlinux.lds.S | 29 +++- arch/x86/lib/retpoline.S | 82 ++++++++++- drivers/base/cpu.c | 8 ++ include/linux/cpu.h | 2 + tools/objtool/arch/x86/decode.c | 5 +- 16 files changed, 422 insertions(+), 10 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/srso.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index e0614760a99e..ff4d3fa2a75c 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -19,3 +19,4 @@ are configurable at compile, boot or run time. l1d_flush.rst processor_mmio_stale_data.rst cross-thread-rsb.rst + srso diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst new file mode 100644 index 000000000000..32eb5e6db272 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/srso.rst @@ -0,0 +1,133 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Speculative Return Stack Overflow (SRSO) +======================================== + +This is a mitigation for the speculative return stack overflow (SRSO) +vulnerability found on AMD processors. The mechanism is by now the well +known scenario of poisoning CPU functional units - the Branch Target +Buffer (BTB) and Return Address Predictor (RAP) in this case - and then +tricking the elevated privilege domain (the kernel) into leaking +sensitive data. + +AMD CPUs predict RET instructions using a Return Address Predictor (aka +Return Address Stack/Return Stack Buffer). In some cases, a non-architectural +CALL instruction (i.e., an instruction predicted to be a CALL but is +not actually a CALL) can create an entry in the RAP which may be used +to predict the target of a subsequent RET instruction. + +The specific circumstances that lead to this varies by microarchitecture +but the concern is that an attacker can mis-train the CPU BTB to predict +non-architectural CALL instructions in kernel space and use this to +control the speculative target of a subsequent kernel RET, potentially +leading to information disclosure via a speculative side-channel. + +The issue is tracked under CVE-2023-20569. + +Affected processors +------------------- + +AMD Zen, generations 1-4. That is, all families 0x17 and 0x19. Older +processors have not been investigated. + +System information and options +------------------------------ + +First of all, it is required that the latest microcode be loaded for +mitigations to be effective. + +The sysfs file showing SRSO mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/spec_rstack_overflow + +The possible values in this file are: + + - 'Not affected' The processor is not vulnerable + + - 'Vulnerable: no microcode' The processor is vulnerable, no + microcode extending IBPB functionality + to address the vulnerability has been + applied. + + - 'Mitigation: microcode' Extended IBPB functionality microcode + patch has been applied. It does not + address User->Kernel and Guest->Host + transitions protection but it does + address User->User and VM->VM attack + vectors. + + (spec_rstack_overflow=microcode) + + - 'Mitigation: safe RET' Software-only mitigation. It complements + the extended IBPB microcode patch + functionality by addressing User->Kernel + and Guest->Host transitions protection. + + Selected by default or by + spec_rstack_overflow=safe-ret + + - 'Mitigation: IBPB' Similar protection as "safe RET" above + but employs an IBPB barrier on privilege + domain crossings (User->Kernel, + Guest->Host). + + (spec_rstack_overflow=ibpb) + + - 'Mitigation: IBPB on VMEXIT' Mitigation addressing the cloud provider + scenario - the Guest->Host transitions + only. + + (spec_rstack_overflow=ibpb-vmexit) + +In order to exploit vulnerability, an attacker needs to: + + - gain local access on the machine + + - break kASLR + + - find gadgets in the running kernel in order to use them in the exploit + + - potentially create and pin an additional workload on the sibling + thread, depending on the microarchitecture (not necessary on fam 0x19) + + - run the exploit + +Considering the performance implications of each mitigation type, the +default one is 'Mitigation: safe RET' which should take care of most +attack vectors, including the local User->Kernel one. + +As always, the user is advised to keep her/his system up-to-date by +applying software updates regularly. + +The default setting will be reevaluated when needed and especially when +new attack vectors appear. + +As one can surmise, 'Mitigation: safe RET' does come at the cost of some +performance depending on the workload. If one trusts her/his userspace +and does not want to suffer the performance impact, one can always +disable the mitigation with spec_rstack_overflow=off. + +Similarly, 'Mitigation: IBPB' is another full mitigation type employing +an indrect branch prediction barrier after having applied the required +microcode patch for one's system. This mitigation comes also at +a performance cost. + +Mitigation: safe RET +-------------------- + +The mitigation works by ensuring all RET instructions speculate to +a controlled location, similar to how speculation is controlled in the +retpoline sequence. To accomplish this, the __x86_return_thunk forces +the CPU to mispredict every function return using a 'safe return' +sequence. + +To ensure the safety of this mitigation, the kernel must ensure that the +safe return sequence is itself free from attacker interference. In Zen3 +and Zen4, this is accomplished by creating a BTB alias between the +untraining function srso_untrain_ret_alias() and the safe return +function srso_safe_ret_alias() which results in evicting a potentially +poisoned BTB entry and using that safe one for all function returns. + +In older Zen1 and Zen2, this is accomplished using a reinterpretation +technique similar to Retbleed one: srso_untrain_ret() and +srso_safe_ret(). diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..f5ec3dade58e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5875,6 +5875,17 @@ Not specifying this option is equivalent to spectre_v2_user=auto. + spec_rstack_overflow= + [X86] Control RAS overflow mitigation on AMD Zen CPUs + + off - Disable mitigation + microcode - Enable microcode mitigation only + safe-ret - Enable sw-only safe RET mitigation (default) + ibpb - Enable mitigation by issuing IBPB on + kernel entry + ibpb-vmexit - Issue IBPB only on VMEXIT + (cloud-specific mitigation) + spec_store_bypass_disable= [HW] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..d29f1e28a936 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2593,6 +2593,13 @@ config CPU_IBRS_ENTRY This mitigates both spectre_v2 and retbleed at great cost to performance. +config CPU_SRSO + bool "Mitigate speculative RAS overflow on AMD" + depends on CPU_SUP_AMD && X86_64 && RETHUNK + default y + help + Enable the SRSO mitigation needed on AMD Zen1-4 machines. + config SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1f6d904c6481..bc1b4d68e616 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -309,6 +309,9 @@ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ + /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ @@ -484,4 +487,6 @@ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ +/* BUG word 2 */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1a65cf4acb2b..43fe1c747085 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -211,7 +211,8 @@ * eventually turn into it's own annotation. */ .macro VALIDATE_UNRET_END -#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY) +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -296,6 +297,11 @@ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif + +#ifdef CONFIG_CPU_SRSO + ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \ + "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS +#endif .endm .macro UNTRAIN_RET_FROM_CALL @@ -307,6 +313,11 @@ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH #endif + +#ifdef CONFIG_CPU_SRSO + ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \ + "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS +#endif .endm @@ -332,6 +343,8 @@ extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); +extern void srso_untrain_ret(void); +extern void srso_untrain_ret_alias(void); extern void entry_ibpb(void); #ifdef CONFIG_CALL_THUNKS diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d46300e94f85..7c67db7c9f53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -682,9 +682,11 @@ extern u16 get_llc_id(unsigned int cpu); #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); +extern bool cpu_has_ibpb_brtype_microcode(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } +static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } #endif extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2dcf3a06af09..920a8ca7a8f8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -707,7 +707,9 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_feature_enabled(X86_FEATURE_RETHUNK) || + cpu_feature_enabled(X86_FEATURE_SRSO) || + cpu_feature_enabled(X86_FEATURE_SRSO_ALIAS)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 571abf808ea3..169cb255c483 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1235,3 +1235,17 @@ u32 amd_get_highest_perf(void) return 255; } EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +bool cpu_has_ibpb_brtype_microcode(void) +{ + u8 fam = boot_cpu_data.x86; + + if (fam == 0x17) { + /* Zen1/2 IBPB flushes branch type predictions too. */ + return boot_cpu_has(X86_FEATURE_AMD_IBPB); + } else if (fam == 0x19) { + return false; + } + + return false; +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9e2a91830f72..31cef61da03a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -47,6 +47,7 @@ static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); +static void __init srso_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -160,6 +161,7 @@ void __init cpu_select_mitigations(void) md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + srso_select_mitigation(); } /* @@ -2185,6 +2187,95 @@ static int __init l1tf_cmdline(char *str) } early_param("l1tf", l1tf_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt + +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_SAFE_RET, +}; + +enum srso_mitigation_cmd { + SRSO_CMD_OFF, + SRSO_CMD_MICROCODE, + SRSO_CMD_SAFE_RET, +}; + +static const char * const srso_strings[] = { + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; +static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; + +static int __init srso_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + srso_cmd = SRSO_CMD_OFF; + else if (!strcmp(str, "microcode")) + srso_cmd = SRSO_CMD_MICROCODE; + else if (!strcmp(str, "safe-ret")) + srso_cmd = SRSO_CMD_SAFE_RET; + else + pr_err("Ignoring unknown SRSO option (%s).", str); + + return 0; +} +early_param("spec_rstack_overflow", srso_parse_cmdline); + +#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." + +static void __init srso_select_mitigation(void) +{ + bool has_microcode; + + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + return; + + has_microcode = cpu_has_ibpb_brtype_microcode(); + if (!has_microcode) { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + } + + switch (srso_cmd) { + case SRSO_CMD_OFF: + return; + + case SRSO_CMD_MICROCODE: + if (has_microcode) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; + pr_warn(SRSO_NOTICE); + } + break; + + case SRSO_CMD_SAFE_RET: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (boot_cpu_data.x86 == 0x19) + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + else + setup_force_cpu_cap(X86_FEATURE_SRSO); + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + return; + } + break; + + default: + break; + + } + + pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); +} + #undef pr_fmt #define pr_fmt(fmt) fmt @@ -2382,6 +2473,13 @@ static ssize_t retbleed_show_state(char *buf) return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } +static ssize_t srso_show_state(char *buf) +{ + return sysfs_emit(buf, "%s%s\n", + srso_strings[srso_mitigation], + (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode")); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2431,6 +2529,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RETBLEED: return retbleed_show_state(buf); + case X86_BUG_SRSO: + return srso_show_state(buf); + default: break; } @@ -2495,4 +2596,9 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha { return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); } + +ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52683fddafaf..d4d823eae0fc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1250,6 +1250,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RETBLEED BIT(3) /* CPU is affected by SMT (cross-thread) return predictions */ #define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1281,8 +1283,9 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), + VULNBL_AMD(0x19, SRSO), {} }; @@ -1406,6 +1409,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); + if (cpu_matches(cpu_vuln_blacklist, SRSO)) + setup_force_cpu_bug(X86_BUG_SRSO); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 03c885d3640f..e76813230192 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -134,13 +134,27 @@ SECTIONS SOFTIRQENTRY_TEXT #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; - *(.text.__x86.*) + *(.text.__x86.indirect_thunk) + *(.text.__x86.return_thunk) __indirect_thunk_end = .; #endif STATIC_CALL_TEXT ALIGN_ENTRY_TEXT_BEGIN +#ifdef CONFIG_CPU_SRSO + *(.text.__x86.rethunk_untrain) +#endif + ENTRY_TEXT + +#ifdef CONFIG_CPU_SRSO + /* + * See the comment above srso_untrain_ret_alias()'s + * definition. + */ + . = srso_untrain_ret_alias | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20); + *(.text.__x86.rethunk_safe) +#endif ALIGN_ENTRY_TEXT_END *(.gnu.warning) @@ -509,7 +523,18 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #ifdef CONFIG_RETHUNK -. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned"); +. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned"); +. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); +#endif + +#ifdef CONFIG_CPU_SRSO +/* + * GNU ld cannot do XOR so do: (A | B) - (A & B) in order to compute the XOR + * of the two function addresses: + */ +. = ASSERT(((srso_untrain_ret_alias | srso_safe_ret_alias) - + (srso_untrain_ret_alias & srso_safe_ret_alias)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)), + "SRSO function pair won't alias"); #endif #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 3fd066d42ec0..845cfb0d748f 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -11,6 +11,7 @@ #include #include #include +#include .section .text.__x86.indirect_thunk @@ -131,6 +132,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) */ #ifdef CONFIG_RETHUNK +/* + * srso_untrain_ret_alias() and srso_safe_ret_alias() are placed at + * special addresses: + * + * - srso_untrain_ret_alias() is 2M aligned + * - srso_safe_ret_alias() is also in the same 2M page but bits 2, 8, 14 + * and 20 in its virtual address are set (while those bits in the + * srso_untrain_ret_alias() function are cleared). + * + * This guarantees that those two addresses will alias in the branch + * target buffer of Zen3/4 generations, leading to any potential + * poisoned entries at that BTB slot to get evicted. + * + * As a result, srso_safe_ret_alias() becomes a safe return. + */ +#ifdef CONFIG_CPU_SRSO + .section .text.__x86.rethunk_untrain + +SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE) + ASM_NOP2 + lfence + jmp __x86_return_thunk +SYM_FUNC_END(srso_untrain_ret_alias) +__EXPORT_THUNK(srso_untrain_ret_alias) + + .section .text.__x86.rethunk_safe +#endif + +/* Needs a definition for the __x86_return_thunk alternative below. */ +SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE) +#ifdef CONFIG_CPU_SRSO + add $8, %_ASM_SP + UNWIND_HINT_FUNC +#endif + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(srso_safe_ret_alias) + .section .text.__x86.return_thunk /* @@ -143,7 +183,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) * from re-poisioning the BTB prediction. */ .align 64 - .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc + .skip 64 - (__ret - zen_untrain_ret), 0xcc SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) ANNOTATE_NOENDBR /* @@ -175,10 +215,10 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) * evicted, __x86_return_thunk will suffer Straight Line Speculation * which will be contained safely by the INT3. */ -SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) +SYM_INNER_LABEL(__ret, SYM_L_GLOBAL) ret int3 -SYM_CODE_END(__x86_return_thunk) +SYM_CODE_END(__ret) /* * Ensure the TEST decoding / BTB invalidation is complete. @@ -189,11 +229,45 @@ SYM_CODE_END(__x86_return_thunk) * Jump back and execute the RET in the middle of the TEST instruction. * INT3 is for SLS protection. */ - jmp __x86_return_thunk + jmp __ret int3 SYM_FUNC_END(zen_untrain_ret) __EXPORT_THUNK(zen_untrain_ret) +/* + * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret() + * above. On kernel entry, srso_untrain_ret() is executed which is a + * + * movabs $0xccccccc308c48348,%rax + * + * and when the return thunk executes the inner label srso_safe_ret() + * later, it is a stack manipulation and a RET which is mispredicted and + * thus a "safe" one to use. + */ + .align 64 + .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc +SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + ANNOTATE_NOENDBR + .byte 0x48, 0xb8 + +SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) + add $8, %_ASM_SP + ret + int3 + int3 + int3 + lfence + call srso_safe_ret + int3 +SYM_CODE_END(srso_safe_ret) +SYM_FUNC_END(srso_untrain_ret) +__EXPORT_THUNK(srso_untrain_ret) + +SYM_FUNC_START(__x86_return_thunk) + ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \ + "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS + int3 +SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index c1815b9dae68..f111586d1cce 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -577,6 +577,12 @@ ssize_t __weak cpu_show_retbleed(struct device *dev, return sysfs_emit(buf, "Not affected\n"); } +ssize_t __weak cpu_show_spec_rstack_overflow(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -588,6 +594,7 @@ static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); +static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -601,6 +608,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_srbds.attr, &dev_attr_mmio_stale_data.attr, &dev_attr_retbleed.attr, + &dev_attr_spec_rstack_overflow.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6e6e57ec69e8..23ac87be1ff1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -70,6 +70,8 @@ extern ssize_t cpu_show_mmio_stale_data(struct device *dev, char *buf); extern ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 2e1caabecb18..2d51fa8da9e8 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -824,5 +824,8 @@ bool arch_is_retpoline(struct symbol *sym) bool arch_is_rethunk(struct symbol *sym) { - return !strcmp(sym->name, "__x86_return_thunk"); + return !strcmp(sym->name, "__x86_return_thunk") || + !strcmp(sym->name, "srso_untrain_ret") || + !strcmp(sym->name, "srso_safe_ret") || + !strcmp(sym->name, "__ret"); } From 79113e4060aba744787a81edb9014f2865193854 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 18 Jul 2023 11:13:40 +0200 Subject: [PATCH 043/230] x86/srso: Add IBPB_BRTYPE support Add support for the synthetic CPUID flag which "if this bit is 1, it indicates that MSR 49h (PRED_CMD) bit 0 (IBPB) flushes all branch type predictions from the CPU branch predictor." This flag is there so that this capability in guests can be detected easily (otherwise one would have to track microcode revisions which is impossible for guests). It is also needed only for Zen3 and -4. The other two (Zen1 and -2) always flush branch type predictions by default. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/bugs.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bc1b4d68e616..8aebe95d2fad 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -445,6 +445,8 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ + /* * BUG word(s) */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 31cef61da03a..ff61ef61277a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2238,10 +2238,20 @@ static void __init srso_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) return; - has_microcode = cpu_has_ibpb_brtype_microcode(); + /* + * The first check is for the kernel running as a guest in order + * for guests to verify whether IBPB is a viable mitigation. + */ + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode(); if (!has_microcode) { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); + } else { + /* + * Enable the synthetic (even if in a real CPUID leaf) + * flag for guests. + */ + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); } switch (srso_cmd) { From 1b5277c0ea0b247393a9c426769fde18cff5e2f6 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 29 Jun 2023 17:43:40 +0200 Subject: [PATCH 044/230] x86/srso: Add SRSO_NO support Add support for the CPUID flag which denotes that the CPU is not affected by SRSO. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/amd.c | 12 ++++++------ arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++++++++++---- arch/x86/kernel/cpu/common.c | 6 ++++-- arch/x86/kvm/cpuid.c | 3 +++ 7 files changed, 39 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8aebe95d2fad..93070aabbb2f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -445,7 +445,9 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ /* * BUG word(s) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3aedae61af4f..c81483a3c13d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -57,6 +57,7 @@ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ +#define PRED_CMD_SBPB BIT(7) /* Selective Branch Prediction Barrier */ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 43fe1c747085..8346c33760c1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -492,11 +492,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } +extern u64 x86_pred_cmd; + static inline void indirect_branch_prediction_barrier(void) { - u64 val = PRED_CMD_IBPB; - - alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); + alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); } /* The Intel SPEC CTRL MSR base value cache */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 169cb255c483..834f310b2f1a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1240,12 +1240,12 @@ bool cpu_has_ibpb_brtype_microcode(void) { u8 fam = boot_cpu_data.x86; - if (fam == 0x17) { - /* Zen1/2 IBPB flushes branch type predictions too. */ + /* Zen1/2 IBPB flushes branch type predictions too. */ + if (fam == 0x17) return boot_cpu_has(X86_FEATURE_AMD_IBPB); - } else if (fam == 0x19) { + /* Poke the MSR bit on Zen3/4 to check its presence. */ + else if (fam == 0x19) + return !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB); + else return false; - } - - return false; } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ff61ef61277a..439ecad62317 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -57,6 +57,9 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); +u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; +EXPORT_SYMBOL_GPL(x86_pred_cmd); + static DEFINE_MUTEX(spec_ctrl_mutex); /* Update SPEC_CTRL MSR and its cached copy unconditionally */ @@ -2236,7 +2239,7 @@ static void __init srso_select_mitigation(void) bool has_microcode; if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) - return; + goto pred_cmd; /* * The first check is for the kernel running as a guest in order @@ -2249,9 +2252,18 @@ static void __init srso_select_mitigation(void) } else { /* * Enable the synthetic (even if in a real CPUID leaf) - * flag for guests. + * flags for guests. */ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + setup_force_cpu_cap(X86_FEATURE_SBPB); + + /* + * Zen1/2 with SMT off aren't vulnerable after the right + * IBPB microcode has been applied. + */ + if ((boot_cpu_data.x86 < 0x19) && + (cpu_smt_control == CPU_SMT_DISABLED)) + setup_force_cpu_cap(X86_FEATURE_SRSO_NO); } switch (srso_cmd) { @@ -2274,16 +2286,20 @@ static void __init srso_select_mitigation(void) srso_mitigation = SRSO_MITIGATION_SAFE_RET; } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - return; + goto pred_cmd; } break; default: break; - } pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); + +pred_cmd: + if (boot_cpu_has(X86_FEATURE_SRSO_NO) || + srso_cmd == SRSO_CMD_OFF) + x86_pred_cmd = PRED_CMD_SBPB; } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d4d823eae0fc..5576cdac3b4a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1409,8 +1409,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); - if (cpu_matches(cpu_vuln_blacklist, SRSO)) - setup_force_cpu_bug(X86_BUG_SRSO); + if (!cpu_has(c, X86_FEATURE_SRSO_NO)) { + if (cpu_matches(cpu_vuln_blacklist, SRSO)) + setup_force_cpu_bug(X86_BUG_SRSO); + } if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7f4d13383cf2..d3432687c9e6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -729,6 +729,9 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + if (cpu_feature_enabled(X86_FEATURE_SRSO_NO)) + kvm_cpu_cap_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, F(PERFMON_V2) ); From 233d6f68b98d480a7c42ebe78c38f79d44741ca9 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 6 Jul 2023 15:04:35 +0200 Subject: [PATCH 045/230] x86/srso: Add IBPB Add the option to mitigate using IBPB on a kernel entry. Pull in the Retbleed alternative so that the IBPB call from there can be used. Also, if Retbleed mitigation is done using IBPB, the same mitigation can and must be used here. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/include/asm/nospec-branch.h | 2 +- arch/x86/kernel/cpu/bugs.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8346c33760c1..3faf044569a5 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -290,7 +290,7 @@ */ .macro UNTRAIN_RET #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) + defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 439ecad62317..f3cc432ed818 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2197,18 +2197,21 @@ enum srso_mitigation { SRSO_MITIGATION_NONE, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, + SRSO_MITIGATION_IBPB, }; enum srso_mitigation_cmd { SRSO_CMD_OFF, SRSO_CMD_MICROCODE, SRSO_CMD_SAFE_RET, + SRSO_CMD_IBPB, }; static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2225,6 +2228,8 @@ static int __init srso_parse_cmdline(char *str) srso_cmd = SRSO_CMD_MICROCODE; else if (!strcmp(str, "safe-ret")) srso_cmd = SRSO_CMD_SAFE_RET; + else if (!strcmp(str, "ibpb")) + srso_cmd = SRSO_CMD_IBPB; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2266,6 +2271,14 @@ static void __init srso_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_SRSO_NO); } + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (has_microcode) { + pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n"); + srso_mitigation = SRSO_MITIGATION_IBPB; + goto pred_cmd; + } + } + switch (srso_cmd) { case SRSO_CMD_OFF: return; @@ -2290,6 +2303,16 @@ static void __init srso_select_mitigation(void) } break; + case SRSO_CMD_IBPB: + if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (has_microcode) { + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + srso_mitigation = SRSO_MITIGATION_IBPB; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto pred_cmd; + } default: break; } From d893832d0e1ef41c72cdae444268c1d64a2be8ad Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 7 Jul 2023 13:53:41 +0200 Subject: [PATCH 046/230] x86/srso: Add IBPB on VMEXIT Add the option to flush IBPB only on VMEXIT in order to protect from malicious guests but one otherwise trusts the software that runs on the hypervisor. Signed-off-by: Borislav Petkov (AMD) --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 19 +++++++++++++++++++ arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/svm/vmenter.S | 3 +++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 93070aabbb2f..7600a8a1589f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -311,6 +311,7 @@ #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f3cc432ed818..d4109eb5eb2e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2198,6 +2198,7 @@ enum srso_mitigation { SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, + SRSO_MITIGATION_IBPB_ON_VMEXIT, }; enum srso_mitigation_cmd { @@ -2205,6 +2206,7 @@ enum srso_mitigation_cmd { SRSO_CMD_MICROCODE, SRSO_CMD_SAFE_RET, SRSO_CMD_IBPB, + SRSO_CMD_IBPB_ON_VMEXIT, }; static const char * const srso_strings[] = { @@ -2212,6 +2214,7 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2230,6 +2233,8 @@ static int __init srso_parse_cmdline(char *str) srso_cmd = SRSO_CMD_SAFE_RET; else if (!strcmp(str, "ibpb")) srso_cmd = SRSO_CMD_IBPB; + else if (!strcmp(str, "ibpb-vmexit")) + srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2313,6 +2318,20 @@ static void __init srso_select_mitigation(void) pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); goto pred_cmd; } + break; + + case SRSO_CMD_IBPB_ON_VMEXIT: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + default: break; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d381ad424554..0a51fd56f960 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1498,7 +1498,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; - indirect_branch_prediction_barrier(); + + if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) + indirect_branch_prediction_barrier(); } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 8e8295e774f0..265452fc9ebe 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -224,6 +224,9 @@ SYM_FUNC_START(__svm_vcpu_run) */ UNTRAIN_RET + /* SRSO */ + ALTERNATIVE "", "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT + /* * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded From 5c49b6c3f2cc6d44f691c4aa3d22ca49bedf637b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 18 Jul 2023 17:18:34 -0700 Subject: [PATCH 047/230] perf parse-events: Extra care around force grouped events Perf metric (topdown) events on Intel Icelake+ machines require a group, however, they may be next to events that don't require a group. Consider: cycles,slots,topdown-fe-bound The cycles event needn't be grouped but slots and topdown-fe-bound need grouping. Prior to this change, as slots and topdown-fe-bound need a group forcing and all events share the same PMU, slots and topdown-fe-bound would be forced into a group with cycles. This is a bug on two fronts, cycles wasn't supposed to be grouped and cycles can't be a group leader with a perf metric event. This change adds recognition that cycles isn't force grouped and so it shouldn't be force grouped with slots and topdown-fe-bound. Fixes: a90cc5a9eeab45ea ("perf evsel: Don't let evsel__group_pmu_name() traverse unsorted group") Signed-off-by: Ian Rogers Tested-by: Andi Kleen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230719001836.198363-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index acde097e327c..af4683526d5f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2149,7 +2149,7 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) int idx = 0, unsorted_idx = -1; struct evsel *pos, *cur_leader = NULL; struct perf_evsel *cur_leaders_grp = NULL; - bool idx_changed = false; + bool idx_changed = false, cur_leader_force_grouped = false; int orig_num_leaders = 0, num_leaders = 0; int ret; @@ -2190,7 +2190,7 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) const struct evsel *pos_leader = evsel__leader(pos); const char *pos_pmu_name = pos->group_pmu_name; const char *cur_leader_pmu_name, *pos_leader_pmu_name; - bool force_grouped = arch_evsel__must_be_in_group(pos); + bool pos_force_grouped = arch_evsel__must_be_in_group(pos); /* Reset index and nr_members. */ if (pos->core.idx != idx) @@ -2206,7 +2206,8 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) cur_leader = pos; cur_leader_pmu_name = cur_leader->group_pmu_name; - if ((cur_leaders_grp != pos->core.leader && !force_grouped) || + if ((cur_leaders_grp != pos->core.leader && + (!pos_force_grouped || !cur_leader_force_grouped)) || strcmp(cur_leader_pmu_name, pos_pmu_name)) { /* Event is for a different group/PMU than last. */ cur_leader = pos; @@ -2216,9 +2217,14 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) * group. */ cur_leaders_grp = pos->core.leader; + /* + * Avoid forcing events into groups with events that + * don't need to be in the group. + */ + cur_leader_force_grouped = pos_force_grouped; } pos_leader_pmu_name = pos_leader->group_pmu_name; - if (strcmp(pos_leader_pmu_name, pos_pmu_name) || force_grouped) { + if (strcmp(pos_leader_pmu_name, pos_pmu_name) || pos_force_grouped) { /* * Event's PMU differs from its leader's. Groups can't * span PMUs, so update leader from the group/PMU From e8d38345da249be17046b74d7bb64b77cdd07a08 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 18 Jul 2023 17:18:35 -0700 Subject: [PATCH 048/230] perf parse-events: When fixing group leaders always set the leader The evsel grouping fix iterates over evsels tracking the leader group and the current position's group, updating the current position's leader if an evsel is being forced into a group or groups changed. However, groups changing isn't a sufficient condition as sorting may have reordered events and the leader may no longer come first. For this reason update all leaders whenever they disagree. This change breaks certain Icelake+ metrics due to bugs in the kernel. For example, tma_l3_bound with threshold enabled tries to program the events: {topdown-retiring,slots,CYCLE_ACTIVITY.STALLS_L2_MISS,topdown-fe-bound,EXE_ACTIVITY.BOUND_ON_STORES,EXE_ACTIVITY.1_PORTS_UTIL,topdown-be-bound,cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/,CYCLE_ACTIVITY.STALLS_L3_MISS,CPU_CLK_UNHALTED.THREAD,CYCLE_ACTIVITY.STALLS_MEM_ANY,EXE_ACTIVITY.2_PORTS_UTIL,CYCLE_ACTIVITY.STALLS_TOTAL,topdown-bad-spec}:W fixing the perf metric event order gives: {slots,topdown-retiring,topdown-fe-bound,topdown-be-bound,topdown-bad-spec,CYCLE_ACTIVITY.STALLS_L2_MISS,EXE_ACTIVITY.BOUND_ON_STORES,EXE_ACTIVITY.1_PORTS_UTIL,cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/,CYCLE_ACTIVITY.STALLS_L3_MISS,CPU_CLK_UNHALTED.THREAD,CYCLE_ACTIVITY.STALLS_MEM_ANY,EXE_ACTIVITY.2_PORTS_UTIL,CYCLE_ACTIVITY.STALLS_TOTAL}:W Both of these return "" for all events, whilst they work with the group removed respecting that the perf metric events must still be grouped. A vendor events update will need to add METRIC_NO_GROUP to these metrics to workaround the kernel PMU driver issue. Fixes: a90cc5a9eeab45ea ("perf evsel: Don't let evsel__group_pmu_name() traverse unsorted group") Signed-off-by: Ian Rogers Tested-by: Andi Kleen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230719001836.198363-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index af4683526d5f..62d5ff5d8dae 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2189,7 +2189,7 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) list_for_each_entry(pos, list, core.node) { const struct evsel *pos_leader = evsel__leader(pos); const char *pos_pmu_name = pos->group_pmu_name; - const char *cur_leader_pmu_name, *pos_leader_pmu_name; + const char *cur_leader_pmu_name; bool pos_force_grouped = arch_evsel__must_be_in_group(pos); /* Reset index and nr_members. */ @@ -2223,13 +2223,8 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) */ cur_leader_force_grouped = pos_force_grouped; } - pos_leader_pmu_name = pos_leader->group_pmu_name; - if (strcmp(pos_leader_pmu_name, pos_pmu_name) || pos_force_grouped) { - /* - * Event's PMU differs from its leader's. Groups can't - * span PMUs, so update leader from the group/PMU - * tracker. - */ + if (pos_leader != cur_leader) { + /* The leader changed so update it. */ evsel__set_leader(pos, cur_leader); } } From b161f25fa30644598007d752a1b802cda0140788 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 18 Jul 2023 17:18:36 -0700 Subject: [PATCH 049/230] perf parse-events: Only move force grouped evsels when sorting Prior to this change, events without a group would be sorted as if they were from the location of the first event without a group. For example instructions and cycles are without a group: instructions,{imc_free_running/data_read/,imc_free_running/data_write/},cycles parse events would create an eventual evlist like: instructions,cycles,{uncore_imc_free_running_0/data_read/,uncore_imc_free_running_1/data_read/,uncore_imc_free_running_0/data_write/,uncore_imc_free_running_1/data_write/} This is done so that perf metric events, that must always be in a group, will be adjacent and so can be forced into a group. This change modifies the sorting so that only force grouped events, like perf metrics, are sorted and all other events keep their position with respect to groups in the evlist. The location of the force grouped event is chosen to match the first force grouped event. For architectures without force grouped events, ie anything not Intel Icelake or newer, this should mean sorting and fixing doesn't modify the event positions except when fixing the grouping for PMUs of things like uncore events. Fixes: 347c2f0a0988c59c ("perf parse-events: Sort and group parsed events") Reported-by: Andi Kleen Signed-off-by: Ian Rogers Tested-by: Andi Kleen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230719001836.198363-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 39 ++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 62d5ff5d8dae..c9ec0cafb69d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2100,16 +2100,16 @@ __weak int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs) return lhs->core.idx - rhs->core.idx; } -static int evlist__cmp(void *state, const struct list_head *l, const struct list_head *r) +static int evlist__cmp(void *_fg_idx, const struct list_head *l, const struct list_head *r) { const struct perf_evsel *lhs_core = container_of(l, struct perf_evsel, node); const struct evsel *lhs = container_of(lhs_core, struct evsel, core); const struct perf_evsel *rhs_core = container_of(r, struct perf_evsel, node); const struct evsel *rhs = container_of(rhs_core, struct evsel, core); - int *leader_idx = state; - int lhs_leader_idx = *leader_idx, rhs_leader_idx = *leader_idx, ret; + int *force_grouped_idx = _fg_idx; + int lhs_sort_idx, rhs_sort_idx, ret; const char *lhs_pmu_name, *rhs_pmu_name; - bool lhs_has_group = false, rhs_has_group = false; + bool lhs_has_group, rhs_has_group; /* * First sort by grouping/leader. Read the leader idx only if the evsel @@ -2121,15 +2121,25 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list */ if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1) { lhs_has_group = true; - lhs_leader_idx = lhs_core->leader->idx; + lhs_sort_idx = lhs_core->leader->idx; + } else { + lhs_has_group = false; + lhs_sort_idx = *force_grouped_idx != -1 && arch_evsel__must_be_in_group(lhs) + ? *force_grouped_idx + : lhs_core->idx; } if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1) { rhs_has_group = true; - rhs_leader_idx = rhs_core->leader->idx; + rhs_sort_idx = rhs_core->leader->idx; + } else { + rhs_has_group = false; + rhs_sort_idx = *force_grouped_idx != -1 && arch_evsel__must_be_in_group(rhs) + ? *force_grouped_idx + : rhs_core->idx; } - if (lhs_leader_idx != rhs_leader_idx) - return lhs_leader_idx - rhs_leader_idx; + if (lhs_sort_idx != rhs_sort_idx) + return lhs_sort_idx - rhs_sort_idx; /* Group by PMU if there is a group. Groups can't span PMUs. */ if (lhs_has_group && rhs_has_group) { @@ -2146,7 +2156,7 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list static int parse_events__sort_events_and_fix_groups(struct list_head *list) { - int idx = 0, unsorted_idx = -1; + int idx = 0, force_grouped_idx = -1; struct evsel *pos, *cur_leader = NULL; struct perf_evsel *cur_leaders_grp = NULL; bool idx_changed = false, cur_leader_force_grouped = false; @@ -2174,12 +2184,14 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) */ pos->core.idx = idx++; - if (unsorted_idx == -1 && pos == pos_leader && pos->core.nr_members < 2) - unsorted_idx = pos->core.idx; + /* Remember an index to sort all forced grouped events together to. */ + if (force_grouped_idx == -1 && pos == pos_leader && pos->core.nr_members < 2 && + arch_evsel__must_be_in_group(pos)) + force_grouped_idx = pos->core.idx; } /* Sort events. */ - list_sort(&unsorted_idx, list, evlist__cmp); + list_sort(&force_grouped_idx, list, evlist__cmp); /* * Recompute groups, splitting for PMUs and adding groups for events @@ -2190,7 +2202,8 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) const struct evsel *pos_leader = evsel__leader(pos); const char *pos_pmu_name = pos->group_pmu_name; const char *cur_leader_pmu_name; - bool pos_force_grouped = arch_evsel__must_be_in_group(pos); + bool pos_force_grouped = force_grouped_idx != -1 && + arch_evsel__must_be_in_group(pos); /* Reset index and nr_members. */ if (pos->core.idx != idx) From d3053b4a6b76f29fd1bf0b438b19a2d6ece9657d Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 20 Jun 2023 05:53:59 +0300 Subject: [PATCH 050/230] MAINTAINERS: Add myself as reviewer for HYPERBUS Add myself as Designated Reviewer for Hyperbus support. I'm assessing the framework and I'd like to get involved in reviewing further patches. Signed-off-by: Tudor Ambarus Acked-by: Vignesh Raghavendra Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20230620025359.33839-1-tudor.ambarus@linaro.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3be1bdfe8ecc..c5e338ca3b52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9649,6 +9649,7 @@ F: tools/hv/ HYPERBUS SUPPORT M: Vignesh Raghavendra +R: Tudor Ambarus L: linux-mtd@lists.infradead.org S: Supported Q: http://patchwork.ozlabs.org/project/linux-mtd/list/ From 71c8f9cf2623d0db79665f876b95afcdd8214aec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2023 21:00:25 +0200 Subject: [PATCH 051/230] mtd: spi-nor: avoid holes in struct spi_mem_op gcc gets confused when -ftrivial-auto-var-init=pattern is used on sparse bit fields such as 'struct spi_mem_op', which caused the previous false positive warning about an uninitialized variable: drivers/mtd/spi-nor/spansion.c: error: 'op' is used uninitialized [-Werror=uninitialized] In fact, the variable is fully initialized and gcc does not see it being used, so the warning is entirely bogus. The problem appears to be a misoptimization in the initialization of single bit fields when the rest of the bytes are not initialized. A previous workaround added another initialization, which ended up shutting up the warning in spansion.c, though it apparently still happens in other files as reported by Peter Foley in the gcc bugzilla. The workaround of adding a fake initialization seems particularly bad because it would set values that can never be correct but prevent the compiler from warning about actually missing initializations. Revert the broken workaround and instead pad the structure to only have bitfields that add up to full bytes, which should avoid this behavior in all drivers. I also filed a new bug against gcc with what I found, so this can hopefully be addressed in future gcc releases. At the moment, only gcc-12 and gcc-13 are affected. Cc: Peter Foley Cc: Pedro Falcato Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110743 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108402 Link: https://godbolt.org/z/efMMsG1Kx Fixes: 420c4495b5e56 ("mtd: spi-nor: spansion: make sure local struct does not contain garbage") Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20230719190045.4007391-1-arnd@kernel.org --- drivers/mtd/spi-nor/spansion.c | 4 ++-- include/linux/spi/spi-mem.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/spi-nor/spansion.c b/drivers/mtd/spi-nor/spansion.c index 36876aa849ed..15f9a80c10b9 100644 --- a/drivers/mtd/spi-nor/spansion.c +++ b/drivers/mtd/spi-nor/spansion.c @@ -361,7 +361,7 @@ static int cypress_nor_determine_addr_mode_by_sr1(struct spi_nor *nor, */ static int cypress_nor_set_addr_mode_nbytes(struct spi_nor *nor) { - struct spi_mem_op op = {}; + struct spi_mem_op op; u8 addr_mode; int ret; @@ -492,7 +492,7 @@ s25fs256t_post_bfpt_fixup(struct spi_nor *nor, const struct sfdp_parameter_header *bfpt_header, const struct sfdp_bfpt *bfpt) { - struct spi_mem_op op = {}; + struct spi_mem_op op; int ret; ret = cypress_nor_set_addr_mode_nbytes(nor); diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 8e984d75f5b6..6b0a7dc48a4b 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -101,6 +101,7 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; u16 opcode; } cmd; @@ -108,6 +109,7 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; u64 val; } addr; @@ -115,12 +117,14 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; } dummy; struct { u8 buswidth; u8 dtr : 1; u8 ecc : 1; + u8 __pad : 6; enum spi_mem_data_dir dir; unsigned int nbytes; union { From c6abce60338aa2080973cd95be0aedad528bb41f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 19 Jul 2023 23:55:01 +0200 Subject: [PATCH 052/230] mtd: rawnand: fsl_upm: Fix an off-by one test in fun_exec_op() 'op-cs' is copied in 'fun->mchip_number' which is used to access the 'mchip_offsets' and the 'rnb_gpio' arrays. These arrays have NAND_MAX_CHIPS elements, so the index must be below this limit. Fix the sanity check in order to avoid the NAND_MAX_CHIPS value. This would lead to out-of-bound accesses. Fixes: 54309d657767 ("mtd: rawnand: fsl_upm: Implement exec_op()") Signed-off-by: Christophe JAILLET Reviewed-by: Dan Carpenter Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/cd01cba1c7eda58bdabaae174c78c067325803d2.1689803636.git.christophe.jaillet@wanadoo.fr --- drivers/mtd/nand/raw/fsl_upm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/fsl_upm.c b/drivers/mtd/nand/raw/fsl_upm.c index 086426139173..7366e85c09fd 100644 --- a/drivers/mtd/nand/raw/fsl_upm.c +++ b/drivers/mtd/nand/raw/fsl_upm.c @@ -135,7 +135,7 @@ static int fun_exec_op(struct nand_chip *chip, const struct nand_operation *op, unsigned int i; int ret; - if (op->cs > NAND_MAX_CHIPS) + if (op->cs >= NAND_MAX_CHIPS) return -EINVAL; if (check_only) From cdddb626dc053a2bbe8be4150e9b67395130a683 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 1 Jul 2023 11:17:22 -0600 Subject: [PATCH 053/230] media: venus: Use struct_size_t() helper in pkt_session_unset_buffers() Prefer struct_size_t() over struct_size() when no pointer instance of the structure type is present. Signed-off-by: "Gustavo A. R. Silva" Reviewed-by: Vikash Garodia Link: https://lore.kernel.org/r/ZKBfoqSl61jfpO2r@work Signed-off-by: Kees Cook --- drivers/media/platform/qcom/venus/hfi_cmds.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/qcom/venus/hfi_cmds.c b/drivers/media/platform/qcom/venus/hfi_cmds.c index 7f0802a5518c..3418d2dd9371 100644 --- a/drivers/media/platform/qcom/venus/hfi_cmds.c +++ b/drivers/media/platform/qcom/venus/hfi_cmds.c @@ -251,8 +251,8 @@ int pkt_session_unset_buffers(struct hfi_session_release_buffer_pkt *pkt, pkt->extradata_size = 0; pkt->shdr.hdr.size = - struct_size((struct hfi_session_set_buffers_pkt *)0, - buffer_info, bd->num_buffers); + struct_size_t(struct hfi_session_set_buffers_pkt, + buffer_info, bd->num_buffers); } pkt->response_req = bd->response_required; From 6722b25712054c0f903b839b8f5088438dd04df3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 24 Jul 2023 23:43:20 +0530 Subject: [PATCH 054/230] powerpc/mm/altmap: Fix altmap boundary check altmap->free includes the entire free space from which altmap blocks can be allocated. So when checking whether the kernel is doing altmap block free, compute the boundary correctly, otherwise memory hotunplug can fail. Fixes: 9ef34630a461 ("powerpc/mm: Fallback to RAM if the altmap is unusable") Signed-off-by: "Aneesh Kumar K.V" Reviewed-by: David Hildenbrand Signed-off-by: Michael Ellerman Link: https://msgid.link/20230724181320.471386-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/init_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index fe1b83020e0d..0ec5b45b1e86 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -314,8 +314,7 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, start = ALIGN_DOWN(start, page_size); if (altmap) { alt_start = altmap->base_pfn; - alt_end = altmap->base_pfn + altmap->reserve + - altmap->free + altmap->alloc + altmap->align; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; } pr_debug("vmemmap_free %lx...%lx\n", start, end); From 41a506ef71eb38d94fe133f565c87c3e06ccc072 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Wed, 21 Jun 2023 10:43:49 +0530 Subject: [PATCH 055/230] powerpc/ftrace: Create a dummy stackframe to fix stack unwind With ppc64 -mprofile-kernel and ppc32 -pg, profiling instructions to call into ftrace are emitted right at function entry. The instruction sequence used is minimal to reduce overhead. Crucially, a stackframe is not created for the function being traced. This breaks stack unwinding since the function being traced does not have a stackframe for itself. As such, it never shows up in the backtrace: /sys/kernel/debug/tracing # echo 1 > /proc/sys/kernel/stack_tracer_enabled /sys/kernel/debug/tracing # cat stack_trace Depth Size Location (17 entries) ----- ---- -------- 0) 4144 32 ftrace_call+0x4/0x44 1) 4112 432 get_page_from_freelist+0x26c/0x1ad0 2) 3680 496 __alloc_pages+0x290/0x1280 3) 3184 336 __folio_alloc+0x34/0x90 4) 2848 176 vma_alloc_folio+0xd8/0x540 5) 2672 272 __handle_mm_fault+0x700/0x1cc0 6) 2400 208 handle_mm_fault+0xf0/0x3f0 7) 2192 80 ___do_page_fault+0x3e4/0xbe0 8) 2112 160 do_page_fault+0x30/0xc0 9) 1952 256 data_access_common_virt+0x210/0x220 10) 1696 400 0xc00000000f16b100 11) 1296 384 load_elf_binary+0x804/0x1b80 12) 912 208 bprm_execve+0x2d8/0x7e0 13) 704 64 do_execveat_common+0x1d0/0x2f0 14) 640 160 sys_execve+0x54/0x70 15) 480 64 system_call_exception+0x138/0x350 16) 416 416 system_call_common+0x160/0x2c4 Fix this by having ftrace create a dummy stackframe for the function being traced. With this, backtraces now capture the function being traced: /sys/kernel/debug/tracing # cat stack_trace Depth Size Location (17 entries) ----- ---- -------- 0) 3888 32 _raw_spin_trylock+0x8/0x70 1) 3856 576 get_page_from_freelist+0x26c/0x1ad0 2) 3280 64 __alloc_pages+0x290/0x1280 3) 3216 336 __folio_alloc+0x34/0x90 4) 2880 176 vma_alloc_folio+0xd8/0x540 5) 2704 416 __handle_mm_fault+0x700/0x1cc0 6) 2288 96 handle_mm_fault+0xf0/0x3f0 7) 2192 48 ___do_page_fault+0x3e4/0xbe0 8) 2144 192 do_page_fault+0x30/0xc0 9) 1952 608 data_access_common_virt+0x210/0x220 10) 1344 16 0xc0000000334bbb50 11) 1328 416 load_elf_binary+0x804/0x1b80 12) 912 64 bprm_execve+0x2d8/0x7e0 13) 848 176 do_execveat_common+0x1d0/0x2f0 14) 672 192 sys_execve+0x54/0x70 15) 480 64 system_call_exception+0x138/0x350 16) 416 416 system_call_common+0x160/0x2c4 This results in two additional stores in the ftrace entry code, but produces reliable backtraces. Fixes: 153086644fd1 ("powerpc/ftrace: Add support for -mprofile-kernel ftrace ABI") Cc: stable@vger.kernel.org Signed-off-by: Naveen N Rao Signed-off-by: Michael Ellerman Link: https://msgid.link/20230621051349.759567-1-naveen@kernel.org --- arch/powerpc/kernel/trace/ftrace_mprofile.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_mprofile.S b/arch/powerpc/kernel/trace/ftrace_mprofile.S index ffb1db386849..1f7d86de1538 100644 --- a/arch/powerpc/kernel/trace/ftrace_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_mprofile.S @@ -33,6 +33,9 @@ * and then arrange for the ftrace function to be called. */ .macro ftrace_regs_entry allregs + /* Create a minimal stack frame for representing B */ + PPC_STLU r1, -STACK_FRAME_MIN_SIZE(r1) + /* Create our stack frame + pt_regs */ PPC_STLU r1,-SWITCH_FRAME_SIZE(r1) @@ -42,7 +45,7 @@ #ifdef CONFIG_PPC64 /* Save the original return address in A's stack frame */ - std r0, LRSAVE+SWITCH_FRAME_SIZE(r1) + std r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1) /* Ok to continue? */ lbz r3, PACA_FTRACE_ENABLED(r13) cmpdi r3, 0 @@ -77,6 +80,8 @@ mflr r7 /* Save it as pt_regs->nip */ PPC_STL r7, _NIP(r1) + /* Also save it in B's stackframe header for proper unwind */ + PPC_STL r7, LRSAVE+SWITCH_FRAME_SIZE(r1) /* Save the read LR in pt_regs->link */ PPC_STL r0, _LINK(r1) @@ -142,7 +147,7 @@ #endif /* Pop our stack frame */ - addi r1, r1, SWITCH_FRAME_SIZE + addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE #ifdef CONFIG_LIVEPATCH_64 /* Based on the cmpd above, if the NIP was altered handle livepatch */ From ee31742bf17636da1304af77b2cb1c29b5dda642 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 15 May 2023 09:21:37 +0200 Subject: [PATCH 056/230] drm/imx/ipuv3: Fix front porch adjustment upon hactive aligning When hactive is not aligned to 8 pixels, it is aligned accordingly and hfront porch needs to be reduced the same amount. Unfortunately the front porch is set to the difference rather than reducing it. There are some Samsung TVs which can't cope with a front porch of instead of 70. Fixes: 94dfec48fca7 ("drm/imx: Add 8 pixel alignment fix") Signed-off-by: Alexander Stein Reviewed-by: Philipp Zabel Link: https://lore.kernel.org/r/20230515072137.116211-1-alexander.stein@ew.tq-group.com [p.zabel@pengutronix.de: Fixed subject] Signed-off-by: Philipp Zabel Link: https://patchwork.freedesktop.org/patch/msgid/20230515072137.116211-1-alexander.stein@ew.tq-group.com --- drivers/gpu/drm/imx/ipuv3/ipuv3-crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/imx/ipuv3/ipuv3-crtc.c b/drivers/gpu/drm/imx/ipuv3/ipuv3-crtc.c index 5f26090b0c98..89585b31b985 100644 --- a/drivers/gpu/drm/imx/ipuv3/ipuv3-crtc.c +++ b/drivers/gpu/drm/imx/ipuv3/ipuv3-crtc.c @@ -310,7 +310,7 @@ static void ipu_crtc_mode_set_nofb(struct drm_crtc *crtc) dev_warn(ipu_crtc->dev, "8-pixel align hactive %d -> %d\n", sig_cfg.mode.hactive, new_hactive); - sig_cfg.mode.hfront_porch = new_hactive - sig_cfg.mode.hactive; + sig_cfg.mode.hfront_porch -= new_hactive - sig_cfg.mode.hactive; sig_cfg.mode.hactive = new_hactive; } From 74158a8cad79d2f5dcf71508993664c5cfcbfa3c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 28 Jul 2023 00:08:24 +0000 Subject: [PATCH 057/230] KVM: arm64: Skip instruction after emulating write to TCR_EL1 Whelp, this is embarrassing. Since commit 082fdfd13841 ("KVM: arm64: Prevent guests from enabling HA/HD on Ampere1") KVM traps writes to TCR_EL1 on AmpereOne to work around an erratum in the unadvertised HAFDBS implementation, preventing the guest from enabling the feature. Unfortunately, I failed virtualization 101 when working on that change, and forgot to advance PC after instruction emulation. Do the right thing and skip the MSR instruction after emulating the write. Fixes: 082fdfd13841 ("KVM: arm64: Prevent guests from enabling HA/HD on Ampere1") Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230728000824.3848025-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/hyp/switch.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 4bddb8541bec..34f222af6165 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -457,6 +457,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) */ val &= ~(TCR_HD | TCR_HA); write_sysreg_el1(val, SYS_TCR); + __kvm_skip_instr(vcpu); return true; } From 98ce8e4a9dcfb448b30a2d7a16190f4a00382377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georg=20M=C3=BCller?= Date: Fri, 28 Jul 2023 17:18:12 +0200 Subject: [PATCH 058/230] perf test uprobe_from_different_cu: Skip if there is no gcc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without gcc, the test will fail. On cleanup, ignore probe removal errors. Otherwise, in case of an error adding the probe, the temporary directory is not removed. Fixes: 56cbeacf14353057 ("perf probe: Add test for regression introduced by switch to die_get_decl_file()") Signed-off-by: Georg Müller Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Georg Müller Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230728151812.454806-2-georgmueller@gmx.net Link: https://lore.kernel.org/r/CAP-5=fUP6UuLgRty3t2=fQsQi3k4hDMz415vWdp1x88QMvZ8ug@mail.gmail.com/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_uprobe_from_different_cu.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh index 00d2e0e2e0c2..319f36ebb9a4 100755 --- a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh +++ b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh @@ -4,6 +4,12 @@ set -e +# skip if there's no gcc +if ! [ -x "$(command -v gcc)" ]; then + echo "failed: no gcc compiler" + exit 2 +fi + temp_dir=$(mktemp -d /tmp/perf-uprobe-different-cu-sh.XXXXXXXXXX) cleanup() @@ -11,7 +17,7 @@ cleanup() trap - EXIT TERM INT if [[ "${temp_dir}" =~ ^/tmp/perf-uprobe-different-cu-sh.*$ ]]; then echo "--- Cleaning up ---" - perf probe -x ${temp_dir}/testfile -d foo + perf probe -x ${temp_dir}/testfile -d foo || true rm -f "${temp_dir}/"* rmdir "${temp_dir}" fi From 0fcde5989e8a54b2a155d8bcea21a7f99abb50f9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 25 Jul 2023 22:19:38 -0700 Subject: [PATCH 059/230] cxl/memdev: Improve sanitize ABI descriptions Be more detailed about the CPU cache management situation. The same goes for both sanitize and secure erase. Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20230726051940.3570-2-dave@stgolabs.net Reviewed-by: Dave Jiang Signed-off-by: Vishal Verma --- Documentation/ABI/testing/sysfs-bus-cxl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 6350dd82b9a9..c4c4acb1f3b3 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -82,7 +82,11 @@ Description: whether it resides in persistent capacity, volatile capacity, or the LSA, is made permanently unavailable by whatever means is appropriate for the media type. This functionality requires - the device to be not be actively decoding any HPA ranges. + the device to be disabled, that is, not actively decoding any + HPA ranges. This permits avoiding explicit global CPU cache + management, relying instead for it to be done when a region + transitions between software programmed and hardware committed + states. What /sys/bus/cxl/devices/memX/security/erase @@ -92,7 +96,12 @@ Contact: linux-cxl@vger.kernel.org Description: (WO) Write a boolean 'true' string value to this attribute to secure erase user data by changing the media encryption keys for - all user data areas of the device. + all user data areas of the device. This functionality requires + the device to be disabled, that is, not actively decoding any + HPA ranges. This permits avoiding explicit global CPU cache + management, relying instead for it to be done when a region + transitions between software programmed and hardware committed + states. What: /sys/bus/cxl/devices/memX/firmware/ From 3de8cd2242419fb0adaee629d488acfd6cd93c92 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 25 Jul 2023 22:19:39 -0700 Subject: [PATCH 060/230] cxl/memdev: Document security state in kern-doc ... as is the case with all members of struct cxl_memdev_state. Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20230726051940.3570-3-dave@stgolabs.net Reviewed-by: Dave Jiang Signed-off-by: Vishal Verma --- drivers/cxl/cxlmem.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 499113328586..f3aca828fbec 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -434,6 +434,7 @@ struct cxl_dev_state { * @next_persistent_bytes: persistent capacity change pending device reset * @event: event log driver state * @poison: poison driver state info + * @security: security driver state info * @fw: firmware upload / activation state * @mbox_send: @dev specific transport for transmitting mailbox commands * From ad64f5952ce3ea565c7f76ec37ab41df0dde773a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 25 Jul 2023 22:19:40 -0700 Subject: [PATCH 061/230] cxl/memdev: Only show sanitize sysfs files when supported If the device does not support Sanitize or Secure Erase commands, hide the respective sysfs interfaces such that the operation can never be attempted. In order to be generic, keep track of the enabled security commands found in the CEL - the driver does not support Security Passthrough. Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20230726051940.3570-4-dave@stgolabs.net Reviewed-by: Dave Jiang Signed-off-by: Vishal Verma --- Documentation/ABI/testing/sysfs-bus-cxl | 6 ++-- drivers/cxl/core/mbox.c | 45 ++++++++++++++++++++++++- drivers/cxl/core/memdev.c | 19 +++++++++++ drivers/cxl/cxlmem.h | 15 +++++++++ 4 files changed, 82 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index c4c4acb1f3b3..087f762ebfd5 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -86,7 +86,8 @@ Description: HPA ranges. This permits avoiding explicit global CPU cache management, relying instead for it to be done when a region transitions between software programmed and hardware committed - states. + states. If this file is not present, then there is no hardware + support for the operation. What /sys/bus/cxl/devices/memX/security/erase @@ -101,7 +102,8 @@ Description: HPA ranges. This permits avoiding explicit global CPU cache management, relying instead for it to be done when a region transitions between software programmed and hardware committed - states. + states. If this file is not present, then there is no hardware + support for the operation. What: /sys/bus/cxl/devices/memX/firmware/ diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index d6d067fbee97..ca60bb8114f2 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -121,6 +121,45 @@ static bool cxl_is_security_command(u16 opcode) return false; } +static void cxl_set_security_cmd_enabled(struct cxl_security_state *security, + u16 opcode) +{ + switch (opcode) { + case CXL_MBOX_OP_SANITIZE: + set_bit(CXL_SEC_ENABLED_SANITIZE, security->enabled_cmds); + break; + case CXL_MBOX_OP_SECURE_ERASE: + set_bit(CXL_SEC_ENABLED_SECURE_ERASE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_GET_SECURITY_STATE: + set_bit(CXL_SEC_ENABLED_GET_SECURITY_STATE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_SET_PASSPHRASE: + set_bit(CXL_SEC_ENABLED_SET_PASSPHRASE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_DISABLE_PASSPHRASE: + set_bit(CXL_SEC_ENABLED_DISABLE_PASSPHRASE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_UNLOCK: + set_bit(CXL_SEC_ENABLED_UNLOCK, security->enabled_cmds); + break; + case CXL_MBOX_OP_FREEZE_SECURITY: + set_bit(CXL_SEC_ENABLED_FREEZE_SECURITY, + security->enabled_cmds); + break; + case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE: + set_bit(CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE, + security->enabled_cmds); + break; + default: + break; + } +} + static bool cxl_is_poison_command(u16 opcode) { #define CXL_MBOX_OP_POISON_CMDS 0x43 @@ -677,7 +716,8 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel) u16 opcode = le16_to_cpu(cel_entry[i].opcode); struct cxl_mem_command *cmd = cxl_mem_find_command(opcode); - if (!cmd && !cxl_is_poison_command(opcode)) { + if (!cmd && (!cxl_is_poison_command(opcode) || + !cxl_is_security_command(opcode))) { dev_dbg(dev, "Opcode 0x%04x unsupported by driver\n", opcode); continue; @@ -689,6 +729,9 @@ static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel) if (cxl_is_poison_command(opcode)) cxl_set_poison_cmd_enabled(&mds->poison, opcode); + if (cxl_is_security_command(opcode)) + cxl_set_security_cmd_enabled(&mds->security, opcode); + dev_dbg(dev, "Opcode 0x%04x enabled\n", opcode); } } diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index f99e7ec3cc40..14b547c07f54 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -477,9 +477,28 @@ static struct attribute_group cxl_memdev_pmem_attribute_group = { .attrs = cxl_memdev_pmem_attributes, }; +static umode_t cxl_memdev_security_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + + if (a == &dev_attr_security_sanitize.attr && + !test_bit(CXL_SEC_ENABLED_SANITIZE, mds->security.enabled_cmds)) + return 0; + + if (a == &dev_attr_security_erase.attr && + !test_bit(CXL_SEC_ENABLED_SECURE_ERASE, mds->security.enabled_cmds)) + return 0; + + return a->mode; +} + static struct attribute_group cxl_memdev_security_attribute_group = { .name = "security", .attrs = cxl_memdev_security_attributes, + .is_visible = cxl_memdev_security_visible, }; static const struct attribute_group *cxl_memdev_attribute_groups[] = { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index f3aca828fbec..706f8a6d1ef4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -244,6 +244,19 @@ enum poison_cmd_enabled_bits { CXL_POISON_ENABLED_MAX }; +/* Device enabled security commands */ +enum security_cmd_enabled_bits { + CXL_SEC_ENABLED_SANITIZE, + CXL_SEC_ENABLED_SECURE_ERASE, + CXL_SEC_ENABLED_GET_SECURITY_STATE, + CXL_SEC_ENABLED_SET_PASSPHRASE, + CXL_SEC_ENABLED_DISABLE_PASSPHRASE, + CXL_SEC_ENABLED_UNLOCK, + CXL_SEC_ENABLED_FREEZE_SECURITY, + CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE, + CXL_SEC_ENABLED_MAX +}; + /** * struct cxl_poison_state - Driver poison state info * @@ -346,6 +359,7 @@ struct cxl_fw_state { * struct cxl_security_state - Device security state * * @state: state of last security operation + * @enabled_cmds: All security commands enabled in the CEL * @poll: polling for sanitization is enabled, device has no mbox irq support * @poll_tmo_secs: polling timeout * @poll_dwork: polling work item @@ -353,6 +367,7 @@ struct cxl_fw_state { */ struct cxl_security_state { unsigned long state; + DECLARE_BITMAP(enabled_cmds, CXL_SEC_ENABLED_MAX); bool poll; int poll_tmo_secs; struct delayed_work poll_dwork; From 238ec850b95a02dcdff3edc86781aa913549282f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 28 Jul 2023 17:28:43 -0500 Subject: [PATCH 062/230] x86/srso: Fix return thunks in generated code Set X86_FEATURE_RETHUNK when enabling the SRSO mitigation so that generated code (e.g., ftrace, static call, eBPF) generates "jmp __x86_return_thunk" instead of RET. [ bp: Add a comment. ] Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/alternative.c | 4 +--- arch/x86/kernel/cpu/bugs.c | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 920a8ca7a8f8..2dcf3a06af09 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -707,9 +707,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK) || - cpu_feature_enabled(X86_FEATURE_SRSO) || - cpu_feature_enabled(X86_FEATURE_SRSO_ALIAS)) { + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d4109eb5eb2e..7314a6bdc862 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2297,6 +2297,12 @@ static void __init srso_select_mitigation(void) case SRSO_CMD_SAFE_RET: if (IS_ENABLED(CONFIG_CPU_SRSO)) { + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + if (boot_cpu_data.x86 == 0x19) setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); else From 3bbbe97ad83db8d9df06daf027b0840188de625d Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 28 Jul 2023 23:03:22 +0200 Subject: [PATCH 063/230] x86/srso: Add a forgotten NOENDBR annotation Fix: vmlinux.o: warning: objtool: .export_symbol+0x29e40: data relocation to !ENDBR: srso_untrain_ret_alias+0x0 Reported-by: Linus Torvalds Signed-off-by: Borislav Petkov (AMD) --- arch/x86/lib/retpoline.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 845cfb0d748f..2cff585f22f2 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -151,6 +151,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) .section .text.__x86.rethunk_untrain SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE) + ANNOTATE_NOENDBR ASM_NOP2 lfence jmp __x86_return_thunk From 46d14e17095237007b59f56aae2d81ae2dcb0f93 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 31 Jul 2023 19:20:33 +0800 Subject: [PATCH 064/230] drm/i915/gvt: Fix bug in getting msg length in AUX CH registers handler Msg length should be obtained from value written to AUX_CH_CTL register rather than from enum type of the register. Commit 0cad796a2269 ("drm/i915: Use REG_BIT() & co. for AUX CH registers") incorrectly calculates the msg_length from reg type and yields below warning in intel_gvt_i2c_handle_aux_ch_write(): "i915 0000:00:02.0: drm_WARN_ON(msg_length != 4)". Fixes: 0cad796a2269 ("drm/i915: Use REG_BIT() & co. for AUX CH registers") Signed-off-by: Yan Zhao Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20230731112033.7275-1-yan.y.zhao@intel.com Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/edid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/edid.c b/drivers/gpu/drm/i915/gvt/edid.c index 2a0438f12a14..af9afdb53c7f 100644 --- a/drivers/gpu/drm/i915/gvt/edid.c +++ b/drivers/gpu/drm/i915/gvt/edid.c @@ -491,7 +491,7 @@ void intel_gvt_i2c_handle_aux_ch_write(struct intel_vgpu *vgpu, return; } - msg_length = REG_FIELD_GET(DP_AUX_CH_CTL_MESSAGE_SIZE_MASK, reg); + msg_length = REG_FIELD_GET(DP_AUX_CH_CTL_MESSAGE_SIZE_MASK, value); // check the msg in DATA register. msg = vgpu_vreg(vgpu, offset + 4); From 34bc65d6d831200194c0b9a30fb8fe47faaf83d6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 6 Jul 2023 11:37:04 -0700 Subject: [PATCH 065/230] perf pmus: Create placholder regardless of scanning core_only If scanning all PMUs the placeholder is still necessary if no core PMU is found. This situation occurs in perf test's parse-events test, when uncore events appear before core. Fixes: 628eaa4e877af823 ("perf pmus: Add placeholder core PMU") Signed-off-by: Ian Rogers Tested-by: Thomas Richter Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Link: https://lore.kernel.org/r/20230706183705.601412-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmus.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c index 3cd9de42139e..c58ba9fb6a36 100644 --- a/tools/perf/util/pmus.c +++ b/tools/perf/util/pmus.c @@ -152,16 +152,14 @@ static void pmu_read_sysfs(bool core_only) } closedir(dir); - if (core_only) { - if (!list_empty(&core_pmus)) - read_sysfs_core_pmus = true; - else { - if (perf_pmu__create_placeholder_core_pmu(&core_pmus)) - read_sysfs_core_pmus = true; - } - } else { + if (list_empty(&core_pmus)) { + if (!perf_pmu__create_placeholder_core_pmu(&core_pmus)) + pr_err("Failure to set up any core PMUs\n"); + } + if (!list_empty(&core_pmus)) { read_sysfs_core_pmus = true; - read_sysfs_all_pmus = true; + if (!core_only) + read_sysfs_all_pmus = true; } } From 07d2b820fd75b96f550c93503f19c8cfcbc577cf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 6 Jul 2023 11:37:05 -0700 Subject: [PATCH 066/230] perf test parse-events: Test complex name has required event format test__checkevent_complex_name will use an "event" format which if not present, such as with a placeholder PMU, will cause test failures. Skip the test in this case to avoid failures in restricted environments. Add perf_pmu__has_format utility as a general PMU utility. Fixes: 628eaa4e877af823 ("perf pmus: Add placeholder core PMU") Signed-off-by: Ian Rogers Tested-by: Thomas Richter Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Link: https://lore.kernel.org/r/20230706183705.601412-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 12 +++++++++++- tools/perf/util/pmu.c | 11 +++++++++++ tools/perf/util/pmu.h | 1 + 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index b2f82847e4c3..658fb9599d95 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1631,6 +1631,16 @@ static bool test__pmu_cpu_valid(void) return !!perf_pmus__find("cpu"); } +static bool test__pmu_cpu_event_valid(void) +{ + struct perf_pmu *pmu = perf_pmus__find("cpu"); + + if (!pmu) + return false; + + return perf_pmu__has_format(pmu, "event"); +} + static bool test__intel_pt_valid(void) { return !!perf_pmus__find("intel_pt"); @@ -2179,7 +2189,7 @@ static const struct evlist_test test__events_pmu[] = { }, { .name = "cpu/name='COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks',period=0x1,event=0x2/ukp", - .valid = test__pmu_cpu_valid, + .valid = test__pmu_cpu_event_valid, .check = test__checkevent_complex_name, /* 3 */ }, diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 7f984a7f16ca..28380e7aa8d0 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1440,6 +1440,17 @@ void perf_pmu__del_formats(struct list_head *formats) } } +bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name) +{ + struct perf_pmu_format *format; + + list_for_each_entry(format, &pmu->format, list) { + if (!strcmp(format->name, name)) + return true; + } + return false; +} + bool is_pmu_core(const char *name) { return !strcmp(name, "cpu") || !strcmp(name, "cpum_cf") || is_sysfs_pmu_core(name); diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 203b92860e3c..6b414cecbad2 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -234,6 +234,7 @@ int perf_pmu__new_format(struct list_head *list, char *name, void perf_pmu__set_format(unsigned long *bits, long from, long to); int perf_pmu__format_parse(int dirfd, struct list_head *head); void perf_pmu__del_formats(struct list_head *formats); +bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name); bool is_pmu_core(const char *name); bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu); From d14560ac1b595aa2e792365e91fea6aeaee66c2b Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Tue, 25 Jul 2023 02:19:44 +0200 Subject: [PATCH 067/230] drm/i915/gt: Cleanup aux invalidation registers Fix the 'NV' definition postfix that is supposed to be INV. Take the chance to also order properly the registers based on their address and call the GEN12_GFX_CCS_AUX_INV address as GEN12_CCS_AUX_INV like all the other similar registers. Remove also VD1, VD3 and VE1 registers that don't exist and add BCS0 and CCS0. Signed-off-by: Andi Shyti Cc: # v5.8+ Reviewed-by: Nirmoy Das Reviewed-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-2-andi.shyti@linux.intel.com (cherry picked from commit 2f0b927d3ca3440445975ebde27f3df1c3ed6f76) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 8 ++++---- drivers/gpu/drm/i915/gt/intel_gt_regs.h | 16 ++++++++-------- drivers/gpu/drm/i915/gt/intel_lrc.c | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 23857cc08eca..563efee05560 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -287,8 +287,8 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) if (!HAS_FLAT_CCS(rq->engine->i915)) { /* hsdes: 1809175790 */ - cs = gen12_emit_aux_table_inv(rq->engine->gt, - cs, GEN12_GFX_CCS_AUX_NV); + cs = gen12_emit_aux_table_inv(rq->engine->gt, cs, + GEN12_CCS_AUX_INV); } *cs++ = preparser_disable(false); @@ -348,10 +348,10 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode) if (aux_inv) { /* hsdes: 1809175790 */ if (rq->engine->class == VIDEO_DECODE_CLASS) cs = gen12_emit_aux_table_inv(rq->engine->gt, - cs, GEN12_VD0_AUX_NV); + cs, GEN12_VD0_AUX_INV); else cs = gen12_emit_aux_table_inv(rq->engine->gt, - cs, GEN12_VE0_AUX_NV); + cs, GEN12_VE0_AUX_INV); } if (mode & EMIT_INVALIDATE) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h index 718cb2c80f79..2cdfb2f713d0 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h @@ -332,9 +332,11 @@ #define GEN8_PRIVATE_PAT_HI _MMIO(0x40e0 + 4) #define GEN10_PAT_INDEX(index) _MMIO(0x40e0 + (index) * 4) #define BSD_HWS_PGA_GEN7 _MMIO(0x4180) -#define GEN12_GFX_CCS_AUX_NV _MMIO(0x4208) -#define GEN12_VD0_AUX_NV _MMIO(0x4218) -#define GEN12_VD1_AUX_NV _MMIO(0x4228) + +#define GEN12_CCS_AUX_INV _MMIO(0x4208) +#define GEN12_VD0_AUX_INV _MMIO(0x4218) +#define GEN12_VE0_AUX_INV _MMIO(0x4238) +#define GEN12_BCS0_AUX_INV _MMIO(0x4248) #define GEN8_RTCR _MMIO(0x4260) #define GEN8_M1TCR _MMIO(0x4264) @@ -342,14 +344,12 @@ #define GEN8_BTCR _MMIO(0x426c) #define GEN8_VTCR _MMIO(0x4270) -#define GEN12_VD2_AUX_NV _MMIO(0x4298) -#define GEN12_VD3_AUX_NV _MMIO(0x42a8) -#define GEN12_VE0_AUX_NV _MMIO(0x4238) - #define BLT_HWS_PGA_GEN7 _MMIO(0x4280) -#define GEN12_VE1_AUX_NV _MMIO(0x42b8) +#define GEN12_VD2_AUX_INV _MMIO(0x4298) +#define GEN12_CCS0_AUX_INV _MMIO(0x42c8) #define AUX_INV REG_BIT(0) + #define VEBOX_HWS_PGA_GEN7 _MMIO(0x4380) #define GEN12_AUX_ERR_DBG _MMIO(0x43f4) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index a4ec20aaafe2..325f3dbfb90e 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1367,7 +1367,7 @@ gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) /* hsdes: 1809175790 */ if (!HAS_FLAT_CCS(ce->engine->i915)) cs = gen12_emit_aux_table_inv(ce->engine->gt, - cs, GEN12_GFX_CCS_AUX_NV); + cs, GEN12_CCS_AUX_INV); /* Wa_16014892111 */ if (IS_MTL_GRAPHICS_STEP(ce->engine->i915, M, STEP_A0, STEP_B0) || @@ -1396,10 +1396,10 @@ gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) if (!HAS_FLAT_CCS(ce->engine->i915)) { if (ce->engine->class == VIDEO_DECODE_CLASS) cs = gen12_emit_aux_table_inv(ce->engine->gt, - cs, GEN12_VD0_AUX_NV); + cs, GEN12_VD0_AUX_INV); else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS) cs = gen12_emit_aux_table_inv(ce->engine->gt, - cs, GEN12_VE0_AUX_NV); + cs, GEN12_VE0_AUX_INV); } return cs; From b2f59e9026038a5bbcbc0019fa58f963138211ee Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Tue, 25 Jul 2023 02:19:45 +0200 Subject: [PATCH 068/230] drm/i915: Add the gen12_needs_ccs_aux_inv helper We always assumed that a device might either have AUX or FLAT CCS, but this is an approximation that is not always true, e.g. PVC represents an exception. Set the basis for future finer selection by implementing a boolean gen12_needs_ccs_aux_inv() function that tells whether aux invalidation is needed or not. Currently PVC is the only exception to the above mentioned rule. Requires: 059ae7ae2a1c ("drm/i915/gt: Cleanup aux invalidation registers") Signed-off-by: Andi Shyti Cc: Matt Roper Cc: Jonathan Cavitt Cc: # v5.8+ Reviewed-by: Matt Roper Reviewed-by: Andrzej Hajda Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-3-andi.shyti@linux.intel.com (cherry picked from commit c827655b87ad201ebe36f2e28d16b5491c8f7801) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 563efee05560..460c9225a50f 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -165,6 +165,18 @@ static u32 preparser_disable(bool state) return MI_ARB_CHECK | 1 << 8 | state; } +static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine) +{ + if (IS_PONTEVECCHIO(engine->i915)) + return false; + + /* + * so far platforms supported by i915 having + * flat ccs do not require AUX invalidation + */ + return !HAS_FLAT_CCS(engine->i915); +} + u32 *gen12_emit_aux_table_inv(struct intel_gt *gt, u32 *cs, const i915_reg_t inv_reg) { u32 gsi_offset = gt->uncore->gsi_offset; @@ -267,7 +279,7 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) else if (engine->class == COMPUTE_CLASS) flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS; - if (!HAS_FLAT_CCS(rq->engine->i915)) + if (gen12_needs_ccs_aux_inv(rq->engine)) count = 8 + 4; else count = 8; @@ -285,7 +297,7 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); - if (!HAS_FLAT_CCS(rq->engine->i915)) { + if (gen12_needs_ccs_aux_inv(rq->engine)) { /* hsdes: 1809175790 */ cs = gen12_emit_aux_table_inv(rq->engine->gt, cs, GEN12_CCS_AUX_INV); @@ -307,7 +319,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode) if (mode & EMIT_INVALIDATE) { cmd += 2; - if (!HAS_FLAT_CCS(rq->engine->i915) && + if (gen12_needs_ccs_aux_inv(rq->engine) && (rq->engine->class == VIDEO_DECODE_CLASS || rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) { aux_inv = rq->engine->mask & From 78a6ccd65fa3a7cc697810db079cc4b84dff03d5 Mon Sep 17 00:00:00 2001 From: Jonathan Cavitt Date: Tue, 25 Jul 2023 02:19:46 +0200 Subject: [PATCH 069/230] drm/i915/gt: Ensure memory quiesced before invalidation All memory traffic must be quiesced before requesting an aux invalidation on platforms that use Aux CCS. Fixes: 972282c4cf24 ("drm/i915/gen12: Add aux table invalidate for all engines") Requires: a2a4aa0eef3b ("drm/i915: Add the gen12_needs_ccs_aux_inv helper") Signed-off-by: Jonathan Cavitt Signed-off-by: Andi Shyti Cc: # v5.8+ Reviewed-by: Nirmoy Das Reviewed-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-4-andi.shyti@linux.intel.com (cherry picked from commit ad8ebf12217e451cd19804b1c3e97ad56491c74a) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 460c9225a50f..6210b38a2d38 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -214,7 +214,11 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) { struct intel_engine_cs *engine = rq->engine; - if (mode & EMIT_FLUSH) { + /* + * On Aux CCS platforms the invalidation of the Aux + * table requires quiescing memory traffic beforehand + */ + if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) { u32 flags = 0; int err; u32 *cs; From 592b228f12e15867a63e3a6eeeb54c5c12662a62 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Tue, 25 Jul 2023 02:19:47 +0200 Subject: [PATCH 070/230] drm/i915/gt: Rename flags with bit_group_X according to the datasheet In preparation of the next patch align with the datasheet (BSPEC 47112) with the naming of the pipe control set of flag values. The variable "flags" in gen12_emit_flush_rcs() is applied as a set of flags called Bit Group 1. Define also the Bit Group 0 as bit_group_0 where currently only PIPE_CONTROL0_HDC_PIPELINE_FLUSH bit is set. Signed-off-by: Andi Shyti Cc: # v5.8+ Reviewed-by: Matt Roper Reviewed-by: Andrzej Hajda Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-5-andi.shyti@linux.intel.com (cherry picked from commit f2dcd21d5a22e13f2fbfe7ab65149038b93cf2ff) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 34 +++++++++++++----------- drivers/gpu/drm/i915/gt/gen8_engine_cs.h | 18 ++++++++----- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 6210b38a2d38..5d2175e918dd 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -219,7 +219,8 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) * table requires quiescing memory traffic beforehand */ if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) { - u32 flags = 0; + u32 bit_group_0 = 0; + u32 bit_group_1 = 0; int err; u32 *cs; @@ -227,32 +228,33 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) if (err) return err; - flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; - flags |= PIPE_CONTROL_FLUSH_L3; - flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; - flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH; + + bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH; + bit_group_1 |= PIPE_CONTROL_FLUSH_L3; + bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + bit_group_1 |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; /* Wa_1409600907:tgl,adl-p */ - flags |= PIPE_CONTROL_DEPTH_STALL; - flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; - flags |= PIPE_CONTROL_FLUSH_ENABLE; + bit_group_1 |= PIPE_CONTROL_DEPTH_STALL; + bit_group_1 |= PIPE_CONTROL_DC_FLUSH_ENABLE; + bit_group_1 |= PIPE_CONTROL_FLUSH_ENABLE; - flags |= PIPE_CONTROL_STORE_DATA_INDEX; - flags |= PIPE_CONTROL_QW_WRITE; + bit_group_1 |= PIPE_CONTROL_STORE_DATA_INDEX; + bit_group_1 |= PIPE_CONTROL_QW_WRITE; - flags |= PIPE_CONTROL_CS_STALL; + bit_group_1 |= PIPE_CONTROL_CS_STALL; if (!HAS_3D_PIPELINE(engine->i915)) - flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS; + bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS; else if (engine->class == COMPUTE_CLASS) - flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS; + bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS; cs = intel_ring_begin(rq, 6); if (IS_ERR(cs)) return PTR_ERR(cs); - cs = gen12_emit_pipe_control(cs, - PIPE_CONTROL0_HDC_PIPELINE_FLUSH, - flags, LRC_PPHWSP_SCRATCH_ADDR); + cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1, + LRC_PPHWSP_SCRATCH_ADDR); intel_ring_advance(rq, cs); } diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h index 655e5c00ddc2..a44eda096557 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h @@ -49,25 +49,29 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs); u32 *gen12_emit_aux_table_inv(struct intel_gt *gt, u32 *cs, const i915_reg_t inv_reg); static inline u32 * -__gen8_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u32 offset) +__gen8_emit_pipe_control(u32 *batch, u32 bit_group_0, + u32 bit_group_1, u32 offset) { memset(batch, 0, 6 * sizeof(u32)); - batch[0] = GFX_OP_PIPE_CONTROL(6) | flags0; - batch[1] = flags1; + batch[0] = GFX_OP_PIPE_CONTROL(6) | bit_group_0; + batch[1] = bit_group_1; batch[2] = offset; return batch + 6; } -static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset) +static inline u32 *gen8_emit_pipe_control(u32 *batch, + u32 bit_group_1, u32 offset) { - return __gen8_emit_pipe_control(batch, 0, flags, offset); + return __gen8_emit_pipe_control(batch, 0, bit_group_1, offset); } -static inline u32 *gen12_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u32 offset) +static inline u32 *gen12_emit_pipe_control(u32 *batch, u32 bit_group_0, + u32 bit_group_1, u32 offset) { - return __gen8_emit_pipe_control(batch, flags0, flags1, offset); + return __gen8_emit_pipe_control(batch, bit_group_0, + bit_group_1, offset); } static inline u32 * From 824df77ab2107d8d4740b834b276681a41ae1ac8 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Tue, 25 Jul 2023 02:19:48 +0200 Subject: [PATCH 071/230] drm/i915/gt: Enable the CCS_FLUSH bit in the pipe control and in the CS Enable the CCS_FLUSH bit 13 in the control pipe for render and compute engines in platforms starting from Meteor Lake (BSPEC 43904 and 47112). For the copy engine add MI_FLUSH_DW_CCS (bit 16) in the command streamer. Fixes: 972282c4cf24 ("drm/i915/gen12: Add aux table invalidate for all engines") Requires: 8da173db894a ("drm/i915/gt: Rename flags with bit_group_X according to the datasheet") Signed-off-by: Andi Shyti Cc: Jonathan Cavitt Cc: Nirmoy Das Cc: # v5.8+ Reviewed-by: Matt Roper Reviewed-by: Andrzej Hajda Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-6-andi.shyti@linux.intel.com (cherry picked from commit b70df82b428774875c7c56d3808102165891547c) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 11 +++++++++++ drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 1 + 2 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 5d2175e918dd..ec54d36eaef7 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -230,6 +230,13 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH; + /* + * When required, in MTL and beyond platforms we + * need to set the CCS_FLUSH bit in the pipe control + */ + if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70)) + bit_group_0 |= PIPE_CONTROL_CCS_FLUSH; + bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH; bit_group_1 |= PIPE_CONTROL_FLUSH_L3; bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; @@ -356,6 +363,10 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode) cmd |= MI_INVALIDATE_TLB; if (rq->engine->class == VIDEO_DECODE_CLASS) cmd |= MI_INVALIDATE_BSD; + + if (gen12_needs_ccs_aux_inv(rq->engine) && + rq->engine->class == COPY_ENGINE_CLASS) + cmd |= MI_FLUSH_DW_CCS; } *cs++ = cmd; diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h index 5d143e2a8db0..5df7cce23197 100644 --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -299,6 +299,7 @@ #define PIPE_CONTROL_QW_WRITE (1<<14) #define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14) #define PIPE_CONTROL_DEPTH_STALL (1<<13) +#define PIPE_CONTROL_CCS_FLUSH (1<<13) /* MTL+ */ #define PIPE_CONTROL_WRITE_FLUSH (1<<12) #define PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH (1<<12) /* gen6+ */ #define PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE (1<<11) /* MBZ on ILK */ From 0fde2f23516a00fd90dfb980b66b4665fcbfa659 Mon Sep 17 00:00:00 2001 From: Jonathan Cavitt Date: Tue, 25 Jul 2023 02:19:49 +0200 Subject: [PATCH 072/230] drm/i915/gt: Poll aux invalidation register bit on invalidation For platforms that use Aux CCS, wait for aux invalidation to complete by checking the aux invalidation register bit is cleared. Fixes: 972282c4cf24 ("drm/i915/gen12: Add aux table invalidate for all engines") Signed-off-by: Jonathan Cavitt Signed-off-by: Andi Shyti Cc: # v5.8+ Reviewed-by: Nirmoy Das Reviewed-by: Andrzej Hajda Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-7-andi.shyti@linux.intel.com (cherry picked from commit d459c86f00aa98028d155a012c65dc42f7c37e76) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 17 ++++++++++++----- drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index ec54d36eaef7..ec7a0ddf9e12 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -184,7 +184,15 @@ u32 *gen12_emit_aux_table_inv(struct intel_gt *gt, u32 *cs, const i915_reg_t inv *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN; *cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset; *cs++ = AUX_INV; - *cs++ = MI_NOOP; + + *cs++ = MI_SEMAPHORE_WAIT_TOKEN | + MI_SEMAPHORE_REGISTER_POLL | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD; + *cs++ = 0; + *cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset; + *cs++ = 0; + *cs++ = 0; return cs; } @@ -292,10 +300,9 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) else if (engine->class == COMPUTE_CLASS) flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS; + count = 8; if (gen12_needs_ccs_aux_inv(rq->engine)) - count = 8 + 4; - else - count = 8; + count += 8; cs = intel_ring_begin(rq, count); if (IS_ERR(cs)) @@ -338,7 +345,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode) aux_inv = rq->engine->mask & ~GENMASK(_BCS(I915_MAX_BCS - 1), BCS0); if (aux_inv) - cmd += 4; + cmd += 8; } } diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h index 5df7cce23197..2bd8d98d2110 100644 --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -121,6 +121,7 @@ #define MI_SEMAPHORE_TARGET(engine) ((engine)<<15) #define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */ #define MI_SEMAPHORE_WAIT_TOKEN MI_INSTR(0x1c, 3) /* GEN12+ */ +#define MI_SEMAPHORE_REGISTER_POLL (1 << 16) #define MI_SEMAPHORE_POLL (1 << 15) #define MI_SEMAPHORE_SAD_GT_SDD (0 << 12) #define MI_SEMAPHORE_SAD_GTE_SDD (1 << 12) From 6a35f22d222528e1b157c6978c9424d2f8cbe0a1 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Tue, 25 Jul 2023 02:19:50 +0200 Subject: [PATCH 073/230] drm/i915/gt: Support aux invalidation on all engines Perform some refactoring with the purpose of keeping in one single place all the operations around the aux table invalidation. With this refactoring add more engines where the invalidation should be performed. Fixes: 972282c4cf24 ("drm/i915/gen12: Add aux table invalidate for all engines") Signed-off-by: Andi Shyti Cc: Jonathan Cavitt Cc: Matt Roper Cc: # v5.8+ Reviewed-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/20230725001950.1014671-8-andi.shyti@linux.intel.com (cherry picked from commit 76ff7789d6e63d1a10b3b58f5c70b2e640c7a880) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 66 +++++++++++++----------- drivers/gpu/drm/i915/gt/gen8_engine_cs.h | 3 +- drivers/gpu/drm/i915/gt/intel_lrc.c | 17 +----- 3 files changed, 41 insertions(+), 45 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index ec7a0ddf9e12..2702ad4c26c8 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -165,21 +165,47 @@ static u32 preparser_disable(bool state) return MI_ARB_CHECK | 1 << 8 | state; } +static i915_reg_t gen12_get_aux_inv_reg(struct intel_engine_cs *engine) +{ + switch (engine->id) { + case RCS0: + return GEN12_CCS_AUX_INV; + case BCS0: + return GEN12_BCS0_AUX_INV; + case VCS0: + return GEN12_VD0_AUX_INV; + case VCS2: + return GEN12_VD2_AUX_INV; + case VECS0: + return GEN12_VE0_AUX_INV; + case CCS0: + return GEN12_CCS0_AUX_INV; + default: + return INVALID_MMIO_REG; + } +} + static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine) { + i915_reg_t reg = gen12_get_aux_inv_reg(engine); + if (IS_PONTEVECCHIO(engine->i915)) return false; /* - * so far platforms supported by i915 having - * flat ccs do not require AUX invalidation + * So far platforms supported by i915 having flat ccs do not require + * AUX invalidation. Check also whether the engine requires it. */ - return !HAS_FLAT_CCS(engine->i915); + return i915_mmio_reg_valid(reg) && !HAS_FLAT_CCS(engine->i915); } -u32 *gen12_emit_aux_table_inv(struct intel_gt *gt, u32 *cs, const i915_reg_t inv_reg) +u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs) { - u32 gsi_offset = gt->uncore->gsi_offset; + i915_reg_t inv_reg = gen12_get_aux_inv_reg(engine); + u32 gsi_offset = engine->gt->uncore->gsi_offset; + + if (!gen12_needs_ccs_aux_inv(engine)) + return cs; *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN; *cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset; @@ -317,11 +343,7 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); - if (gen12_needs_ccs_aux_inv(rq->engine)) { - /* hsdes: 1809175790 */ - cs = gen12_emit_aux_table_inv(rq->engine->gt, cs, - GEN12_CCS_AUX_INV); - } + cs = gen12_emit_aux_table_inv(engine, cs); *cs++ = preparser_disable(false); intel_ring_advance(rq, cs); @@ -332,21 +354,14 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode) int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode) { - intel_engine_mask_t aux_inv = 0; - u32 cmd, *cs; + u32 cmd = 4; + u32 *cs; - cmd = 4; if (mode & EMIT_INVALIDATE) { cmd += 2; - if (gen12_needs_ccs_aux_inv(rq->engine) && - (rq->engine->class == VIDEO_DECODE_CLASS || - rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) { - aux_inv = rq->engine->mask & - ~GENMASK(_BCS(I915_MAX_BCS - 1), BCS0); - if (aux_inv) - cmd += 8; - } + if (gen12_needs_ccs_aux_inv(rq->engine)) + cmd += 8; } cs = intel_ring_begin(rq, cmd); @@ -381,14 +396,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode) *cs++ = 0; /* upper addr */ *cs++ = 0; /* value */ - if (aux_inv) { /* hsdes: 1809175790 */ - if (rq->engine->class == VIDEO_DECODE_CLASS) - cs = gen12_emit_aux_table_inv(rq->engine->gt, - cs, GEN12_VD0_AUX_INV); - else - cs = gen12_emit_aux_table_inv(rq->engine->gt, - cs, GEN12_VE0_AUX_INV); - } + cs = gen12_emit_aux_table_inv(rq->engine, cs); if (mode & EMIT_INVALIDATE) *cs++ = preparser_disable(false); diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h index a44eda096557..867ba697aceb 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h @@ -13,6 +13,7 @@ #include "intel_gt_regs.h" #include "intel_gpu_commands.h" +struct intel_engine_cs; struct intel_gt; struct i915_request; @@ -46,7 +47,7 @@ u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs); u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs); u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs); -u32 *gen12_emit_aux_table_inv(struct intel_gt *gt, u32 *cs, const i915_reg_t inv_reg); +u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs); static inline u32 * __gen8_emit_pipe_control(u32 *batch, u32 bit_group_0, diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 325f3dbfb90e..9477c2422321 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1364,10 +1364,7 @@ gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) IS_DG2_G11(ce->engine->i915)) cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); - /* hsdes: 1809175790 */ - if (!HAS_FLAT_CCS(ce->engine->i915)) - cs = gen12_emit_aux_table_inv(ce->engine->gt, - cs, GEN12_CCS_AUX_INV); + cs = gen12_emit_aux_table_inv(ce->engine, cs); /* Wa_16014892111 */ if (IS_MTL_GRAPHICS_STEP(ce->engine->i915, M, STEP_A0, STEP_B0) || @@ -1392,17 +1389,7 @@ gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); - /* hsdes: 1809175790 */ - if (!HAS_FLAT_CCS(ce->engine->i915)) { - if (ce->engine->class == VIDEO_DECODE_CLASS) - cs = gen12_emit_aux_table_inv(ce->engine->gt, - cs, GEN12_VD0_AUX_INV); - else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS) - cs = gen12_emit_aux_table_inv(ce->engine->gt, - cs, GEN12_VE0_AUX_INV); - } - - return cs; + return gen12_emit_aux_table_inv(ce->engine, cs); } static void From a337b64f0d5717248a0c894e2618e658e6a9de9f Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Thu, 20 Jul 2023 11:35:44 +0200 Subject: [PATCH 074/230] drm/i915: Fix premature release of request's reusable memory Infinite waits for completion of GPU activity have been observed in CI, mostly inside __i915_active_wait(), triggered by igt@gem_barrier_race or igt@perf@stress-open-close. Root cause analysis, based of ftrace dumps generated with a lot of extra trace_printk() calls added to the code, revealed loops of request dependencies being accidentally built, preventing the requests from being processed, each waiting for completion of another one's activity. After we substitute a new request for a last active one tracked on a timeline, we set up a dependency of our new request to wait on completion of current activity of that previous one. While doing that, we must take care of keeping the old request still in memory until we use its attributes for setting up that await dependency, or we can happen to set up the await dependency on an unrelated request that already reuses the memory previously allocated to the old one, already released. Combined with perf adding consecutive kernel context remote requests to different user context timelines, unresolvable loops of await dependencies can be built, leading do infinite waits. We obtain a pointer to the previous request to wait upon when we substitute it with a pointer to our new request in an active tracker, e.g. in intel_timeline.last_request. In some processing paths we protect that old request from being freed before we use it by getting a reference to it under RCU protection, but in others, e.g. __i915_request_commit() -> __i915_request_add_to_timeline() -> __i915_request_ensure_ordering(), we don't. But anyway, since the requests' memory is SLAB_FAILSAFE_BY_RCU, that RCU protection is not sufficient against reuse of memory. We could protect i915_request's memory from being prematurely reused by calling its release function via call_rcu() and using rcu_read_lock() consequently, as proposed in v1. However, that approach leads to significant (up to 10 times) increase of SLAB utilization by i915_request SLAB cache. Another potential approach is to take a reference to the previous active fence. When updating an active fence tracker, we first lock the new fence, substitute a pointer of the current active fence with the new one, then we lock the substituted fence. With this approach, there is a time window after the substitution and before the lock when the request can be concurrently released by an interrupt handler and its memory reused, then we may happen to lock and return a new, unrelated request. Always get a reference to the current active fence first, before replacing it with a new one. Having it protected from premature release and reuse, lock it and then replace with the new one but only if not yet signalled via a potential concurrent interrupt nor replaced with another one by a potential concurrent thread, otherwise retry, starting from getting a reference to the new current one. Adjust users to not get a reference to the previous active fence themselves and always put the reference got by __i915_active_fence_set() when no longer needed. v3: Fix lockdep splat reports and other issues caused by incorrect use of try_cmpxchg() (use (cmpxchg() != prev) instead) v2: Protect request's memory by getting a reference to it in favor of delegating its release to call_rcu() (Chris) Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8211 Fixes: df9f85d8582e ("drm/i915: Serialise i915_active_fence_set() with itself") Suggested-by: Chris Wilson Signed-off-by: Janusz Krzysztofik Cc: # v5.6+ Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20230720093543.832147-2-janusz.krzysztofik@linux.intel.com (cherry picked from commit 946e047a3d88d46d15b5c5af0414098e12b243f7) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_active.c | 99 ++++++++++++++++++++--------- drivers/gpu/drm/i915/i915_request.c | 11 ++++ 2 files changed, 81 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 8ef93889061a..5ec293011d99 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -449,8 +449,11 @@ int i915_active_add_request(struct i915_active *ref, struct i915_request *rq) } } while (unlikely(is_barrier(active))); - if (!__i915_active_fence_set(active, fence)) + fence = __i915_active_fence_set(active, fence); + if (!fence) __i915_active_acquire(ref); + else + dma_fence_put(fence); out: i915_active_release(ref); @@ -469,13 +472,9 @@ __i915_active_set_fence(struct i915_active *ref, return NULL; } - rcu_read_lock(); prev = __i915_active_fence_set(active, fence); - if (prev) - prev = dma_fence_get_rcu(prev); - else + if (!prev) __i915_active_acquire(ref); - rcu_read_unlock(); return prev; } @@ -1019,10 +1018,11 @@ void i915_request_add_active_barriers(struct i915_request *rq) * * Records the new @fence as the last active fence along its timeline in * this active tracker, moving the tracking callbacks from the previous - * fence onto this one. Returns the previous fence (if not already completed), - * which the caller must ensure is executed before the new fence. To ensure - * that the order of fences within the timeline of the i915_active_fence is - * understood, it should be locked by the caller. + * fence onto this one. Gets and returns a reference to the previous fence + * (if not already completed), which the caller must put after making sure + * that it is executed before the new fence. To ensure that the order of + * fences within the timeline of the i915_active_fence is understood, it + * should be locked by the caller. */ struct dma_fence * __i915_active_fence_set(struct i915_active_fence *active, @@ -1031,7 +1031,23 @@ __i915_active_fence_set(struct i915_active_fence *active, struct dma_fence *prev; unsigned long flags; - if (fence == rcu_access_pointer(active->fence)) + /* + * In case of fences embedded in i915_requests, their memory is + * SLAB_FAILSAFE_BY_RCU, then it can be reused right after release + * by new requests. Then, there is a risk of passing back a pointer + * to a new, completely unrelated fence that reuses the same memory + * while tracked under a different active tracker. Combined with i915 + * perf open/close operations that build await dependencies between + * engine kernel context requests and user requests from different + * timelines, this can lead to dependency loops and infinite waits. + * + * As a countermeasure, we try to get a reference to the active->fence + * first, so if we succeed and pass it back to our user then it is not + * released and potentially reused by an unrelated request before the + * user has a chance to set up an await dependency on it. + */ + prev = i915_active_fence_get(active); + if (fence == prev) return fence; GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)); @@ -1040,27 +1056,56 @@ __i915_active_fence_set(struct i915_active_fence *active, * Consider that we have two threads arriving (A and B), with * C already resident as the active->fence. * - * A does the xchg first, and so it sees C or NULL depending - * on the timing of the interrupt handler. If it is NULL, the - * previous fence must have been signaled and we know that - * we are first on the timeline. If it is still present, - * we acquire the lock on that fence and serialise with the interrupt - * handler, in the process removing it from any future interrupt - * callback. A will then wait on C before executing (if present). - * - * As B is second, it sees A as the previous fence and so waits for - * it to complete its transition and takes over the occupancy for - * itself -- remembering that it needs to wait on A before executing. + * Both A and B have got a reference to C or NULL, depending on the + * timing of the interrupt handler. Let's assume that if A has got C + * then it has locked C first (before B). * * Note the strong ordering of the timeline also provides consistent * nesting rules for the fence->lock; the inner lock is always the * older lock. */ spin_lock_irqsave(fence->lock, flags); - prev = xchg(__active_fence_slot(active), fence); - if (prev) { - GEM_BUG_ON(prev == fence); + if (prev) spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); + + /* + * A does the cmpxchg first, and so it sees C or NULL, as before, or + * something else, depending on the timing of other threads and/or + * interrupt handler. If not the same as before then A unlocks C if + * applicable and retries, starting from an attempt to get a new + * active->fence. Meanwhile, B follows the same path as A. + * Once A succeeds with cmpxch, B fails again, retires, gets A from + * active->fence, locks it as soon as A completes, and possibly + * succeeds with cmpxchg. + */ + while (cmpxchg(__active_fence_slot(active), prev, fence) != prev) { + if (prev) { + spin_unlock(prev->lock); + dma_fence_put(prev); + } + spin_unlock_irqrestore(fence->lock, flags); + + prev = i915_active_fence_get(active); + GEM_BUG_ON(prev == fence); + + spin_lock_irqsave(fence->lock, flags); + if (prev) + spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); + } + + /* + * If prev is NULL then the previous fence must have been signaled + * and we know that we are first on the timeline. If it is still + * present then, having the lock on that fence already acquired, we + * serialise with the interrupt handler, in the process of removing it + * from any future interrupt callback. A will then wait on C before + * executing (if present). + * + * As B is second, it sees A as the previous fence and so waits for + * it to complete its transition and takes over the occupancy for + * itself -- remembering that it needs to wait on A before executing. + */ + if (prev) { __list_del_entry(&active->cb.node); spin_unlock(prev->lock); /* serialise with prev->cb_list */ } @@ -1077,11 +1122,7 @@ int i915_active_fence_set(struct i915_active_fence *active, int err = 0; /* Must maintain timeline ordering wrt previous active requests */ - rcu_read_lock(); fence = __i915_active_fence_set(active, &rq->fence); - if (fence) /* but the previous fence may not belong to that timeline! */ - fence = dma_fence_get_rcu(fence); - rcu_read_unlock(); if (fence) { err = i915_request_await_dma_fence(rq, fence); dma_fence_put(fence); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 894068bb37b6..833b73edefdb 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1661,6 +1661,11 @@ __i915_request_ensure_parallel_ordering(struct i915_request *rq, request_to_parent(rq)->parallel.last_rq = i915_request_get(rq); + /* + * Users have to put a reference potentially got by + * __i915_active_fence_set() to the returned request + * when no longer needed + */ return to_request(__i915_active_fence_set(&timeline->last_request, &rq->fence)); } @@ -1707,6 +1712,10 @@ __i915_request_ensure_ordering(struct i915_request *rq, 0); } + /* + * Users have to put the reference to prev potentially got + * by __i915_active_fence_set() when no longer needed + */ return prev; } @@ -1760,6 +1769,8 @@ __i915_request_add_to_timeline(struct i915_request *rq) prev = __i915_request_ensure_ordering(rq, timeline); else prev = __i915_request_ensure_parallel_ordering(rq, timeline); + if (prev) + i915_request_put(prev); /* * Make sure that no request gazumped us - if it was allocated after From 2dc0bc1138eecc88b2c376ccb0b0acb215c25a5c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 1 Aug 2023 20:26:50 +1000 Subject: [PATCH 075/230] powerpc/64e: Fix secondary thread bringup for ELFv2 kernels When booting on e6500 with an ELF v2 ABI kernel, the secondary threads do not start correctly: [ 0.051118] smp: Bringing up secondary CPUs ... [ 5.072700] Processor 1 is stuck. This occurs because the startup code is written to use function descriptors when loading the entry point for the secondary threads. When building with ELF v2 ABI there are no function descriptors, and the code loads junk values for the entry point address. Fix it by using ppc_function_entry() in C, and DOTSYM() in asm, both of which work correctly for ELF v2 ABI as well as ELF v1 ABI kernels. Fixes: 8c5fa3b5c4df ("powerpc/64: Make ELFv2 the default for big-endian builds") Signed-off-by: Michael Ellerman Link: https://msgid.link/20230801102650.48705-1-mpe@ellerman.id.au --- arch/powerpc/kernel/head_64.S | 3 +-- arch/powerpc/platforms/85xx/smp.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index f132d8704263..6440b1bb332a 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -375,8 +375,7 @@ _GLOBAL(generic_secondary_smp_init) beq 20f /* start the specified thread */ - LOAD_REG_ADDR(r5, fsl_secondary_thread_init) - ld r4, 0(r5) + LOAD_REG_ADDR(r5, DOTSYM(fsl_secondary_thread_init)) bl book3e_start_thread /* stop the current thread */ diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 9c43cf32f4c9..40aa58206888 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -180,7 +180,7 @@ static void wake_hw_thread(void *info) unsigned long inia; int cpu = *(const int *)info; - inia = *(unsigned long *)fsl_secondary_thread_init; + inia = ppc_function_entry(fsl_secondary_thread_init); book3e_start_thread(cpu_thread_in_core(cpu), inia); } #endif From 16e95a62eed18864aecac404f1e4eed764c363f2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 25 Jul 2023 13:39:12 +0800 Subject: [PATCH 076/230] powercap: intel_rapl: Fix a sparse warning in TPMI interface Depends on the interface used, the RAPL registers can be either MSR indexes or memory mapped IO addresses. Current RAPL common code uses u64 to save both MSR and memory mapped IO registers. With this, when handling register address with an __iomem annotation, it triggers a sparse warning like below: sparse warnings: (new ones prefixed by >>) >> drivers/powercap/intel_rapl_tpmi.c:141:41: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected unsigned long long [usertype] *tpmi_rapl_regs @@ got void [noderef] __iomem * @@ drivers/powercap/intel_rapl_tpmi.c:141:41: sparse: expected unsigned long long [usertype] *tpmi_rapl_regs drivers/powercap/intel_rapl_tpmi.c:141:41: sparse: got void [noderef] __iomem * Fix the problem by using a union to save the registers instead. Suggested-by: David Laight Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202307031405.dy3druuy-lkp@intel.com/ Tested-by: Wang Wendy Signed-off-by: Zhang Rui [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 14 +++--- drivers/powercap/intel_rapl_msr.c | 49 ++++++++++--------- drivers/powercap/intel_rapl_tpmi.c | 17 +++---- .../int340x_thermal/processor_thermal_rapl.c | 16 +++--- include/linux/intel_rapl.h | 14 ++++-- 5 files changed, 58 insertions(+), 52 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 4e646e5e48f6..8fac57b28f8a 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -818,7 +818,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, return -EINVAL; ra.reg = rd->regs[rpi->id]; - if (!ra.reg) + if (!ra.reg.val) return -EINVAL; /* non-hardware data are collected by the polling thread */ @@ -830,7 +830,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { - pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg, rd->rp->name, rd->name); + pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } @@ -920,7 +920,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) ra.mask = ~0; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg, rd->rp->name, rd->name); + ra.reg.val, rd->rp->name, rd->name); return -ENODEV; } @@ -948,7 +948,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) ra.mask = ~0; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg, rd->rp->name, rd->name); + ra.reg.val, rd->rp->name, rd->name); return -ENODEV; } @@ -1135,7 +1135,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) ra.mask = ~0; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg, rd->rp->name, rd->name); + ra.reg.val, rd->rp->name, rd->name); return -ENODEV; } @@ -1411,8 +1411,8 @@ static int rapl_get_domain_unit(struct rapl_domain *rd) struct rapl_defaults *defaults = get_defaults(rd->rp); int ret; - if (!rd->regs[RAPL_DOMAIN_REG_UNIT]) { - if (!rd->rp->priv->reg_unit) { + if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) { + if (!rd->rp->priv->reg_unit.val) { pr_err("No valid Unit register found\n"); return -ENODEV; } diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 569e25eab1e1..dd471021f237 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -34,28 +34,32 @@ static struct rapl_if_priv *rapl_msr_priv; static struct rapl_if_priv rapl_msr_priv_intel = { .type = RAPL_IF_MSR, - .reg_unit = MSR_RAPL_POWER_UNIT, - .regs[RAPL_DOMAIN_PACKAGE] = { - MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO }, - .regs[RAPL_DOMAIN_PP0] = { - MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 }, - .regs[RAPL_DOMAIN_PP1] = { - MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 }, - .regs[RAPL_DOMAIN_DRAM] = { - MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO }, - .regs[RAPL_DOMAIN_PLATFORM] = { - MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0}, + .reg_unit.msr = MSR_RAPL_POWER_UNIT, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PKG_POWER_LIMIT, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_STATUS].msr = MSR_PKG_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PERF].msr = MSR_PKG_PERF_STATUS, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_INFO].msr = MSR_PKG_POWER_INFO, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PP0_POWER_LIMIT, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_STATUS].msr = MSR_PP0_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_POLICY].msr = MSR_PP0_POLICY, + .regs[RAPL_DOMAIN_PP1][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PP1_POWER_LIMIT, + .regs[RAPL_DOMAIN_PP1][RAPL_DOMAIN_REG_STATUS].msr = MSR_PP1_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PP1][RAPL_DOMAIN_REG_POLICY].msr = MSR_PP1_POLICY, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_LIMIT].msr = MSR_DRAM_POWER_LIMIT, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_STATUS].msr = MSR_DRAM_ENERGY_STATUS, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_PERF].msr = MSR_DRAM_PERF_STATUS, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_INFO].msr = MSR_DRAM_POWER_INFO, + .regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PLATFORM_POWER_LIMIT, + .regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS].msr = MSR_PLATFORM_ENERGY_STATUS, .limits[RAPL_DOMAIN_PACKAGE] = BIT(POWER_LIMIT2), .limits[RAPL_DOMAIN_PLATFORM] = BIT(POWER_LIMIT2), }; static struct rapl_if_priv rapl_msr_priv_amd = { .type = RAPL_IF_MSR, - .reg_unit = MSR_AMD_RAPL_POWER_UNIT, - .regs[RAPL_DOMAIN_PACKAGE] = { - 0, MSR_AMD_PKG_ENERGY_STATUS, 0, 0, 0 }, - .regs[RAPL_DOMAIN_PP0] = { - 0, MSR_AMD_CORE_ENERGY_STATUS, 0, 0, 0 }, + .reg_unit.msr = MSR_AMD_RAPL_POWER_UNIT, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_STATUS].msr = MSR_AMD_PKG_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_STATUS].msr = MSR_AMD_CORE_ENERGY_STATUS, }; /* Handles CPU hotplug on multi-socket systems. @@ -99,10 +103,8 @@ static int rapl_cpu_down_prep(unsigned int cpu) static int rapl_msr_read_raw(int cpu, struct reg_action *ra) { - u32 msr = (u32)ra->reg; - - if (rdmsrl_safe_on_cpu(cpu, msr, &ra->value)) { - pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu); + if (rdmsrl_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { + pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } ra->value &= ra->mask; @@ -112,17 +114,16 @@ static int rapl_msr_read_raw(int cpu, struct reg_action *ra) static void rapl_msr_update_func(void *info) { struct reg_action *ra = info; - u32 msr = (u32)ra->reg; u64 val; - ra->err = rdmsrl_safe(msr, &val); + ra->err = rdmsrl_safe(ra->reg.msr, &val); if (ra->err) return; val &= ~ra->mask; val |= ra->value; - ra->err = wrmsrl_safe(msr, val); + ra->err = wrmsrl_safe(ra->reg.msr, val); } static int rapl_msr_write_raw(int cpu, struct reg_action *ra) @@ -171,7 +172,7 @@ static int rapl_msr_probe(struct platform_device *pdev) if (id) { rapl_msr_priv->limits[RAPL_DOMAIN_PACKAGE] |= BIT(POWER_LIMIT4); - rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4] = + rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4].msr = MSR_VR_CURRENT_CONFIG; pr_info("PL4 support detected.\n"); } diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 4f4f13ded225..891c90fefd8b 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -59,10 +59,10 @@ static struct powercap_control_type *tpmi_control_type; static int tpmi_rapl_read_raw(int id, struct reg_action *ra) { - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - ra->value = readq((void __iomem *)ra->reg); + ra->value = readq(ra->reg.mmio); ra->value &= ra->mask; return 0; @@ -72,15 +72,15 @@ static int tpmi_rapl_write_raw(int id, struct reg_action *ra) { u64 val; - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - val = readq((void __iomem *)ra->reg); + val = readq(ra->reg.mmio); val &= ~ra->mask; val |= ra->value; - writeq(val, (void __iomem *)ra->reg); + writeq(val, ra->reg.mmio); return 0; } @@ -138,8 +138,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) enum tpmi_rapl_register reg_index; enum rapl_domain_reg_id reg_id; int tpmi_domain_size, tpmi_domain_flags; - u64 *tpmi_rapl_regs = trp->base + offset; - u64 tpmi_domain_header = readq((void __iomem *)tpmi_rapl_regs); + u64 tpmi_domain_header = readq(trp->base + offset); /* Domain Parent bits are ignored for now */ tpmi_domain_version = tpmi_domain_header & 0xff; @@ -180,7 +179,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) return -EINVAL; } - if (trp->priv.regs[domain_type][RAPL_DOMAIN_REG_UNIT]) { + if (trp->priv.regs[domain_type][RAPL_DOMAIN_REG_UNIT].mmio) { pr_warn(FW_BUG "Duplicate Domain type %d\n", tpmi_domain_type); return -EINVAL; } @@ -218,7 +217,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) default: continue; } - trp->priv.regs[domain_type][reg_id] = (u64)&tpmi_rapl_regs[reg_index]; + trp->priv.regs[domain_type][reg_id].mmio = trp->base + offset + reg_index * 8; } return 0; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index 013f1633f082..2f00fc3bf274 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -57,10 +57,10 @@ static int rapl_mmio_cpu_down_prep(unsigned int cpu) static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) { - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - ra->value = readq((void __iomem *)ra->reg); + ra->value = readq(ra->reg.mmio); ra->value &= ra->mask; return 0; } @@ -69,13 +69,13 @@ static int rapl_mmio_write_raw(int cpu, struct reg_action *ra) { u64 val; - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - val = readq((void __iomem *)ra->reg); + val = readq(ra->reg.mmio); val &= ~ra->mask; val |= ra->value; - writeq(val, (void __iomem *)ra->reg); + writeq(val, ra->reg.mmio); return 0; } @@ -92,13 +92,13 @@ int proc_thermal_rapl_add(struct pci_dev *pdev, struct proc_thermal_device *proc for (domain = RAPL_DOMAIN_PACKAGE; domain < RAPL_DOMAIN_MAX; domain++) { for (reg = RAPL_DOMAIN_REG_LIMIT; reg < RAPL_DOMAIN_REG_MAX; reg++) if (rapl_regs->regs[domain][reg]) - rapl_mmio_priv.regs[domain][reg] = - (u64)proc_priv->mmio_base + + rapl_mmio_priv.regs[domain][reg].mmio = + proc_priv->mmio_base + rapl_regs->regs[domain][reg]; rapl_mmio_priv.limits[domain] = rapl_regs->limits[domain]; } rapl_mmio_priv.type = RAPL_IF_MMIO; - rapl_mmio_priv.reg_unit = (u64)proc_priv->mmio_base + rapl_regs->reg_unit; + rapl_mmio_priv.reg_unit.mmio = proc_priv->mmio_base + rapl_regs->reg_unit; rapl_mmio_priv.read_raw = rapl_mmio_read_raw; rapl_mmio_priv.write_raw = rapl_mmio_write_raw; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index e6936cb25047..33f21bd85dbf 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -100,10 +100,16 @@ struct rapl_package; #define RAPL_DOMAIN_NAME_LENGTH 16 +union rapl_reg { + void __iomem *mmio; + u32 msr; + u64 val; +}; + struct rapl_domain { char name[RAPL_DOMAIN_NAME_LENGTH]; enum rapl_domain_type id; - u64 regs[RAPL_DOMAIN_REG_MAX]; + union rapl_reg regs[RAPL_DOMAIN_REG_MAX]; struct powercap_zone power_zone; struct rapl_domain_data rdd; struct rapl_power_limit rpl[NR_POWER_LIMITS]; @@ -116,7 +122,7 @@ struct rapl_domain { }; struct reg_action { - u64 reg; + union rapl_reg reg; u64 mask; u64 value; int err; @@ -143,8 +149,8 @@ struct rapl_if_priv { enum rapl_if_type type; struct powercap_control_type *control_type; enum cpuhp_state pcap_rapl_online; - u64 reg_unit; - u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; + union rapl_reg reg_unit; + union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra); int (*write_raw)(int id, struct reg_action *ra); From e7e607bd00481745550389a29ecabe33e13d67cf Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 25 Jul 2023 12:03:59 +0800 Subject: [PATCH 077/230] ceph: defer stopping mdsc delayed_work Flushing the dirty buffer may take a long time if the cluster is overloaded or if there is network issue. So we should ping the MDSs periodically to keep alive, else the MDS will blocklist the kclient. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/61843 Signed-off-by: Xiubo Li Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- fs/ceph/mds_client.h | 5 +++++ fs/ceph/super.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 66048a86c480..5fb367b1d4b0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4764,7 +4764,7 @@ static void delayed_work(struct work_struct *work) dout("mdsc delayed_work\n"); - if (mdsc->stopping) + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) return; mutex_lock(&mdsc->mutex); @@ -4943,7 +4943,7 @@ void send_flush_mdlog(struct ceph_mds_session *s) void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) { dout("pre_umount\n"); - mdsc->stopping = 1; + mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 724307ff89cd..86d2965e68a1 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -380,6 +380,11 @@ struct cap_wait { int want; }; +enum { + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHED = 2, +}; + /* * mds client state */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 3fc48b43cab0..a5f52013314d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1374,6 +1374,16 @@ static void ceph_kill_sb(struct super_block *s) ceph_mdsc_pre_umount(fsc->mdsc); flush_fs_workqueues(fsc); + /* + * Though the kill_anon_super() will finally trigger the + * sync_filesystem() anyway, we still need to do it here + * and then bump the stage of shutdown to stop the work + * queue as earlier as possible. + */ + sync_filesystem(s); + + fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; From 1b0fc0345f2852ffe54fb9ae0e12e2ee69ad6a20 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 1 Aug 2023 07:31:07 -0700 Subject: [PATCH 078/230] Documentation/x86: Fix backwards on/off logic about YMM support These options clearly turn *off* XSAVE YMM support. Correct the typo. Reported-by: Ben Hutchings Fixes: 553a5c03e90a ("x86/speculation: Add force option to GDS mitigation") Signed-off-by: Dave Hansen --- Documentation/admin-guide/hw-vuln/gather_data_sampling.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst index 40b7a6260010..264bfa937f7d 100644 --- a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst +++ b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst @@ -63,7 +63,7 @@ GDS can also be mitigated on systems that don't have updated microcode by disabling AVX. This can be done by setting gather_data_sampling="force" or "clearcpuid=avx" on the kernel command-line. -If used, these options will disable AVX use by turning on XSAVE YMM support. +If used, these options will disable AVX use by turning off XSAVE YMM support. However, the processor will still enumerate AVX support. Userspace that does not follow proper AVX enumeration to check both AVX *and* XSAVE YMM support will break. From 9d01e07fd1bfb4daae156ab528aa196f5ac2b2bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 1 Aug 2023 19:14:24 +0200 Subject: [PATCH 079/230] rbd: prevent busy loop when requesting exclusive lock Due to rbd_try_acquire_lock() effectively swallowing all but EBLOCKLISTED error from rbd_try_lock() ("request lock anyway") and rbd_request_lock() returning ETIMEDOUT error not only for an actual notify timeout but also when the lock owner doesn't respond, a busy loop inside of rbd_acquire_lock() between rbd_try_acquire_lock() and rbd_request_lock() is possible. Requesting the lock on EBUSY error (returned by get_lock_owner_info() if an incompatible lock or invalid lock owner is detected) makes very little sense. The same goes for ETIMEDOUT error (might pop up pretty much anywhere if osd_request_timeout option is set) and many others. Just fail I/O requests on rbd_dev->acquiring_list immediately on any error from rbd_try_lock(). Cc: stable@vger.kernel.org # 588159009d5b: rbd: retrieve and check lock owner twice before blocklisting Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 24afcc93ac01..2328cc05be36 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3675,7 +3675,7 @@ static int rbd_lock(struct rbd_device *rbd_dev) ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, RBD_LOCK_TAG, "", 0); - if (ret) + if (ret && ret != -EEXIST) return ret; __rbd_lock(rbd_dev, cookie); @@ -3878,7 +3878,7 @@ static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev) &rbd_dev->header_oloc, RBD_LOCK_NAME, &lock_type, &lock_tag, &lockers, &num_lockers); if (ret) { - rbd_warn(rbd_dev, "failed to retrieve lockers: %d", ret); + rbd_warn(rbd_dev, "failed to get header lockers: %d", ret); return ERR_PTR(ret); } @@ -3940,8 +3940,10 @@ static int find_watcher(struct rbd_device *rbd_dev, ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, &watchers, &num_watchers); - if (ret) + if (ret) { + rbd_warn(rbd_dev, "failed to get watchers: %d", ret); return ret; + } sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); for (i = 0; i < num_watchers; i++) { @@ -3985,8 +3987,12 @@ static int rbd_try_lock(struct rbd_device *rbd_dev) locker = refreshed_locker = NULL; ret = rbd_lock(rbd_dev); - if (ret != -EBUSY) + if (!ret) goto out; + if (ret != -EBUSY) { + rbd_warn(rbd_dev, "failed to lock header: %d", ret); + goto out; + } /* determine if the current lock holder is still alive */ locker = get_lock_owner_info(rbd_dev); @@ -4089,11 +4095,8 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) ret = rbd_try_lock(rbd_dev); if (ret < 0) { - rbd_warn(rbd_dev, "failed to lock header: %d", ret); - if (ret == -EBLOCKLISTED) - goto out; - - ret = 1; /* request lock anyway */ + rbd_warn(rbd_dev, "failed to acquire lock: %d", ret); + goto out; } if (ret > 0) { up_write(&rbd_dev->lock_rwsem); @@ -6627,12 +6630,11 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) cancel_delayed_work_sync(&rbd_dev->lock_dwork); if (!ret) ret = -ETIMEDOUT; - } - if (ret) { - rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); - return ret; + rbd_warn(rbd_dev, "failed to acquire lock: %ld", ret); } + if (ret) + return ret; /* * The lock may have been released by now, unless automatic lock From e6e2843230799230fc5deb8279728a7218b0d63c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 1 Aug 2023 19:14:24 +0200 Subject: [PATCH 080/230] libceph: fix potential hang in ceph_osdc_notify() If the cluster becomes unavailable, ceph_osdc_notify() may hang even with osd_request_timeout option set because linger_notify_finish_wait() waits for MWatchNotify NOTIFY_COMPLETE message with no associated OSD request in flight -- it's completely asynchronous. Introduce an additional timeout, derived from the specified notify timeout. While at it, switch both waits to killable which is more correct. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang Reviewed-by: Xiubo Li --- net/ceph/osd_client.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 11c04e7d928e..658a6f2320cf 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3334,17 +3334,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) int ret; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); + ret = wait_for_completion_killable(&lreq->reg_commit_wait); return ret ?: lreq->reg_commit_error; } -static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq, + unsigned long timeout) { - int ret; + long left; dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); - return ret ?: lreq->notify_finish_error; + left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait, + ceph_timeout_jiffies(timeout)); + if (left <= 0) + left = left ?: -ETIMEDOUT; + else + left = lreq->notify_finish_error; /* completed */ + + return left; } /* @@ -4896,7 +4903,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) - ret = linger_notify_finish_wait(lreq); + ret = linger_notify_finish_wait(lreq, + msecs_to_jiffies(2 * timeout * MSEC_PER_SEC)); else dout("lreq %p failed to initiate notify %d\n", lreq, ret); From 0a8589055936d8feb56477123a8373ac634018fa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 24 Jul 2023 13:23:14 +0900 Subject: [PATCH 081/230] ata,scsi: do not issue START STOP UNIT on resume During system resume, ata_port_pm_resume() triggers ata EH to 1) Resume the controller 2) Reset and rescan the ports 3) Revalidate devices This EH execution is started asynchronously from ata_port_pm_resume(), which means that when sd_resume() is executed, none or only part of the above processing may have been executed. However, sd_resume() issues a START STOP UNIT to wake up the drive from sleep mode. This command is translated to ATA with ata_scsi_start_stop_xlat() and issued to the device. However, depending on the state of execution of the EH process and revalidation triggerred by ata_port_pm_resume(), two things may happen: 1) The START STOP UNIT fails if it is received before the controller has been reenabled at the beginning of the EH execution. This is visible with error messages like: ata10.00: device reported invalid CHS sector 0 sd 9:0:0:0: [sdc] Start/Stop Unit failed: Result: hostbyte=DID_OK driverbyte=DRIVER_OK sd 9:0:0:0: [sdc] Sense Key : Illegal Request [current] sd 9:0:0:0: [sdc] Add. Sense: Unaligned write command sd 9:0:0:0: PM: dpm_run_callback(): scsi_bus_resume+0x0/0x90 returns -5 sd 9:0:0:0: PM: failed to resume async: error -5 2) The START STOP UNIT command is received while the EH process is on-going, which mean that it is stopped and must wait for its completion, at which point the command is rather useless as the drive is already fully spun up already. This case results also in a significant delay in sd_resume() which is observable by users as the entire system resume completion is delayed. Given that ATA devices will be woken up by libata activity on resume, sd_resume() has no need to issue a START STOP UNIT command, which solves the above mentioned problems. Do not issue this command by introducing the new scsi_device flag no_start_on_resume and setting this flag to 1 in ata_scsi_dev_config(). sd_resume() is modified to issue a START STOP UNIT command only if this flag is not set. Reported-by: Paul Ausbeck Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215880 Fixes: a19a93e4c6a9 ("scsi: core: pm: Rely on the device driver core for async power management") Signed-off-by: Damien Le Moal Tested-by: Tanner Watkins Tested-by: Paul Ausbeck Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche --- drivers/ata/libata-scsi.c | 7 +++++++ drivers/scsi/sd.c | 9 ++++++--- include/scsi/scsi_device.h | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 370d18aca71e..c6ece32de8e3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1100,7 +1100,14 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) } } else { sdev->sector_size = ata_id_logical_sector_size(dev->id); + /* + * Stop the drive on suspend but do not issue START STOP UNIT + * on resume as this is not necessary and may fail: the device + * will be woken up by ata_port_pm_resume() with a port reset + * and device revalidation. + */ sdev->manage_start_stop = 1; + sdev->no_start_on_resume = 1; } /* diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 68b12afa0721..3c668cfb146d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3876,7 +3876,7 @@ static int sd_suspend_runtime(struct device *dev) static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - int ret; + int ret = 0; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; @@ -3884,8 +3884,11 @@ static int sd_resume(struct device *dev) if (!sdkp->device->manage_start_stop) return 0; - sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); - ret = sd_start_stop_device(sdkp, 1); + if (!sdkp->device->no_start_on_resume) { + sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); + ret = sd_start_stop_device(sdkp, 1); + } + if (!ret) opal_unlock_from_suspend(sdkp->opal_dev); return ret; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 75b2235b99e2..b9230b6add04 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -194,6 +194,7 @@ struct scsi_device { unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */ + unsigned no_start_on_resume:1; /* Do not issue START_STOP_UNIT on resume */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; From c71b7aa8619a0c9700132d0733e33999fb614339 Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Wed, 2 Aug 2023 11:41:22 +0500 Subject: [PATCH 082/230] drm/panel: samsung-s6d7aa0: Add MODULE_DEVICE_TABLE The driver can be built as a module, however the lack of the MODULE_DEVICE_TABLE macro prevents it from being automatically probed from the DT in such case. Add the missed macro to make sure the module can load automatically. Fixes: 6810bb390282 ("drm/panel: Add Samsung S6D7AA0 panel controller driver") Signed-off-by: Nikita Travkin Acked-by: Artur Weber Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20230802-gt5-panel-dtable-v1-1-c0a765c175e2@trvn.ru --- drivers/gpu/drm/panel/panel-samsung-s6d7aa0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-samsung-s6d7aa0.c b/drivers/gpu/drm/panel/panel-samsung-s6d7aa0.c index 102e1fc7ee38..be4ec5bb5223 100644 --- a/drivers/gpu/drm/panel/panel-samsung-s6d7aa0.c +++ b/drivers/gpu/drm/panel/panel-samsung-s6d7aa0.c @@ -569,6 +569,7 @@ static const struct of_device_id s6d7aa0_of_match[] = { }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, s6d7aa0_of_match); static struct mipi_dsi_driver s6d7aa0_driver = { .probe = s6d7aa0_probe, From 86582e6189dd8f9f52c25d46c70fe5d111da6345 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Thu, 6 Jul 2023 11:08:16 +1000 Subject: [PATCH 083/230] powerpc/powermac: Use early_* IO variants in via_calibrate_decr() On a powermac platform, via the call path: start_kernel() time_init() ppc_md.calibrate_decr() (pmac_calibrate_decr) via_calibrate_decr() ioremap() and iounmap() are called. The unmap can enable interrupts unexpectedly (cond_resched() in vunmap_pmd_range()), which causes a warning later in the boot sequence in start_kernel(). Use the early_* variants of these IO functions to prevent this. The issue is pre-existing, but is surfaced by commit 721255b9826b ("genirq: Use a maple tree for interrupt descriptor management"). Signed-off-by: Benjamin Gray Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/20230706010816.72682-1-bgray@linux.ibm.com --- arch/powerpc/platforms/powermac/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index 4c5790aff1b5..8633891b7aa5 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -26,8 +26,8 @@ #include #include +#include #include -#include #include #include #include @@ -182,7 +182,7 @@ static int __init via_calibrate_decr(void) return 0; } of_node_put(vias); - via = ioremap(rsrc.start, resource_size(&rsrc)); + via = early_ioremap(rsrc.start, resource_size(&rsrc)); if (via == NULL) { printk(KERN_ERR "Failed to map VIA for timer calibration !\n"); return 0; @@ -207,7 +207,7 @@ static int __init via_calibrate_decr(void) ppc_tb_freq = (dstart - dend) * 100 / 6; - iounmap(via); + early_iounmap((void *)via, resource_size(&rsrc)); return 1; } From 11260c3d608b59231f4c228147a795ab21a10b33 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 2 Aug 2023 13:43:03 -0300 Subject: [PATCH 084/230] smb: client: fix dfs link mount against w2k8 Customer reported that they couldn't mount their DFS link that was seen by the client as a DFS interlink -- special form of DFS link where its single target may point to a different DFS namespace -- and it turned out that it was just a regular DFS link where its referral header flags missed the StorageServers bit thus making the client think it couldn't tree connect to target directly without requiring further referrals. When the DFS link referral header flags misses the StoraServers bit and its target doesn't respond to any referrals, then tree connect to it. Fixes: a1c0d00572fc ("cifs: share dfs connections and supers") Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/dfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index df3fd3b720da..ee772c3d9f00 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -177,8 +177,12 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl); - if (rc) + if (rc) { + rc = cifs_mount_get_tcon(mnt_ctx); + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); break; + } tit = dfs_cache_get_tgt_iterator(&tl); if (!tit) { From 8c82d2bf5944123d8e90d01bf27655497d9aa321 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 12 Jul 2023 12:35:14 -0700 Subject: [PATCH 085/230] selftests/riscv: fix potential build failure during the "emit_tests" step The riscv selftests (which were modeled after the arm64 selftests) are improperly declaring the "emit_tests" target to depend upon the "all" target. This approach, when combined with commit 9fc96c7c19df ("selftests: error out if kernel header files are not yet built"), has caused build failures [1] on arm64, and is likely to cause similar failures for riscv. To fix this, simply remove the unnecessary "all" dependency from the emit_tests target. The dependency is still effectively honored, because again, invocation is via "install", which also depends upon "all". An alternative approach would be to harden the emit_tests target so that it can depend upon "all", but that's a lot more complicated and hard to get right, and doesn't seem worth it, especially given that emit_tests should probably not be overridden at all. [1] https://lore.kernel.org/20230710-kselftest-fix-arm64-v1-1-48e872844f25@kernel.org Fixes: 9fc96c7c19df ("selftests: error out if kernel header files are not yet built") Signed-off-by: John Hubbard Tested-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230712193514.740033-1-jhubbard@nvidia.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 9dd629cc86aa..f4b3d5c9af5b 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -43,7 +43,7 @@ run_tests: all done # Avoid any output on non riscv on emit_tests -emit_tests: all +emit_tests: @for DIR in $(RISCV_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ From 25696067202f047e22c1562f1f56b0e2eb547d1a Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 13 Jul 2023 13:58:29 +0200 Subject: [PATCH 086/230] selftests: riscv: Fix compilation error with vstate_exec_nolibc.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following error happens: In file included from vstate_exec_nolibc.c:2: /usr/include/riscv64-linux-gnu/sys/prctl.h:42:12: error: conflicting types for ‘prctl’; h ave ‘int(int, ...)’ 42 | extern int prctl (int __option, ...) __THROW; | ^~~~~ In file included from ./../../../../include/nolibc/nolibc.h:99, from : ./../../../../include/nolibc/sys.h:892:5: note: previous definition of ‘prctl’ with type ‘int(int, long unsigned int, long unsigned int, long unsigned int, long unsigned int) ’ 892 | int prctl(int option, unsigned long arg2, unsigned long arg3, | ^~~~~ Fix this by not including , which is not needed here since prctl syscall is directly called using its number. Fixes: 7cf6198ce22d ("selftests: Test RISC-V Vector prctl interface") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230713115829.110421-1-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c index 5cbc392944a6..2c0d2b1126c1 100644 --- a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c +++ b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#include - #define THIS_PROGRAM "./vstate_exec_nolibc" int main(int argc, char **argv) From 568701fee36652a7660ed667a3980c945d8051e0 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 24 Jul 2023 15:33:46 +0530 Subject: [PATCH 087/230] RISC-V: ACPI: Fix acpi_os_ioremap to return iomem address acpi_os_ioremap() currently is a wrapper to memremap() on RISC-V. But the callers of acpi_os_ioremap() expect it to return __iomem address and hence sparse tool reports a new warning. Fix this issue by type casting to __iomem type. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202307230357.egcTAefj-lkp@intel.com/ Fixes: a91a9ffbd3a5 ("RISC-V: Add support to build the ACPI core") Signed-off-by: Sunil V L Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230724100346.1302937-1-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/acpi.h | 2 +- arch/riscv/kernel/acpi.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h index f71ce21ff684..d5604d2073bc 100644 --- a/arch/riscv/include/asm/acpi.h +++ b/arch/riscv/include/asm/acpi.h @@ -19,7 +19,7 @@ typedef u64 phys_cpuid_t; #define PHYS_CPUID_INVALID INVALID_HARTID /* ACPI table mapping after acpi_permanent_mmap is set */ -void *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); #define acpi_os_ioremap acpi_os_ioremap #define acpi_strict 1 /* No out-of-spec workarounds on RISC-V */ diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index 5ee03ebab80e..56cb2c986c48 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -215,9 +215,9 @@ void __init __acpi_unmap_table(void __iomem *map, unsigned long size) early_iounmap(map, size); } -void *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) { - return memremap(phys, size, MEMREMAP_WB); + return (void __iomem *)memremap(phys, size, MEMREMAP_WB); } #ifdef CONFIG_PCI From fbe7d19d2b7fcbd38905ba9f691be8f245c6faa6 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Mon, 24 Jul 2023 18:09:16 +0800 Subject: [PATCH 088/230] riscv: Export va_kernel_pa_offset in vmcoreinfo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since RISC-V Linux v6.4, the commit 3335068f8721 ("riscv: Use PUD/P4D/PGD pages for the linear mapping") changes phys_ram_base from the physical start of the kernel to the actual start of the DRAM. The Crash-utility's VTOP() still uses phys_ram_base and kernel_map.virt_addr to translate kernel virtual address, that failed the Crash with Linux v6.4 [1]. Export kernel_map.va_kernel_pa_offset in vmcoreinfo to help Crash translate the kernel virtual address correctly. Fixes: 3335068f8721 ("riscv: Use PUD/P4D/PGD pages for the linear mapping") Link: https://lore.kernel.org/linux-riscv/20230724040649.220279-1-suagrfillet@gmail.com/ [1] Signed-off-by: Song Shuai Reviewed-by: Xianting Tian  Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230724100917.309061-1-suagrfillet@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/crash_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c index b351a3c01355..55f1d7856b54 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/crash_core.c @@ -18,4 +18,6 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); #endif vmcoreinfo_append_str("NUMBER(KERNEL_LINK_ADDR)=0x%lx\n", KERNEL_LINK_ADDR); + vmcoreinfo_append_str("NUMBER(va_kernel_pa_offset)=0x%lx\n", + kernel_map.va_kernel_pa_offset); } From 640c503d7dbd7d34a62099c933f4db0ed77ccbec Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Mon, 24 Jul 2023 18:09:17 +0800 Subject: [PATCH 089/230] Documentation: kdump: Add va_kernel_pa_offset for RISCV64 RISC-V Linux exports "va_kernel_pa_offset" in vmcoreinfo to help Crash-utility translate the kernel virtual address correctly. Here adds the definition of "va_kernel_pa_offset". Fixes: 3335068f8721 ("riscv: Use PUD/P4D/PGD pages for the linear mapping") Link: https://lore.kernel.org/linux-riscv/20230724040649.220279-1-suagrfillet@gmail.com/ Signed-off-by: Song Shuai Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230724100917.309061-2-suagrfillet@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index c18d94fa6470..f8ebb63b6c5d 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -624,3 +624,9 @@ Used to get the correct ranges: * VMALLOC_START ~ VMALLOC_END : vmalloc() / ioremap() space. * VMEMMAP_START ~ VMEMMAP_END : vmemmap space, used for struct page array. * KERNEL_LINK_ADDR : start address of Kernel link and BPF + +va_kernel_pa_offset +------------------- + +Indicates the offset between the kernel virtual and physical mappings. +Used to translate virtual to physical addresses. From 9e2d0c336524706fb327e9b87477f5f3337ad7a6 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Fri, 23 Jun 2023 09:28:08 -0700 Subject: [PATCH 090/230] x86/hyperv: add noop functions to x86_init mpparse functions Hyper-V can run VMs at different privilege "levels" known as Virtual Trust Levels (VTL). Sometimes, it chooses to run two different VMs at different levels but they share some of their address space. In such setups VTL2 (higher level VM) has visibility of all of the VTL0 (level 0) memory space. When the CONFIG_X86_MPPARSE is enabled for VTL2, the VTL2 kernel performs a search within the low memory to locate MP tables. However, in systems where VTL0 manages the low memory and may contain valid tables, this scanning can result in incorrect MP table information being provided to the VTL2 kernel, mistakenly considering VTL0's MP table as its own Add noop functions to avoid MP parse scan by VTL2. Signed-off-by: Saurabh Sengar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/1687537688-5397-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_vtl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 85d38b9f3586..db5d2ea39fc0 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -25,6 +25,10 @@ void __init hv_vtl_init_platform(void) x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; x86_platform.get_nmi_reason = hv_get_nmi_reason; From 6ad0f2f91ad14ba0a3c2990c054fd6fbe8100429 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 25 Jul 2023 22:21:08 +0800 Subject: [PATCH 091/230] Drivers: hv: vmbus: Remove unused extern declaration vmbus_ontimer() Since commit 30fbee49b071 ("Staging: hv: vmbus: Get rid of the unused function vmbus_ontimer()") this is not used anymore, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20230725142108.27280-1-yuehaibing@huawei.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index bfbc37ce223b..3ac3974b3c78 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1239,9 +1239,6 @@ extern int vmbus_recvpacket_raw(struct vmbus_channel *channel, u32 *buffer_actual_len, u64 *requestid); - -extern void vmbus_ontimer(unsigned long data); - /* Base driver object */ struct hv_driver { const char *name; From 534fc31d09b706a16d83533e16b5dc855caf7576 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Thu, 3 Aug 2023 08:41:22 +0200 Subject: [PATCH 092/230] xen/netback: Fix buffer overrun triggered by unusual packet It is possible that a guest can send a packet that contains a head + 18 slots and yet has a len <= XEN_NETBACK_TX_COPY_LEN. This causes nr_slots to underflow in xenvif_get_requests() which then causes the subsequent loop's termination condition to be wrong, causing a buffer overrun of queue->tx_map_ops. Rework the code to account for the extra frag_overflow slots. This is CVE-2023-34319 / XSA-432. Fixes: ad7f402ae4f4 ("xen/netback: Ensure protocol headers don't fall in the non-linear area") Signed-off-by: Ross Lagerwall Reviewed-by: Paul Durrant Reviewed-by: Wei Liu Signed-off-by: Juergen Gross --- drivers/net/xen-netback/netback.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index c8d20cddf658..88f760a7cbc3 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -396,7 +396,7 @@ static void xenvif_get_requests(struct xenvif_queue *queue, struct gnttab_map_grant_ref *gop = queue->tx_map_ops + *map_ops; struct xen_netif_tx_request *txp = first; - nr_slots = shinfo->nr_frags + 1; + nr_slots = shinfo->nr_frags + frag_overflow + 1; copy_count(skb) = 0; XENVIF_TX_CB(skb)->split_mask = 0; @@ -462,8 +462,8 @@ static void xenvif_get_requests(struct xenvif_queue *queue, } } - for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots; - shinfo->nr_frags++, gop++) { + for (shinfo->nr_frags = 0; nr_slots > 0 && shinfo->nr_frags < MAX_SKB_FRAGS; + shinfo->nr_frags++, gop++, nr_slots--) { index = pending_index(queue->pending_cons++); pending_idx = queue->pending_ring[index]; xenvif_tx_create_map_op(queue, pending_idx, txp, @@ -476,12 +476,12 @@ static void xenvif_get_requests(struct xenvif_queue *queue, txp++; } - if (frag_overflow) { + if (nr_slots > 0) { shinfo = skb_shinfo(nskb); frags = shinfo->frags; - for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow; + for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots; shinfo->nr_frags++, txp++, gop++) { index = pending_index(queue->pending_cons++); pending_idx = queue->pending_ring[index]; @@ -492,6 +492,11 @@ static void xenvif_get_requests(struct xenvif_queue *queue, } skb_shinfo(skb)->frag_list = nskb; + } else if (nskb) { + /* A frag_list skb was allocated but it is no longer needed + * because enough slots were converted to copy ops above. + */ + kfree_skb(nskb); } (*copy_ops) = cop - queue->tx_copy_ops; From c2ff2b736c41cc63bb0aaec85cccfead9fbcfe92 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Thu, 3 Aug 2023 09:24:04 +0300 Subject: [PATCH 093/230] parisc/mm: preallocate fixmap page tables at init Christoph Biedl reported early OOM on recent kernels: swapper: page allocation failure: order:0, mode:0x100(__GFP_ZERO), nodemask=(null) CPU: 0 PID: 0 Comm: swapper Not tainted 6.3.0-rc4+ #16 Hardware name: 9000/785/C3600 Backtrace: [<10408594>] show_stack+0x48/0x5c [<10e152d8>] dump_stack_lvl+0x48/0x64 [<10e15318>] dump_stack+0x24/0x34 [<105cf7f8>] warn_alloc+0x10c/0x1c8 [<105d068c>] __alloc_pages+0xbbc/0xcf8 [<105d0e4c>] __get_free_pages+0x28/0x78 [<105ad10c>] __pte_alloc_kernel+0x30/0x98 [<10406934>] set_fixmap+0xec/0xf4 [<10411ad4>] patch_map.constprop.0+0xa8/0xdc [<10411bb0>] __patch_text_multiple+0xa8/0x208 [<10411d78>] patch_text+0x30/0x48 [<1041246c>] arch_jump_label_transform+0x90/0xcc [<1056f734>] jump_label_update+0xd4/0x184 [<1056fc9c>] static_key_enable_cpuslocked+0xc0/0x110 [<1056fd08>] static_key_enable+0x1c/0x2c [<1011362c>] init_mem_debugging_and_hardening+0xdc/0xf8 [<1010141c>] start_kernel+0x5f0/0xa98 [<10105da8>] start_parisc+0xb8/0xe4 Mem-Info: active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:0 isolated_file:0 unevictable:0 dirty:0 writeback:0 slab_reclaimable:0 slab_unreclaimable:0 mapped:0 shmem:0 pagetables:0 sec_pagetables:0 bounce:0 kernel_misc_reclaimable:0 free:0 free_pcp:0 free_cma:0 Node 0 active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB +writeback_tmp:0kB kernel_stack:0kB pagetables:0kB sec_pagetables:0kB all_unreclaimable? no Normal free:0kB boost:0kB min:0kB low:0kB high:0kB reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB +present:1048576kB managed:1039360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 0 Normal: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB 0 total pagecache pages 0 pages in swap cache Free swap = 0kB Total swap = 0kB 262144 pages RAM 0 pages HighMem/MovableOnly 2304 pages reserved Backtrace: [<10411d78>] patch_text+0x30/0x48 [<1041246c>] arch_jump_label_transform+0x90/0xcc [<1056f734>] jump_label_update+0xd4/0x184 [<1056fc9c>] static_key_enable_cpuslocked+0xc0/0x110 [<1056fd08>] static_key_enable+0x1c/0x2c [<1011362c>] init_mem_debugging_and_hardening+0xdc/0xf8 [<1010141c>] start_kernel+0x5f0/0xa98 [<10105da8>] start_parisc+0xb8/0xe4 Kernel Fault: Code=15 (Data TLB miss fault) at addr 0f7fe3c0 CPU: 0 PID: 0 Comm: swapper Not tainted 6.3.0-rc4+ #16 Hardware name: 9000/785/C3600 This happens because patching static key code temporarily maps it via fixmap and if it happens before page allocator is initialized set_fixmap() cannot allocate memory using pte_alloc_kernel(). Make sure that fixmap page tables are preallocated early so that pte_offset_kernel() in set_fixmap() never resorts to pte allocation. Signed-off-by: Mike Rapoport (IBM) Acked-by: Vlastimil Babka Signed-off-by: Helge Deller Tested-by: Christoph Biedl Tested-by: John David Anglin Cc: # v6.4+ --- arch/parisc/mm/fixmap.c | 3 --- arch/parisc/mm/init.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/arch/parisc/mm/fixmap.c b/arch/parisc/mm/fixmap.c index cc15d737fda6..ae3493dae9dc 100644 --- a/arch/parisc/mm/fixmap.c +++ b/arch/parisc/mm/fixmap.c @@ -19,9 +19,6 @@ void notrace set_fixmap(enum fixed_addresses idx, phys_addr_t phys) pmd_t *pmd = pmd_offset(pud, vaddr); pte_t *pte; - if (pmd_none(*pmd)) - pte = pte_alloc_kernel(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte_at(&init_mm, vaddr, pte, __mk_pte(phys, PAGE_KERNEL_RWX)); flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 406c52fe23d5..389941c7f209 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -669,6 +669,39 @@ static void __init gateway_init(void) PAGE_SIZE, PAGE_GATEWAY, 1); } +static void __init fixmap_init(void) +{ + unsigned long addr = FIXMAP_START; + unsigned long end = FIXMAP_START + FIXMAP_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd; + + BUILD_BUG_ON(FIXMAP_SIZE > PMD_SIZE); + +#if CONFIG_PGTABLE_LEVELS == 3 + if (pud_none(*pud)) { + pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + PAGE_SIZE << PMD_TABLE_ORDER); + if (!pmd) + panic("fixmap: pmd allocation failed.\n"); + pud_populate(NULL, pud, pmd); + } +#endif + + pmd = pmd_offset(pud, addr); + do { + pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!pte) + panic("fixmap: pte allocation failed.\n"); + + pmd_populate_kernel(&init_mm, pmd, pte); + + addr += PAGE_SIZE; + } while (addr < end); +} + static void __init parisc_bootmem_free(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; @@ -683,6 +716,7 @@ void __init paging_init(void) setup_bootmem(); pagetable_init(); gateway_init(); + fixmap_init(); flush_cache_all_local(); /* start with known state */ flush_tlb_all_local(NULL); From ce9ff57d393db86a34ba3f817d7fb886b7c278dc Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 2 Aug 2023 18:33:21 +0200 Subject: [PATCH 094/230] parisc: pci-dma: remove unused and dead EISA code and comment Clearly, this code isn't needed, but it gives a false positive when grepping the complete source tree for coherent_dma_mask. Signed-off-by: Petr Tesarik Signed-off-by: Helge Deller --- arch/parisc/kernel/pci-dma.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index d818ece23b4a..3f6b507970eb 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -417,14 +417,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, map_uncached_pages(vaddr, size, paddr); *dma_handle = (dma_addr_t) paddr; -#if 0 -/* This probably isn't needed to support EISA cards. -** ISA cards will certainly only support 24-bit DMA addressing. -** Not clear if we can, want, or need to support ISA. -*/ - if (!dev || *dev->coherent_dma_mask < 0xffffffff) - gfp |= GFP_DMA; -#endif return (void *)vaddr; } From 2e1b1d7063a35ab6cf9984f9d5bc29829e1e8788 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Jul 2023 17:09:14 +0200 Subject: [PATCH 095/230] parport: gsc: remove DMA leftover code This driver does not actually work with DMA mode, but still tries to call ISA DMA interface functions that are stubbed out on parisc, resulting in a W=1 build warning: drivers/parport/parport_gsc.c: In function 'parport_remove_chip': drivers/parport/parport_gsc.c:389:20: warning: suggest braces around empty body in an 'if' statement [-Wempty-body] 389 | free_dma(p->dma); Remove the corresponding code as a prerequisite for turning on -Wempty-body by default in all kernels. Signed-off-by: Arnd Bergmann Signed-off-by: Helge Deller --- drivers/parport/parport_gsc.c | 28 ++++------------------------ drivers/parport/parport_gsc.h | 7 ------- 2 files changed, 4 insertions(+), 31 deletions(-) diff --git a/drivers/parport/parport_gsc.c b/drivers/parport/parport_gsc.c index 0dcc497b0449..5e4475254bd0 100644 --- a/drivers/parport/parport_gsc.c +++ b/drivers/parport/parport_gsc.c @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -226,9 +225,9 @@ static int parport_PS2_supported(struct parport *pb) /* --- Initialisation code -------------------------------- */ -struct parport *parport_gsc_probe_port(unsigned long base, +static struct parport *parport_gsc_probe_port(unsigned long base, unsigned long base_hi, int irq, - int dma, struct parisc_device *padev) + struct parisc_device *padev) { struct parport_gsc_private *priv; struct parport_operations *ops; @@ -250,12 +249,9 @@ struct parport *parport_gsc_probe_port(unsigned long base, } priv->ctr = 0xc; priv->ctr_writable = 0xff; - priv->dma_buf = NULL; - priv->dma_handle = 0; p->base = base; p->base_hi = base_hi; p->irq = irq; - p->dma = dma; p->modes = PARPORT_MODE_PCSPP | PARPORT_MODE_SAFEININT; p->ops = ops; p->private_data = priv; @@ -286,17 +282,9 @@ struct parport *parport_gsc_probe_port(unsigned long base, if (p->irq == PARPORT_IRQ_AUTO) { p->irq = PARPORT_IRQ_NONE; } - if (p->irq != PARPORT_IRQ_NONE) { + if (p->irq != PARPORT_IRQ_NONE) pr_cont(", irq %d", p->irq); - if (p->dma == PARPORT_DMA_AUTO) { - p->dma = PARPORT_DMA_NONE; - } - } - if (p->dma == PARPORT_DMA_AUTO) /* To use DMA, giving the irq - is mandatory (see above) */ - p->dma = PARPORT_DMA_NONE; - pr_cont(" ["); #define printmode(x) \ do { \ @@ -321,7 +309,6 @@ do { \ pr_warn("%s: irq %d in use, resorting to polled operation\n", p->name, p->irq); p->irq = PARPORT_IRQ_NONE; - p->dma = PARPORT_DMA_NONE; } } @@ -369,8 +356,7 @@ static int __init parport_init_chip(struct parisc_device *dev) pr_info("%s: enhanced parport-modes not supported\n", __func__); } - p = parport_gsc_probe_port(port, 0, dev->irq, - /* PARPORT_IRQ_NONE */ PARPORT_DMA_NONE, dev); + p = parport_gsc_probe_port(port, 0, dev->irq, dev); if (p) parport_count++; dev_set_drvdata(&dev->dev, p); @@ -382,16 +368,10 @@ static void __exit parport_remove_chip(struct parisc_device *dev) { struct parport *p = dev_get_drvdata(&dev->dev); if (p) { - struct parport_gsc_private *priv = p->private_data; struct parport_operations *ops = p->ops; parport_remove_port(p); - if (p->dma != PARPORT_DMA_NONE) - free_dma(p->dma); if (p->irq != PARPORT_IRQ_NONE) free_irq(p->irq, p); - if (priv->dma_buf) - dma_free_coherent(&priv->dev->dev, PAGE_SIZE, - priv->dma_buf, priv->dma_handle); kfree (p->private_data); parport_put_port(p); kfree (ops); /* hope no-one cached it */ diff --git a/drivers/parport/parport_gsc.h b/drivers/parport/parport_gsc.h index 9301217edf12..d447a568c257 100644 --- a/drivers/parport/parport_gsc.h +++ b/drivers/parport/parport_gsc.h @@ -63,8 +63,6 @@ struct parport_gsc_private { int writeIntrThreshold; /* buffer suitable for DMA, if DMA enabled */ - char *dma_buf; - dma_addr_t dma_handle; struct pci_dev *dev; }; @@ -199,9 +197,4 @@ extern void parport_gsc_inc_use_count(void); extern void parport_gsc_dec_use_count(void); -extern struct parport *parport_gsc_probe_port(unsigned long base, - unsigned long base_hi, - int irq, int dma, - struct parisc_device *padev); - #endif /* __DRIVERS_PARPORT_PARPORT_GSC_H */ From 99b2f159b6e76b84357eae6dc2a206871aa630d5 Mon Sep 17 00:00:00 2001 From: "hanyu001@208suo.com" Date: Thu, 20 Jul 2023 14:40:27 +0800 Subject: [PATCH 096/230] parisc: unaligned: Add required spaces after ',' Fix checkpatch warnings: unaligned.c:475: ERROR: space required after that ',' Signed-off-by: Yu Han Signed-off-by: Helge Deller --- arch/parisc/kernel/unaligned.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 033b9e50b44a..813062701922 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -337,7 +337,7 @@ static int emulate_std(struct pt_regs *regs, int frreg, int flop) : "r19", "r20", "r21", "r22", "r1" ); #else { - unsigned long valh=(val>>32),vall=(val&0xffffffffl); + unsigned long valh = (val >> 32), vall = (val & 0xffffffffl); __asm__ __volatile__ ( " mtsp %4, %%sr1\n" " zdep %2, 29, 2, %%r19\n" @@ -473,7 +473,7 @@ void handle_unaligned(struct pt_regs *regs) case OPCODE_LDWA_I: case OPCODE_LDW_S: case OPCODE_LDWA_S: - ret = emulate_ldw(regs, R3(regs->iir),0); + ret = emulate_ldw(regs, R3(regs->iir), 0); break; case OPCODE_STH: @@ -482,7 +482,7 @@ void handle_unaligned(struct pt_regs *regs) case OPCODE_STW: case OPCODE_STWA: - ret = emulate_stw(regs, R2(regs->iir),0); + ret = emulate_stw(regs, R2(regs->iir), 0); break; #ifdef CONFIG_64BIT @@ -490,12 +490,12 @@ void handle_unaligned(struct pt_regs *regs) case OPCODE_LDDA_I: case OPCODE_LDD_S: case OPCODE_LDDA_S: - ret = emulate_ldd(regs, R3(regs->iir),0); + ret = emulate_ldd(regs, R3(regs->iir), 0); break; case OPCODE_STD: case OPCODE_STDA: - ret = emulate_std(regs, R2(regs->iir),0); + ret = emulate_std(regs, R2(regs->iir), 0); break; #endif @@ -503,24 +503,24 @@ void handle_unaligned(struct pt_regs *regs) case OPCODE_FLDWS: case OPCODE_FLDWXR: case OPCODE_FLDWSR: - ret = emulate_ldw(regs,FR3(regs->iir),1); + ret = emulate_ldw(regs, FR3(regs->iir), 1); break; case OPCODE_FLDDX: case OPCODE_FLDDS: - ret = emulate_ldd(regs,R3(regs->iir),1); + ret = emulate_ldd(regs, R3(regs->iir), 1); break; case OPCODE_FSTWX: case OPCODE_FSTWS: case OPCODE_FSTWXR: case OPCODE_FSTWSR: - ret = emulate_stw(regs,FR3(regs->iir),1); + ret = emulate_stw(regs, FR3(regs->iir), 1); break; case OPCODE_FSTDX: case OPCODE_FSTDS: - ret = emulate_std(regs,R3(regs->iir),1); + ret = emulate_std(regs, R3(regs->iir), 1); break; case OPCODE_LDCD_I: From c9bb40b7f786662e33d71afe236442b0b61f0446 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 3 Aug 2023 00:46:39 +0100 Subject: [PATCH 097/230] arm64/fpsimd: Clear SME state in the target task when setting the VL When setting SME vector lengths we clear TIF_SME to reenable SME traps, doing a reallocation of the backing storage on next use. We do this using clear_thread_flag() which operates on the current thread, meaning that when setting the vector length via ptrace we may both not force traps for the target task and force a spurious flush of any SME state that the tracing task may have. Clear the flag in the target task. Fixes: e12310a0d30f ("arm64/sme: Implement ptrace support for streaming mode SVE registers") Reported-by: David Spickett Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230803-arm64-fix-ptrace-tif-sme-v1-1-88312fd6fbfd@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 520b681a07bb..a61a1fd6492d 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -909,7 +909,7 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, */ task->thread.svcr &= ~(SVCR_SM_MASK | SVCR_ZA_MASK); - clear_thread_flag(TIF_SME); + clear_tsk_thread_flag(task, TIF_SME); free_sme = true; } } From 89a65c3f170e5c3b05a626046c68354e2afd7912 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 3 Aug 2023 01:19:06 +0100 Subject: [PATCH 098/230] arm64/ptrace: Flush FP state when setting ZT0 When setting ZT0 via ptrace we do not currently force a reload of the floating point register state from memory, do that to ensure that the newly set value gets loaded into the registers on next task execution. The function was templated off the function for FPSIMD which due to our providing the option of embedding a FPSIMD regset within the SVE regset does not directly include the flush. Fixes: f90b529bcbe5 ("arm64/sme: Implement ZT0 ptrace support") Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230803-arm64-fix-ptrace-zt0-flush-v1-1-72e854eaf96e@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index d7f4f0d1ae12..740e81e9db04 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1180,6 +1180,8 @@ static int zt_set(struct task_struct *target, if (ret == 0) target->thread.svcr |= SVCR_ZA_MASK; + fpsimd_flush_task_state(target); + return ret; } From 1696ec8654016dad3b1baf6c024303e584400453 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 2 Aug 2023 10:40:29 -0700 Subject: [PATCH 099/230] mISDN: Update parameter type of dsp_cmx_send() When booting a kernel with CONFIG_MISDN_DSP=y and CONFIG_CFI_CLANG=y, there is a failure when dsp_cmx_send() is called indirectly from call_timer_fn(): [ 0.371412] CFI failure at call_timer_fn+0x2f/0x150 (target: dsp_cmx_send+0x0/0x530; expected type: 0x92ada1e9) The function pointer prototype that call_timer_fn() expects is void (*fn)(struct timer_list *) whereas dsp_cmx_send() has a parameter type of 'void *', which causes the control flow integrity checks to fail because the parameter types do not match. Change dsp_cmx_send()'s parameter type to be 'struct timer_list' to match the expected prototype. The argument is unused anyways, so this has no functional change, aside from avoiding the CFI failure. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202308020936.58787e6c-oliver.sang@intel.com Signed-off-by: Nathan Chancellor Reviewed-by: Sami Tolvanen Reviewed-by: Kees Cook Fixes: e313ac12eb13 ("mISDN: Convert timers to use timer_setup()") Link: https://lore.kernel.org/r/20230802-fix-dsp_cmx_send-cfi-failure-v1-1-2f2e79b0178d@kernel.org Signed-off-by: Jakub Kicinski --- drivers/isdn/mISDN/dsp.h | 2 +- drivers/isdn/mISDN/dsp_cmx.c | 2 +- drivers/isdn/mISDN/dsp_core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/isdn/mISDN/dsp.h b/drivers/isdn/mISDN/dsp.h index fa09d511a8ed..baf31258f5c9 100644 --- a/drivers/isdn/mISDN/dsp.h +++ b/drivers/isdn/mISDN/dsp.h @@ -247,7 +247,7 @@ extern void dsp_cmx_hardware(struct dsp_conf *conf, struct dsp *dsp); extern int dsp_cmx_conf(struct dsp *dsp, u32 conf_id); extern void dsp_cmx_receive(struct dsp *dsp, struct sk_buff *skb); extern void dsp_cmx_hdlc(struct dsp *dsp, struct sk_buff *skb); -extern void dsp_cmx_send(void *arg); +extern void dsp_cmx_send(struct timer_list *arg); extern void dsp_cmx_transmit(struct dsp *dsp, struct sk_buff *skb); extern int dsp_cmx_del_conf_member(struct dsp *dsp); extern int dsp_cmx_del_conf(struct dsp_conf *conf); diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index 357b87592eb4..61cb45c5d0d8 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -1614,7 +1614,7 @@ static u16 dsp_count; /* last sample count */ static int dsp_count_valid; /* if we have last sample count */ void -dsp_cmx_send(void *arg) +dsp_cmx_send(struct timer_list *arg) { struct dsp_conf *conf; struct dsp_conf_member *member; diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c index 386084530c2f..fae95f166688 100644 --- a/drivers/isdn/mISDN/dsp_core.c +++ b/drivers/isdn/mISDN/dsp_core.c @@ -1195,7 +1195,7 @@ static int __init dsp_init(void) } /* set sample timer */ - timer_setup(&dsp_spl_tl, (void *)dsp_cmx_send, 0); + timer_setup(&dsp_spl_tl, dsp_cmx_send, 0); dsp_spl_tl.expires = jiffies + dsp_tics; dsp_spl_jiffies = dsp_spl_tl.expires; add_timer(&dsp_spl_tl); From e58f30246c35c126c7571065b33bee4b3b1d2ef8 Mon Sep 17 00:00:00 2001 From: Li Yang Date: Wed, 2 Aug 2023 14:13:46 -0500 Subject: [PATCH 100/230] net: phy: at803x: fix the wol setting functions In commit 7beecaf7d507 ("net: phy: at803x: improve the WOL feature"), it seems not correct to use a wol_en bit in a 1588 Control Register which is only available on AR8031/AR8033(share the same phy_id) to determine if WoL is enabled. Change it back to use AT803X_INTR_ENABLE_WOL for determining the WoL status which is applicable on all chips supporting wol. Also update the at803x_set_wol() function to only update the 1588 register on chips having it. After this change, disabling wol at probe from commit d7cd5e06c9dd ("net: phy: at803x: disable WOL at probe") is no longer needed. Change it to just disable the WoL bit in 1588 register for AR8031/AR8033 to be aligned with AT803X_INTR_ENABLE_WOL in probe. Fixes: 7beecaf7d507 ("net: phy: at803x: improve the WOL feature") Signed-off-by: Li Yang Reviewed-by: Viorel Suman Reviewed-by: Wei Fang Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 45 ++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index c1f307d90518..9c2c2e2ee94b 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -459,21 +459,27 @@ static int at803x_set_wol(struct phy_device *phydev, phy_write_mmd(phydev, MDIO_MMD_PCS, offsets[i], mac[(i * 2) + 1] | (mac[(i * 2)] << 8)); - /* Enable WOL function */ - ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL, - 0, AT803X_WOL_EN); - if (ret) - return ret; + /* Enable WOL function for 1588 */ + if (phydev->drv->phy_id == ATH8031_PHY_ID) { + ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, + AT803X_PHY_MMD3_WOL_CTRL, + 0, AT803X_WOL_EN); + if (ret) + return ret; + } /* Enable WOL interrupt */ ret = phy_modify(phydev, AT803X_INTR_ENABLE, 0, AT803X_INTR_ENABLE_WOL); if (ret) return ret; } else { - /* Disable WoL function */ - ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL, - AT803X_WOL_EN, 0); - if (ret) - return ret; + /* Disable WoL function for 1588 */ + if (phydev->drv->phy_id == ATH8031_PHY_ID) { + ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, + AT803X_PHY_MMD3_WOL_CTRL, + AT803X_WOL_EN, 0); + if (ret) + return ret; + } /* Disable WOL interrupt */ ret = phy_modify(phydev, AT803X_INTR_ENABLE, AT803X_INTR_ENABLE_WOL, 0); if (ret) @@ -508,11 +514,11 @@ static void at803x_get_wol(struct phy_device *phydev, wol->supported = WAKE_MAGIC; wol->wolopts = 0; - value = phy_read_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL); + value = phy_read(phydev, AT803X_INTR_ENABLE); if (value < 0) return; - if (value & AT803X_WOL_EN) + if (value & AT803X_INTR_ENABLE_WOL) wol->wolopts |= WAKE_MAGIC; } @@ -858,9 +864,6 @@ static int at803x_probe(struct phy_device *phydev) if (phydev->drv->phy_id == ATH8031_PHY_ID) { int ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG); int mode_cfg; - struct ethtool_wolinfo wol = { - .wolopts = 0, - }; if (ccr < 0) return ccr; @@ -877,12 +880,14 @@ static int at803x_probe(struct phy_device *phydev) break; } - /* Disable WOL by default */ - ret = at803x_set_wol(phydev, &wol); - if (ret < 0) { - phydev_err(phydev, "failed to disable WOL on probe: %d\n", ret); + /* Disable WoL in 1588 register which is enabled + * by default + */ + ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, + AT803X_PHY_MMD3_WOL_CTRL, + AT803X_WOL_EN, 0); + if (ret) return ret; - } } return 0; From d7791cec2304aea22eb2ada944e4d467302f5bfe Mon Sep 17 00:00:00 2001 From: Li Yang Date: Wed, 2 Aug 2023 14:13:47 -0500 Subject: [PATCH 101/230] net: phy: at803x: remove set/get wol callbacks for AR8032 Since the AR8032 part does not support wol, remove related callbacks from it. Fixes: 5800091a2061 ("net: phy: at803x: add support for AR8032 PHY") Signed-off-by: Li Yang Cc: David Bauer Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 9c2c2e2ee94b..8a77ec33b417 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -2064,8 +2064,6 @@ static struct phy_driver at803x_driver[] = { .flags = PHY_POLL_CABLE_TEST, .config_init = at803x_config_init, .link_change_notify = at803x_link_change_notify, - .set_wol = at803x_set_wol, - .get_wol = at803x_get_wol, .suspend = at803x_suspend, .resume = at803x_resume, /* PHY_BASIC_FEATURES */ From b3d8aa84bbfe9b58ccc5332cacf8ea17200af310 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Sat, 29 Jul 2023 18:29:02 -0700 Subject: [PATCH 102/230] rust: allocator: Prevent mis-aligned allocation Currently the rust allocator simply passes the size of the type Layout to krealloc(), and in theory the alignment requirement from the type Layout may be larger than the guarantee provided by SLAB, which means the allocated object is mis-aligned. Fix this by adjusting the allocation size to the nearest power of two, which SLAB always guarantees a size-aligned allocation. And because Rust guarantees that the original size must be a multiple of alignment and the alignment must be a power of two, then the alignment requirement is satisfied. Suggested-by: Vlastimil Babka Co-developed-by: "Andreas Hindborg (Samsung)" Signed-off-by: "Andreas Hindborg (Samsung)" Signed-off-by: Boqun Feng Cc: stable@vger.kernel.org # v6.1+ Acked-by: Vlastimil Babka Fixes: 247b365dc8dc ("rust: add `kernel` crate") Link: https://github.com/Rust-for-Linux/linux/issues/974 Link: https://lore.kernel.org/r/20230730012905.643822-2-boqun.feng@gmail.com [ Applied rewording of comment as discussed in the mailing list. ] Signed-off-by: Miguel Ojeda --- rust/bindings/bindings_helper.h | 1 + rust/kernel/allocator.rs | 74 ++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 3e601ce2548d..058954961bfc 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -13,5 +13,6 @@ #include /* `bindgen` gets confused at certain things. */ +const size_t BINDINGS_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; const gfp_t BINDINGS_GFP_KERNEL = GFP_KERNEL; const gfp_t BINDINGS___GFP_ZERO = __GFP_ZERO; diff --git a/rust/kernel/allocator.rs b/rust/kernel/allocator.rs index 397a3dd57a9b..9363b527be66 100644 --- a/rust/kernel/allocator.rs +++ b/rust/kernel/allocator.rs @@ -9,6 +9,36 @@ use crate::bindings; struct KernelAllocator; +/// Calls `krealloc` with a proper size to alloc a new object aligned to `new_layout`'s alignment. +/// +/// # Safety +/// +/// - `ptr` can be either null or a pointer which has been allocated by this allocator. +/// - `new_layout` must have a non-zero size. +unsafe fn krealloc_aligned(ptr: *mut u8, new_layout: Layout, flags: bindings::gfp_t) -> *mut u8 { + // Customized layouts from `Layout::from_size_align()` can have size < align, so pad first. + let layout = new_layout.pad_to_align(); + + let mut size = layout.size(); + + if layout.align() > bindings::BINDINGS_ARCH_SLAB_MINALIGN { + // The alignment requirement exceeds the slab guarantee, thus try to enlarge the size + // to use the "power-of-two" size/alignment guarantee (see comments in `kmalloc()` for + // more information). + // + // Note that `layout.size()` (after padding) is guaranteed to be a multiple of + // `layout.align()`, so `next_power_of_two` gives enough alignment guarantee. + size = size.next_power_of_two(); + } + + // SAFETY: + // - `ptr` is either null or a pointer returned from a previous `k{re}alloc()` by the + // function safety requirement. + // - `size` is greater than 0 since it's either a `layout.size()` (which cannot be zero + // according to the function safety requirement) or a result from `next_power_of_two()`. + unsafe { bindings::krealloc(ptr as *const core::ffi::c_void, size, flags) as *mut u8 } +} + unsafe impl GlobalAlloc for KernelAllocator { unsafe fn alloc(&self, layout: Layout) -> *mut u8 { // `krealloc()` is used instead of `kmalloc()` because the latter is @@ -30,10 +60,20 @@ static ALLOCATOR: KernelAllocator = KernelAllocator; // to extract the object file that has them from the archive. For the moment, // let's generate them ourselves instead. // +// Note: Although these are *safe* functions, they are called by the compiler +// with parameters that obey the same `GlobalAlloc` function safety +// requirements: size and align should form a valid layout, and size is +// greater than 0. +// // Note that `#[no_mangle]` implies exported too, nowadays. #[no_mangle] -fn __rust_alloc(size: usize, _align: usize) -> *mut u8 { - unsafe { bindings::krealloc(core::ptr::null(), size, bindings::GFP_KERNEL) as *mut u8 } +fn __rust_alloc(size: usize, align: usize) -> *mut u8 { + // SAFETY: See assumption above. + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + + // SAFETY: `ptr::null_mut()` is null, per assumption above the size of `layout` is greater + // than 0. + unsafe { krealloc_aligned(ptr::null_mut(), layout, bindings::GFP_KERNEL) } } #[no_mangle] @@ -42,23 +82,27 @@ fn __rust_dealloc(ptr: *mut u8, _size: usize, _align: usize) { } #[no_mangle] -fn __rust_realloc(ptr: *mut u8, _old_size: usize, _align: usize, new_size: usize) -> *mut u8 { - unsafe { - bindings::krealloc( - ptr as *const core::ffi::c_void, - new_size, - bindings::GFP_KERNEL, - ) as *mut u8 - } +fn __rust_realloc(ptr: *mut u8, _old_size: usize, align: usize, new_size: usize) -> *mut u8 { + // SAFETY: See assumption above. + let new_layout = unsafe { Layout::from_size_align_unchecked(new_size, align) }; + + // SAFETY: Per assumption above, `ptr` is allocated by `__rust_*` before, and the size of + // `new_layout` is greater than 0. + unsafe { krealloc_aligned(ptr, new_layout, bindings::GFP_KERNEL) } } #[no_mangle] -fn __rust_alloc_zeroed(size: usize, _align: usize) -> *mut u8 { +fn __rust_alloc_zeroed(size: usize, align: usize) -> *mut u8 { + // SAFETY: See assumption above. + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + + // SAFETY: `ptr::null_mut()` is null, per assumption above the size of `layout` is greater + // than 0. unsafe { - bindings::krealloc( - core::ptr::null(), - size, + krealloc_aligned( + ptr::null_mut(), + layout, bindings::GFP_KERNEL | bindings::__GFP_ZERO, - ) as *mut u8 + ) } } From 1d24eb2d536ba27ef938a6563ac8bfb49c738cc1 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Thu, 6 Jul 2023 09:46:15 +0000 Subject: [PATCH 103/230] rust: delete `ForeignOwnable::borrow_mut` We discovered that the current design of `borrow_mut` is problematic. This patch removes it until a better solution can be found. Specifically, the current design gives you access to a `&mut T`, which lets you change where the `ForeignOwnable` points (e.g., with `core::mem::swap`). No upcoming user of this API intended to make that possible, making all of them unsound. Signed-off-by: Alice Ryhl Reviewed-by: Gary Guo Reviewed-by: Benno Lossin Reviewed-by: Martin Rodriguez Reboredo Fixes: 0fc4424d24a2 ("rust: types: introduce `ForeignOwnable`") Link: https://lore.kernel.org/r/20230706094615.3080784-1-aliceryhl@google.com Signed-off-by: Miguel Ojeda --- rust/kernel/sync/arc.rs | 3 +-- rust/kernel/types.rs | 22 ++-------------------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index a89843cacaad..172f563976a9 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -243,8 +243,7 @@ impl ForeignOwnable for Arc { let inner = NonNull::new(ptr as *mut ArcInner).unwrap(); // SAFETY: The safety requirements of `from_foreign` ensure that the object remains alive - // for the lifetime of the returned value. Additionally, the safety requirements of - // `ForeignOwnable::borrow_mut` ensure that no new mutable references are created. + // for the lifetime of the returned value. unsafe { ArcBorrow::new(inner) } } diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index 1e5380b16ed5..d479f8da8f38 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -35,34 +35,16 @@ pub trait ForeignOwnable: Sized { /// /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. - /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow_mut`] - /// for this object must have been dropped. unsafe fn borrow<'a>(ptr: *const core::ffi::c_void) -> Self::Borrowed<'a>; - /// Mutably borrows a foreign-owned object. - /// - /// # Safety - /// - /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for - /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. - /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] and - /// [`ForeignOwnable::borrow_mut`] for this object must have been dropped. - unsafe fn borrow_mut(ptr: *const core::ffi::c_void) -> ScopeGuard { - // SAFETY: The safety requirements ensure that `ptr` came from a previous call to - // `into_foreign`. - ScopeGuard::new_with_data(unsafe { Self::from_foreign(ptr) }, |d| { - d.into_foreign(); - }) - } - /// Converts a foreign-owned object back to a Rust-owned one. /// /// # Safety /// /// `ptr` must have been returned by a previous call to [`ForeignOwnable::into_foreign`] for /// which a previous matching [`ForeignOwnable::from_foreign`] hasn't been called yet. - /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] and - /// [`ForeignOwnable::borrow_mut`] for this object must have been dropped. + /// Additionally, all instances (if any) of values returned by [`ForeignOwnable::borrow`] for + /// this object must have been dropped. unsafe fn from_foreign(ptr: *const core::ffi::c_void) -> Self; } From b05544884300e98512964103b33f8f87650ce887 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 11 Jul 2023 09:19:14 +0200 Subject: [PATCH 104/230] rust: fix bindgen build error with UBSAN_BOUNDS_STRICT With commit 2d47c6956ab3 ("ubsan: Tighten UBSAN_BOUNDS on GCC") if CONFIG_UBSAN is enabled and gcc supports -fsanitize=bounds-strict, we can trigger the following build error due to bindgen lacking support for this additional build option: BINDGEN rust/bindings/bindings_generated.rs error: unsupported argument 'bounds-strict' to option '-fsanitize=' Fix by adding -fsanitize=bounds-strict to the list of skipped gcc flags for bindgen. Fixes: 2d47c6956ab3 ("ubsan: Tighten UBSAN_BOUNDS on GCC") Signed-off-by: Andrea Righi Acked-by: Kees Cook Reviewed-by: Martin Rodriguez Reboredo Link: https://lore.kernel.org/r/20230711071914.133946-1-andrea.righi@canonical.com Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index 7c9d9f11aec5..4124bfa01798 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -257,7 +257,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fno-partial-inlining -fplugin-arg-arm_ssp_per_task_plugin-% \ -fno-reorder-blocks -fno-allow-store-data-races -fasan-shadow-offset=% \ -fzero-call-used-regs=% -fno-stack-clash-protection \ - -fno-inline-functions-called-once \ + -fno-inline-functions-called-once -fsanitize=bounds-strict \ --param=% --param asan-% # Derived from `scripts/Makefile.clang`. From 045aecdfcb2e060db142d83a0f4082380c465d2c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 3 Aug 2023 19:33:21 +0100 Subject: [PATCH 105/230] arm64/ptrace: Don't enable SVE when setting streaming SVE Systems which implement SME without also implementing SVE are architecturally valid but were not initially supported by the kernel, unfortunately we missed one issue in the ptrace code. The SVE register setting code is shared between SVE and streaming mode SVE. When we set full SVE register state we currently enable TIF_SVE unconditionally, in the case where streaming SVE is being configured on a system that supports vanilla SVE this is not an issue since we always initialise enough state for both vector lengths but on a system which only support SME it will result in us attempting to restore the SVE vector length after having set streaming SVE registers. Fix this by making the enabling of SVE conditional on setting SVE vector state. If we set streaming SVE state and SVE was not already enabled this will result in a SVE access trap on next use of normal SVE, this will cause us to flush our register state but this is fine since the only way to trigger a SVE access trap would be to exit streaming mode which will cause the in register state to be flushed anyway. Fixes: e12310a0d30f ("arm64/sme: Implement ptrace support for streaming mode SVE registers") Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230803-arm64-fix-ptrace-ssve-no-sve-v1-1-49df214bfb3e@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ptrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 740e81e9db04..5b9b4305248b 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -932,11 +932,13 @@ static int sve_set_common(struct task_struct *target, /* * Ensure target->thread.sve_state is up to date with target's * FPSIMD regs, so that a short copyin leaves trailing - * registers unmodified. Always enable SVE even if going into - * streaming mode. + * registers unmodified. Only enable SVE if we are + * configuring normal SVE, a system with streaming SVE may not + * have normal SVE. */ fpsimd_sync_to_sve(target); - set_tsk_thread_flag(target, TIF_SVE); + if (type == ARM64_VEC_SVE) + set_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_SVE; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); From 507ea5dd92d23fcf10e4d1a68a443c86a49753ed Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 3 Aug 2023 19:33:22 +0100 Subject: [PATCH 106/230] arm64/fpsimd: Sync FPSIMD state with SVE for SME only systems Currently we guard FPSIMD/SVE state conversions with a check for the system supporting SVE but SME only systems may need to sync streaming mode SVE state so add a check for SME support too. These functions are only used by the ptrace code. Fixes: e12310a0d30f ("arm64/sme: Implement ptrace support for streaming mode SVE registers") Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230803-arm64-fix-ptrace-ssve-no-sve-v1-2-49df214bfb3e@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a61a1fd6492d..bfaa70cb040e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -679,7 +679,7 @@ static void fpsimd_to_sve(struct task_struct *task) void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); @@ -705,7 +705,7 @@ static void sve_to_fpsimd(struct task_struct *task) unsigned int i; __uint128_t const *p; - if (!system_supports_sve()) + if (!system_supports_sve() && !system_supports_sme()) return; vl = thread_get_cur_vl(&task->thread); From 69af56ae56a48a2522aad906c4461c6c7c092737 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 3 Aug 2023 19:33:23 +0100 Subject: [PATCH 107/230] arm64/fpsimd: Sync and zero pad FPSIMD state for streaming SVE We have a function sve_sync_from_fpsimd_zeropad() which is used by the ptrace code to update the SVE state when the user writes to the the FPSIMD register set. Currently this checks that the task has SVE enabled but this will miss updates for tasks which have streaming SVE enabled if SVE has not been enabled for the thread, also do the conversion if the task has streaming SVE enabled. Fixes: e12310a0d30f ("arm64/sme: Implement ptrace support for streaming mode SVE registers") Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230803-arm64-fix-ptrace-ssve-no-sve-v1-3-49df214bfb3e@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index bfaa70cb040e..75c37b1c55aa 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -835,7 +835,8 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE)) + if (!test_tsk_thread_flag(task, TIF_SVE) && + !thread_sm_enabled(&task->thread)) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); From 4e15a0ddc3ff40e8ea84032213976ecf774d7f77 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 4 Aug 2023 12:42:45 -0400 Subject: [PATCH 108/230] KVM: SEV: snapshot the GHCB before accessing it Validation of the GHCB is susceptible to time-of-check/time-of-use vulnerabilities. To avoid them, we would like to always snapshot the fields that are read in sev_es_validate_vmgexit(), and not use the GHCB anymore after it returns. This means: - invoking sev_es_sync_from_ghcb() before any GHCB access, including before sev_es_validate_vmgexit() - snapshotting all fields including the valid bitmap and the sw_scratch field, which are currently not caching anywhere. The valid bitmap is the first thing to be copied out of the GHCB; then, further accesses will use the copy in svm->sev_es. Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 69 +++++++++++++++++++++--------------------- arch/x86/kvm/svm/svm.h | 26 ++++++++++++++++ 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 07756b7348ae..e898f0b2b0ba 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2417,15 +2417,18 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb); + BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); + memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); - svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb); - if (ghcb_xcr0_is_valid(ghcb)) { + svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb); + + if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); kvm_update_cpuid_runtime(vcpu); } @@ -2436,6 +2439,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) control->exit_code_hi = upper_32_bits(exit_code); control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); + svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb); /* Clear the valid entries fields */ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); @@ -2464,56 +2468,56 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) reason = GHCB_ERR_MISSING_INPUT; - if (!ghcb_sw_exit_code_is_valid(ghcb) || - !ghcb_sw_exit_info_1_is_valid(ghcb) || - !ghcb_sw_exit_info_2_is_valid(ghcb)) + if (!kvm_ghcb_sw_exit_code_is_valid(svm) || + !kvm_ghcb_sw_exit_info_1_is_valid(svm) || + !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; switch (ghcb_get_sw_exit_code(ghcb)) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: - if (!ghcb_rax_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSC: break; case SVM_EXIT_RDPMC: - if (!ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_CPUID: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; if (ghcb_get_rax(ghcb) == 0xd) - if (!ghcb_xcr0_is_valid(ghcb)) + if (!kvm_ghcb_xcr0_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_INVD: break; case SVM_EXIT_IOIO: if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) { - if (!ghcb_sw_scratch_is_valid(ghcb)) + if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; } else { if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK)) - if (!ghcb_rax_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; } break; case SVM_EXIT_MSR: - if (!ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; if (ghcb_get_sw_exit_info_1(ghcb)) { - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rdx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; } break; case SVM_EXIT_VMMCALL: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_cpl_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_cpl_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSCP: @@ -2521,19 +2525,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_EXIT_WBINVD: break; case SVM_EXIT_MONITOR: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb) || - !ghcb_rdx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm) || + !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_MWAIT: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_MMIO_READ: case SVM_VMGEXIT_MMIO_WRITE: - if (!ghcb_sw_scratch_is_valid(ghcb)) + if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_NMI_COMPLETE: @@ -2563,9 +2567,6 @@ vmgexit_err: dump_ghcb(svm); } - /* Clear the valid entries fields */ - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); - ghcb_set_sw_exit_info_1(ghcb, 2); ghcb_set_sw_exit_info_2(ghcb, reason); @@ -2586,7 +2587,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) */ if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.sw_scratch, svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); svm->sev_es.ghcb_sa_sync = false; @@ -2637,7 +2638,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; - scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); + scratch_gpa_beg = svm->sev_es.sw_scratch; if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); goto e_scratch; @@ -2853,11 +2854,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) exit_code = ghcb_get_sw_exit_code(ghcb); + sev_es_sync_from_ghcb(svm); ret = sev_es_validate_vmgexit(svm); if (ret) return ret; - sev_es_sync_from_ghcb(svm); ghcb_set_sw_exit_info_1(ghcb, 0); ghcb_set_sw_exit_info_2(ghcb, 0); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 18af7e712a5a..8239c8de45ac 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -190,10 +190,12 @@ struct vcpu_sev_es_state { /* SEV-ES support */ struct sev_es_save_area *vmsa; struct ghcb *ghcb; + u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; /* SEV-ES scratch area support */ + u64 sw_scratch; void *ghcb_sa; u32 ghcb_sa_len; bool ghcb_sa_sync; @@ -744,4 +746,28 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm); void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +#define DEFINE_KVM_GHCB_ACCESSORS(field) \ + static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \ + { \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&svm->sev_es.valid_bitmap); \ + } \ + \ + static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \ + { \ + return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \ + } \ + +DEFINE_KVM_GHCB_ACCESSORS(cpl) +DEFINE_KVM_GHCB_ACCESSORS(rax) +DEFINE_KVM_GHCB_ACCESSORS(rcx) +DEFINE_KVM_GHCB_ACCESSORS(rdx) +DEFINE_KVM_GHCB_ACCESSORS(rbx) +DEFINE_KVM_GHCB_ACCESSORS(rsi) +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code) +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1) +DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2) +DEFINE_KVM_GHCB_ACCESSORS(sw_scratch) +DEFINE_KVM_GHCB_ACCESSORS(xcr0) + #endif From 7588dbcebcbf0193ab5b76987396d0254270b04a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 4 Aug 2023 12:56:36 -0400 Subject: [PATCH 109/230] KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e898f0b2b0ba..ca4ba5fe9a01 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2445,9 +2445,15 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { - struct kvm_vcpu *vcpu; + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; struct ghcb *ghcb; u64 exit_code; u64 reason; @@ -2458,7 +2464,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ - exit_code = ghcb_get_sw_exit_code(ghcb); + exit_code = kvm_ghcb_get_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ if (ghcb->ghcb_usage) { @@ -2473,7 +2479,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (ghcb_get_sw_exit_code(ghcb)) { + switch (exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: @@ -2490,18 +2496,18 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; - if (ghcb_get_rax(ghcb) == 0xd) + if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) if (!kvm_ghcb_xcr0_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_INVD: break; case SVM_EXIT_IOIO: - if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) { + if (control->exit_info_1 & SVM_IOIO_STR_MASK) { if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; } else { - if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK)) + if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; } @@ -2509,7 +2515,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_EXIT_MSR: if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; - if (ghcb_get_sw_exit_info_1(ghcb)) { + if (control->exit_info_1) { if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; @@ -2553,8 +2559,6 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: - vcpu = &svm->vcpu; - if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", ghcb->ghcb_usage); @@ -2852,8 +2856,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); - exit_code = ghcb_get_sw_exit_code(ghcb); - sev_es_sync_from_ghcb(svm); ret = sev_es_validate_vmgexit(svm); if (ret) @@ -2862,6 +2864,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ghcb_set_sw_exit_info_1(ghcb, 0); ghcb_set_sw_exit_info_2(ghcb, 0); + exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); From 63dbc67cf4ed11f94b2e8dde34b41438a3cb3d83 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 4 Aug 2023 13:01:43 -0400 Subject: [PATCH 110/230] KVM: SEV: remove ghcb variable declarations To avoid possible time-of-check/time-of-use issues, the GHCB should almost never be accessed outside dump_ghcb, sev_es_sync_to_ghcb and sev_es_sync_from_ghcb. The only legitimate uses are to set the exitinfo fields and to find the address of the scratch area embedded in the ghcb. Accessing ghcb_usage also goes through svm->sev_es.ghcb in sev_es_validate_vmgexit(), but that is because anyway the value is not used. Removing a shortcut variable that contains the value of svm->sev_es.ghcb makes these cases a bit more verbose, but it limits the chance of someone reading the ghcb by mistake. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ca4ba5fe9a01..d3aec1f2cad2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2454,12 +2454,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb; u64 exit_code; u64 reason; - ghcb = svm->sev_es.ghcb; - /* * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. @@ -2467,7 +2464,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) exit_code = kvm_ghcb_get_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ - if (ghcb->ghcb_usage) { + if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; goto vmgexit_err; } @@ -2561,7 +2558,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) vmgexit_err: if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", - ghcb->ghcb_usage); + svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", exit_code); @@ -2571,8 +2568,8 @@ vmgexit_err: dump_ghcb(svm); } - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, reason); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -2637,7 +2634,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; @@ -2713,8 +2709,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -2827,7 +2823,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_gpa, exit_code; - struct ghcb *ghcb; int ret; /* Validate the GHCB */ @@ -2852,17 +2847,16 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) } svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; - ghcb = svm->sev_es.ghcb_map.hva; - trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); + trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); ret = sev_es_validate_vmgexit(svm); if (ret) return ret; - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { @@ -2902,13 +2896,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); } ret = 1; From 797964253d358cf8d705614dda394dbe30120223 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Aug 2023 11:35:53 -0700 Subject: [PATCH 111/230] file: reinstate f_pos locking optimization for regular files In commit 20ea1e7d13c1 ("file: always lock position for FMODE_ATOMIC_POS") we ended up always taking the file pos lock, because pidfd_getfd() could get a reference to the file even when it didn't have an elevated file count due to threading of other sharing cases. But Mateusz Guzik reports that the extra locking is actually measurable, so let's re-introduce the optimization, and only force the locking for directory traversal. Directories need the lock for correctness reasons, while regular files only need it for "POSIX semantics". Since pidfd_getfd() is about debuggers etc special things that are _way_ outside of POSIX, we can relax the rules for that case. Reported-by: Mateusz Guzik Cc: Christian Brauner Link: https://lore.kernel.org/linux-fsdevel/20230803095311.ijpvhx3fyrbkasul@f/ Signed-off-by: Linus Torvalds --- fs/file.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 35c62b54c9d6..dbca26ef7a01 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1036,12 +1036,28 @@ unsigned long __fdget_raw(unsigned int fd) return __fget_light(fd, 0); } +/* + * Try to avoid f_pos locking. We only need it if the + * file is marked for FMODE_ATOMIC_POS, and it can be + * accessed multiple ways. + * + * Always do it for directories, because pidfd_getfd() + * can make a file accessible even if it otherwise would + * not be, and for directories this is a correctness + * issue, not a "POSIX requirement". + */ +static inline bool file_needs_f_pos_lock(struct file *file) +{ + return (file->f_mode & FMODE_ATOMIC_POS) && + (file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode)); +} + unsigned long __fdget_pos(unsigned int fd) { unsigned long v = __fdget(fd); struct file *file = (struct file *)(v & ~3); - if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (file && file_needs_f_pos_lock(file)) { v |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } From d5ad9aae13dcced333c1a7816ff0a4fbbb052466 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 4 Aug 2023 20:22:11 +0100 Subject: [PATCH 112/230] selftests/rseq: Fix build with undefined __weak Commit 3bcbc20942db ("selftests/rseq: Play nice with binaries statically linked against glibc 2.35+") which is now in Linus' tree introduced uses of __weak but did nothing to ensure that a definition is provided for it resulting in build failures for the rseq tests: rseq.c:41:1: error: unknown type name '__weak' __weak ptrdiff_t __rseq_offset; ^ rseq.c:41:17: error: expected ';' after top level declarator __weak ptrdiff_t __rseq_offset; ^ ; rseq.c:42:1: error: unknown type name '__weak' __weak unsigned int __rseq_size; ^ rseq.c:43:1: error: unknown type name '__weak' __weak unsigned int __rseq_flags; Fix this by using the definition from tools/include compiler.h. Fixes: 3bcbc20942db ("selftests/rseq: Play nice with binaries statically linked against glibc 2.35+") Signed-off-by: Mark Brown Message-Id: <20230804-kselftest-rseq-build-v1-1-015830b66aa9@kernel.org> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/rseq/Makefile | 4 +++- tools/testing/selftests/rseq/rseq.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index b357ba24af06..7a957c7d459a 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -4,8 +4,10 @@ ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) CLANG_FLAGS += -no-integrated-as endif +top_srcdir = ../../../.. + CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -L$(OUTPUT) -Wl,-rpath=./ \ - $(CLANG_FLAGS) + $(CLANG_FLAGS) -I$(top_srcdir)/tools/include LDLIBS += -lpthread -ldl # Own dependencies because we only want to build against 1st prerequisite, but diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index a723da253244..96e812bdf8a4 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -31,6 +31,8 @@ #include #include +#include + #include "../kselftest.h" #include "rseq.h" From 17ebf8a4c38b5481c29623f5e003fdf7583947f9 Mon Sep 17 00:00:00 2001 From: Xiang Yang Date: Thu, 3 Aug 2023 07:24:38 +0000 Subject: [PATCH 113/230] mptcp: fix the incorrect judgment for msk->cb_flags Coccicheck reports the error below: net/mptcp/protocol.c:3330:15-28: ERROR: test of a variable/field address Since the address of msk->cb_flags is used in __test_and_clear_bit, the address should not be NULL. The judgment for if (unlikely(msk->cb_flags)) will always be true, we should check the real value of msk->cb_flags here. Fixes: 65a569b03ca8 ("mptcp: optimize release_cb for the common case") Signed-off-by: Xiang Yang Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230803072438.1847500-1-xiangyang3@huawei.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3317d1cca156..be5f6f4b94c7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3328,7 +3328,7 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); - if (unlikely(&msk->cb_flags)) { + if (unlikely(msk->cb_flags)) { /* be sure to set the current sk state before tacking actions * depending on sk_state, that is processing MPTCP_ERROR_REPORT */ From a94c16a2fda010866b8858a386a8bfbeba4f72c5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Aug 2023 16:42:53 +0300 Subject: [PATCH 114/230] net: dsa: ocelot: call dsa_tag_8021q_unregister() under rtnl_lock() on driver remove When the tagging protocol in current use is "ocelot-8021q" and we unbind the driver, we see this splat: $ echo '0000:00:00.2' > /sys/bus/pci/drivers/fsl_enetc/unbind mscc_felix 0000:00:00.5 swp0: left promiscuous mode sja1105 spi2.0: Link is Down DSA: tree 1 torn down mscc_felix 0000:00:00.5 swp2: left promiscuous mode sja1105 spi2.2: Link is Down DSA: tree 3 torn down fsl_enetc 0000:00:00.2 eno2: left promiscuous mode mscc_felix 0000:00:00.5: Link is Down ------------[ cut here ]------------ RTNL: assertion failed at net/dsa/tag_8021q.c (409) WARNING: CPU: 1 PID: 329 at net/dsa/tag_8021q.c:409 dsa_tag_8021q_unregister+0x12c/0x1a0 Modules linked in: CPU: 1 PID: 329 Comm: bash Not tainted 6.5.0-rc3+ #771 pc : dsa_tag_8021q_unregister+0x12c/0x1a0 lr : dsa_tag_8021q_unregister+0x12c/0x1a0 Call trace: dsa_tag_8021q_unregister+0x12c/0x1a0 felix_tag_8021q_teardown+0x130/0x150 felix_teardown+0x3c/0xd8 dsa_tree_teardown_switches+0xbc/0xe0 dsa_unregister_switch+0x168/0x260 felix_pci_remove+0x30/0x60 pci_device_remove+0x4c/0x100 device_release_driver_internal+0x188/0x288 device_links_unbind_consumers+0xfc/0x138 device_release_driver_internal+0xe0/0x288 device_driver_detach+0x24/0x38 unbind_store+0xd8/0x108 drv_attr_store+0x30/0x50 ---[ end trace 0000000000000000 ]--- ------------[ cut here ]------------ RTNL: assertion failed at net/8021q/vlan_core.c (376) WARNING: CPU: 1 PID: 329 at net/8021q/vlan_core.c:376 vlan_vid_del+0x1b8/0x1f0 CPU: 1 PID: 329 Comm: bash Tainted: G W 6.5.0-rc3+ #771 pc : vlan_vid_del+0x1b8/0x1f0 lr : vlan_vid_del+0x1b8/0x1f0 dsa_tag_8021q_unregister+0x8c/0x1a0 felix_tag_8021q_teardown+0x130/0x150 felix_teardown+0x3c/0xd8 dsa_tree_teardown_switches+0xbc/0xe0 dsa_unregister_switch+0x168/0x260 felix_pci_remove+0x30/0x60 pci_device_remove+0x4c/0x100 device_release_driver_internal+0x188/0x288 device_links_unbind_consumers+0xfc/0x138 device_release_driver_internal+0xe0/0x288 device_driver_detach+0x24/0x38 unbind_store+0xd8/0x108 drv_attr_store+0x30/0x50 DSA: tree 0 torn down This was somewhat not so easy to spot, because "ocelot-8021q" is not the default tagging protocol, and thus, not everyone who tests the unbinding path may have switched to it beforehand. The default felix_tag_npi_teardown() does not require rtnl_lock() to be held. Fixes: 7c83a7c539ab ("net: dsa: add a second tagger for Ocelot switches based on tag_8021q") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230803134253.2711124-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 8da46d284e35..bef879c6d500 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1625,8 +1625,10 @@ static void felix_teardown(struct dsa_switch *ds) struct felix *felix = ocelot_to_felix(ocelot); struct dsa_port *dp; + rtnl_lock(); if (felix->tag_proto_ops) felix->tag_proto_ops->teardown(ds); + rtnl_unlock(); dsa_switch_for_each_available_port(dp, ds) ocelot_deinit_port(ocelot, dp->index); From 8a9896177784063d01068293caea3f74f6830ff6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Aug 2023 14:56:00 +0000 Subject: [PATCH 115/230] net/packet: annotate data-races around tp->status Another syzbot report [1] is about tp->status lockless reads from __packet_get_status() [1] BUG: KCSAN: data-race in __packet_rcv_has_room / __packet_set_status write to 0xffff888117d7c080 of 8 bytes by interrupt on cpu 0: __packet_set_status+0x78/0xa0 net/packet/af_packet.c:407 tpacket_rcv+0x18bb/0x1a60 net/packet/af_packet.c:2483 deliver_skb net/core/dev.c:2173 [inline] __netif_receive_skb_core+0x408/0x1e80 net/core/dev.c:5337 __netif_receive_skb_one_core net/core/dev.c:5491 [inline] __netif_receive_skb+0x57/0x1b0 net/core/dev.c:5607 process_backlog+0x21f/0x380 net/core/dev.c:5935 __napi_poll+0x60/0x3b0 net/core/dev.c:6498 napi_poll net/core/dev.c:6565 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6698 __do_softirq+0xc1/0x265 kernel/softirq.c:571 invoke_softirq kernel/softirq.c:445 [inline] __irq_exit_rcu+0x57/0xa0 kernel/softirq.c:650 sysvec_apic_timer_interrupt+0x6d/0x80 arch/x86/kernel/apic/apic.c:1106 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:645 smpboot_thread_fn+0x33c/0x4a0 kernel/smpboot.c:112 kthread+0x1d7/0x210 kernel/kthread.c:379 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 read to 0xffff888117d7c080 of 8 bytes by interrupt on cpu 1: __packet_get_status net/packet/af_packet.c:436 [inline] packet_lookup_frame net/packet/af_packet.c:524 [inline] __tpacket_has_room net/packet/af_packet.c:1255 [inline] __packet_rcv_has_room+0x3f9/0x450 net/packet/af_packet.c:1298 tpacket_rcv+0x275/0x1a60 net/packet/af_packet.c:2285 deliver_skb net/core/dev.c:2173 [inline] dev_queue_xmit_nit+0x38a/0x5e0 net/core/dev.c:2243 xmit_one net/core/dev.c:3574 [inline] dev_hard_start_xmit+0xcf/0x3f0 net/core/dev.c:3594 __dev_queue_xmit+0xefb/0x1d10 net/core/dev.c:4244 dev_queue_xmit include/linux/netdevice.h:3088 [inline] can_send+0x4eb/0x5d0 net/can/af_can.c:276 bcm_can_tx+0x314/0x410 net/can/bcm.c:302 bcm_tx_timeout_handler+0xdb/0x260 __run_hrtimer kernel/time/hrtimer.c:1685 [inline] __hrtimer_run_queues+0x217/0x700 kernel/time/hrtimer.c:1749 hrtimer_run_softirq+0xd6/0x120 kernel/time/hrtimer.c:1766 __do_softirq+0xc1/0x265 kernel/softirq.c:571 run_ksoftirqd+0x17/0x20 kernel/softirq.c:939 smpboot_thread_fn+0x30a/0x4a0 kernel/smpboot.c:164 kthread+0x1d7/0x210 kernel/kthread.c:379 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0000000000000000 -> 0x0000000020000081 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 19 Comm: ksoftirqd/1 Not tainted 6.4.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/27/2023 Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230803145600.2937518-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a4631cb457a9..a2935bd18ed9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -401,18 +401,20 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; + /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_status = status; + WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: - h.h2->tp_status = status; + WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: - h.h3->tp_status = status; + WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: @@ -429,17 +431,19 @@ static int __packet_get_status(const struct packet_sock *po, void *frame) smp_rmb(); + /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ + h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); - return h.h1->tp_status; + return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); - return h.h2->tp_status; + return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); - return h.h3->tp_status; + return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); From 6a7ac3d20593865209dceb554d8b3f094c6bd940 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Aug 2023 17:26:49 +0200 Subject: [PATCH 116/230] tunnels: fix kasan splat when generating ipv4 pmtu error If we try to emit an icmp error in response to a nonliner skb, we get BUG: KASAN: slab-out-of-bounds in ip_compute_csum+0x134/0x220 Read of size 4 at addr ffff88811c50db00 by task iperf3/1691 CPU: 2 PID: 1691 Comm: iperf3 Not tainted 6.5.0-rc3+ #309 [..] kasan_report+0x105/0x140 ip_compute_csum+0x134/0x220 iptunnel_pmtud_build_icmp+0x554/0x1020 skb_tunnel_check_pmtu+0x513/0xb80 vxlan_xmit_one+0x139e/0x2ef0 vxlan_xmit+0x1867/0x2760 dev_hard_start_xmit+0x1ee/0x4f0 br_dev_queue_push_xmit+0x4d1/0x660 [..] ip_compute_csum() cannot deal with nonlinear skbs, so avoid it. After this change, splat is gone and iperf3 is no longer stuck. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230803152653.29535-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 92c02c886fe7..586b1b3e35b8 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -224,7 +224,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) .un.frag.__unused = 0, .un.frag.mtu = htons(mtu), }; - icmph->checksum = ip_compute_csum(icmph, len); + icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0)); skb_reset_transport_header(skb); niph = skb_push(skb, sizeof(*niph)); From 136a1b434bbb90c5e50831646ad0680c744c79bb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Aug 2023 17:26:50 +0200 Subject: [PATCH 117/230] selftests: net: test vxlan pmtu exceptions with tcp TCP might get stuck if a nonlinear skb exceeds the path MTU, icmp error contains an incorrect icmp checksum in that case. Extend the existing test for vxlan to also send at least 1MB worth of data via TCP in addition to the existing 'large icmp packet adds route exception'. On my test VM this fails due to 0-size output file without "tunnels: fix kasan splat when generating ipv4 pmtu error". Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230803152653.29535-3-fw@strlen.de Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 35 +++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index dfe3d287f01d..f838dd370f6a 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -361,6 +361,7 @@ err_buf= tcpdump_pids= nettest_pids= socat_pids= +tmpoutfile= err() { err_buf="${err_buf}${1} @@ -951,6 +952,7 @@ cleanup() { ip link del veth_A-R1 2>/dev/null ovs-vsctl --if-exists del-port vxlan_a 2>/dev/null ovs-vsctl --if-exists del-br ovs_br0 2>/dev/null + rm -f "$tmpoutfile" } mtu() { @@ -1328,6 +1330,39 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() { check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on bridged ${type} interface" pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on locally bridged ${type} interface" + + tmpoutfile=$(mktemp) + + # Flush Exceptions, retry with TCP + run_cmd ${ns_a} ip route flush cached ${dst} + run_cmd ${ns_b} ip route flush cached ${dst} + run_cmd ${ns_c} ip route flush cached ${dst} + + for target in "${ns_a}" "${ns_c}" ; do + if [ ${family} -eq 4 ]; then + TCPDST=TCP:${dst}:50000 + else + TCPDST="TCP:[${dst}]:50000" + fi + ${ns_b} socat -T 3 -u -6 TCP-LISTEN:50000 STDOUT > $tmpoutfile & + + sleep 1 + + dd if=/dev/zero of=/dev/stdout status=none bs=1M count=1 | ${target} socat -T 3 -u STDIN $TCPDST,connect-timeout=3 + + size=$(du -sb $tmpoutfile) + size=${size%%/tmp/*} + + [ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1 + done + + rm -f "$tmpoutfile" + + # Check that exceptions were created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "tcp: exceeding link layer MTU on bridged ${type} interface" + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "tcp exceeding link layer MTU on locally bridged ${type} interface" } test_pmtu_ipv4_br_vxlan4_exception() { From aaf2123a5cf46dbd97f84b6eee80269758064d93 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Thu, 3 Aug 2023 18:27:27 +0200 Subject: [PATCH 118/230] selftests: mptcp: join: fix 'delete and re-add' test mptcp_join 'delete and re-add' test fails when using ip mptcp: $ ./mptcp_join.sh -iI 002 delete and re-add before delete[ ok ] mptcp_info subflows=1 [ ok ] Error: argument "ADDRESS" is wrong: invalid for non-zero id address after delete[fail] got 2:2 subflows expected 1 This happens because endpoint delete includes an ip address while id is not 0, contrary to what is indicated in the ip mptcp man page: "When used with the delete id operation, an IFADDR is only included when the ID is 0." This fixes the issue using the $addr variable in pm_nl_del_endpoint() only when id is 0. Fixes: 34aa6e3bccd8 ("selftests: mptcp: add ip mptcp wrappers") Cc: stable@vger.kernel.org Signed-off-by: Andrea Claudi Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230803-upstream-net-20230803-misc-fixes-6-5-v1-1-6671b1ab11cc@tessares.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 3c2096ac97ef..067fabc401f1 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -705,6 +705,7 @@ pm_nl_del_endpoint() local addr=$3 if [ $ip_mptcp -eq 1 ]; then + [ $id -ne 0 ] && addr='' ip -n $ns mptcp endpoint delete id $id $addr else ip netns exec $ns ./pm_nl_ctl del $id $addr From c8c101ae390a3e817369e94a6f12a1ddea420702 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Thu, 3 Aug 2023 18:27:28 +0200 Subject: [PATCH 119/230] selftests: mptcp: join: fix 'implicit EP' test mptcp_join 'implicit EP' test currently fails when using ip mptcp: $ ./mptcp_join.sh -iI 001 implicit EP creation[fail] expected '10.0.2.2 10.0.2.2 id 1 implicit' found '10.0.2.2 id 1 rawflags 10 ' Error: too many addresses or duplicate one: -22. ID change is prevented[fail] expected '10.0.2.2 10.0.2.2 id 1 implicit' found '10.0.2.2 id 1 rawflags 10 ' modif is allowed[fail] expected '10.0.2.2 10.0.2.2 id 1 signal' found '10.0.2.2 id 1 signal ' This happens because of two reasons: - iproute v6.3.0 does not support the implicit flag, fixed with iproute2-next commit 3a2535a41854 ("mptcp: add support for implicit flag") - pm_nl_check_endpoint wrongly expects the ip address to be repeated two times in iproute output, and does not account for a final whitespace in it. This fixes the issue trimming the whitespace in the output string and removing the double address in the expected string. Fixes: 69c6ce7b6eca ("selftests: mptcp: add implicit endpoint test case") Cc: stable@vger.kernel.org Signed-off-by: Andrea Claudi Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230803-upstream-net-20230803-misc-fixes-6-5-v1-2-6671b1ab11cc@tessares.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 067fabc401f1..d01b73a8ed0f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -796,10 +796,11 @@ pm_nl_check_endpoint() fi if [ $ip_mptcp -eq 1 ]; then + # get line and trim trailing whitespace line=$(ip -n $ns mptcp endpoint show $id) + line="${line% }" # the dump order is: address id flags port dev - expected_line="$addr" - [ -n "$addr" ] && expected_line="$expected_line $addr" + [ -n "$addr" ] && expected_line="$addr" expected_line="$expected_line $id" [ -n "$_flags" ] && expected_line="$expected_line ${_flags//","/" "}" [ -n "$dev" ] && expected_line="$expected_line $dev" From ff18f9ef30ee87740f741b964375d0cfb84e1ec2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 3 Aug 2023 18:27:29 +0200 Subject: [PATCH 120/230] mptcp: avoid bogus reset on fallback close Since the blamed commit, the MPTCP protocol unconditionally sends TCP resets on all the subflows on disconnect(). That fits full-blown MPTCP sockets - to implement the fastclose mechanism - but causes unexpected corruption of the data stream, caught as sporadic self-tests failures. Fixes: d21f83485518 ("mptcp: use fastclose on more edge scenarios") Cc: stable@vger.kernel.org Tested-by: Matthieu Baerts Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/419 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230803-upstream-net-20230803-misc-fixes-6-5-v1-3-6671b1ab11cc@tessares.net Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index be5f6f4b94c7..d80658547836 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2335,7 +2335,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (flags & MPTCP_CF_FASTCLOSE) { + if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_disconnect() path, * to generate the egress reset */ From 511b90e39250135a7f900f1c3afbce25543018a2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 3 Aug 2023 18:27:30 +0200 Subject: [PATCH 121/230] mptcp: fix disconnect vs accept race Despite commit 0ad529d9fd2b ("mptcp: fix possible divide by zero in recvmsg()"), the mptcp protocol is still prone to a race between disconnect() (or shutdown) and accept. The root cause is that the mentioned commit checks the msk-level flag, but mptcp_stream_accept() does acquire the msk-level lock, as it can rely directly on the first subflow lock. As reported by Christoph than can lead to a race where an msk socket is accepted after that mptcp_subflow_queue_clean() releases the listener socket lock and just before it takes destructive actions leading to the following splat: BUG: kernel NULL pointer dereference, address: 0000000000000012 PGD 5a4ca067 P4D 5a4ca067 PUD 37d4c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP CPU: 2 PID: 10955 Comm: syz-executor.5 Not tainted 6.5.0-rc1-gdc7b257ee5dd #37 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 RIP: 0010:mptcp_stream_accept+0x1ee/0x2f0 include/net/inet_sock.h:330 Code: 0a 09 00 48 8b 1b 4c 39 e3 74 07 e8 bc 7c 7f fe eb a1 e8 b5 7c 7f fe 4c 8b 6c 24 08 eb 05 e8 a9 7c 7f fe 49 8b 85 d8 09 00 00 <0f> b6 40 12 88 44 24 07 0f b6 6c 24 07 bf 07 00 00 00 89 ee e8 89 RSP: 0018:ffffc90000d07dc0 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff888037e8d020 RCX: ffff88803b093300 RDX: 0000000000000000 RSI: ffffffff833822c5 RDI: ffffffff8333896a RBP: 0000607f82031520 R08: ffff88803b093300 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000003e83 R12: ffff888037e8d020 R13: ffff888037e8c680 R14: ffff888009af7900 R15: ffff888009af6880 FS: 00007fc26d708640(0000) GS:ffff88807dd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000012 CR3: 0000000066bc5001 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: do_accept+0x1ae/0x260 net/socket.c:1872 __sys_accept4+0x9b/0x110 net/socket.c:1913 __do_sys_accept4 net/socket.c:1954 [inline] __se_sys_accept4 net/socket.c:1951 [inline] __x64_sys_accept4+0x20/0x30 net/socket.c:1951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x47/0xa0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Address the issue by temporary removing the pending request socket from the accept queue, so that racing accept() can't touch them. After depleting the msk - the ssk still exists, as plain TCP sockets, re-insert them into the accept queue, so that later inet_csk_listen_stop() will complete the tcp socket disposal. Fixes: 2a6a870e44dd ("mptcp: stops worker on unaccepted sockets at listener close") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/423 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230803-upstream-net-20230803-misc-fixes-6-5-v1-4-6671b1ab11cc@tessares.net Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 1 - net/mptcp/subflow.c | 60 ++++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 37fbe22e2433..ba2a873a4d2e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -325,7 +325,6 @@ struct mptcp_sock { u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; - struct mptcp_sock *dl_next; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9ee3b7abbaf6..94ae7dd01c65 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1793,34 +1793,21 @@ static void subflow_state_change(struct sock *sk) void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; - struct mptcp_sock *msk, *next, *head = NULL; - struct request_sock *req; - struct sock *sk; + struct request_sock *req, *head, *tail; + struct mptcp_subflow_context *subflow; + struct sock *sk, *ssk; - /* build a list of all unaccepted mptcp sockets */ + /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. + * Splice the req list, so that accept() can not reach the pending ssk after + * the listener socket is released below. + */ spin_lock_bh(&queue->rskq_lock); - for (req = queue->rskq_accept_head; req; req = req->dl_next) { - struct mptcp_subflow_context *subflow; - struct sock *ssk = req->sk; - - if (!sk_is_mptcp(ssk)) - continue; - - subflow = mptcp_subflow_ctx(ssk); - if (!subflow || !subflow->conn) - continue; - - /* skip if already in list */ - sk = subflow->conn; - msk = mptcp_sk(sk); - if (msk->dl_next || msk == head) - continue; - - sock_hold(sk); - msk->dl_next = head; - head = msk; - } + head = queue->rskq_accept_head; + tail = queue->rskq_accept_tail; + queue->rskq_accept_head = NULL; + queue->rskq_accept_tail = NULL; spin_unlock_bh(&queue->rskq_lock); + if (!head) return; @@ -1829,13 +1816,19 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s */ release_sock(listener_ssk); - for (msk = head; msk; msk = next) { - sk = (struct sock *)msk; + for (req = head; req; req = req->dl_next) { + ssk = req->sk; + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + sk = subflow->conn; + sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - next = msk->dl_next; - msk->dl_next = NULL; - __mptcp_unaccepted_force_close(sk); release_sock(sk); @@ -1859,6 +1852,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s /* we are still under the listener msk socket lock */ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); + + /* restore the listener queue, to let the TCP code clean it up */ + spin_lock_bh(&queue->rskq_lock); + WARN_ON_ONCE(queue->rskq_accept_head); + queue->rskq_accept_head = head; + queue->rskq_accept_tail = tail; + spin_unlock_bh(&queue->rskq_lock); } static int subflow_ulp_init(struct sock *sk) From a47e598fbd8617967e49d85c49c22f9fc642704c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Aug 2023 16:30:21 +0000 Subject: [PATCH 122/230] dccp: fix data-race around dp->dccps_mss_cache dccp_sendmsg() reads dp->dccps_mss_cache before locking the socket. Same thing in do_dccp_getsockopt(). Add READ_ONCE()/WRITE_ONCE() annotations, and change dccp_sendmsg() to check again dccps_mss_cache after socket is locked. Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Reported-by: syzbot Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230803163021.2958262-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/dccp/output.c | 2 +- net/dccp/proto.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/net/dccp/output.c b/net/dccp/output.c index b8a24734385e..fd2eb148d24d 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -187,7 +187,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; - dp->dccps_mss_cache = cur_mps; + WRITE_ONCE(dp->dccps_mss_cache, cur_mps); return cur_mps; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index f331e5977a84..4e3266e4d7c3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -630,7 +630,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: - val = dp->dccps_mss_cache; + val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); @@ -739,7 +739,7 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) trace_dccp_probe(sk, len); - if (len > dp->dccps_mss_cache) + if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); @@ -772,6 +772,12 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out_discard; } + /* We need to check dccps_mss_cache after socket is locked. */ + if (len > dp->dccps_mss_cache) { + rc = -EMSGSIZE; + goto out_discard; + } + skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) From 5aa4fda5aa9c2a5a7bac67b4a12b089ab81fee3c Mon Sep 17 00:00:00 2001 From: Long Li Date: Sat, 29 Jul 2023 11:36:18 +0800 Subject: [PATCH 123/230] ksmbd: validate command request size In commit 2b9b8f3b68ed ("ksmbd: validate command payload size"), except for SMB2_OPLOCK_BREAK_HE command, the request size of other commands is not checked, it's not expected. Fix it by add check for request size of other commands. Cc: stable@vger.kernel.org Fixes: 2b9b8f3b68ed ("ksmbd: validate command payload size") Acked-by: Namjae Jeon Signed-off-by: Long Li Signed-off-by: Steve French --- fs/smb/server/smb2misc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 33b7e6c4ceff..e881df1d10cb 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -380,13 +380,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { - if (command == SMB2_OPLOCK_BREAK_HE && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + if (!(command == SMB2_OPLOCK_BREAK_HE && + (le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_20 || + le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_21))) { /* special case for SMB2.1 lease break message */ ksmbd_debug(SMB, - "Illegal request size %d for oplock break\n", - le16_to_cpu(pdu->StructureSize2)); + "Illegal request size %u for command %d\n", + le16_to_cpu(pdu->StructureSize2), command); return 1; } } From 79ed288cef201f1f212dfb934bcaac75572fb8f6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 6 Aug 2023 08:44:17 +0900 Subject: [PATCH 124/230] ksmbd: fix wrong next length validation of ea buffer in smb2_set_ea() There are multiple smb2_ea_info buffers in FILE_FULL_EA_INFORMATION request from client. ksmbd find next smb2_ea_info using ->NextEntryOffset of current smb2_ea_info. ksmbd need to validate buffer length Before accessing the next ea. ksmbd should check buffer length using buf_len, not next variable. next is the start offset of current ea that got from previous ea. Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-21598 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 9849d7489345..7cc1b0c47d0a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2324,9 +2324,16 @@ next: break; buf_len -= next; eabuf = (struct smb2_ea_info *)((char *)eabuf + next); - if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength)) + if (buf_len < sizeof(struct smb2_ea_info)) { + rc = -EINVAL; break; + } + if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + + le16_to_cpu(eabuf->EaValueLength)) { + rc = -EINVAL; + break; + } } while (next != 0); kfree(attr_name); From 6b47808f223c70ff564f9b363446d2a5fa1e05b2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Aug 2023 15:59:51 -0700 Subject: [PATCH 125/230] net: tls: avoid discarding data on record close TLS records end with a 16B tag. For TLS device offload we only need to make space for this tag in the stream, the device will generate and replace it with the actual calculated tag. Long time ago the code would just re-reference the head frag which mostly worked but was suboptimal because it prevented TCP from combining the record into a single skb frag. I'm not sure if it was correct as the first frag may be shorter than the tag. The commit under fixes tried to replace that with using the page frag and if the allocation failed rolling back the data, if record was long enough. It achieves better fragment coalescing but is also buggy. We don't roll back the iterator, so unless we're at the end of send we'll skip the data we designated as tag and start the next record as if the rollback never happened. There's also the possibility that the record was constructed with MSG_MORE and the data came from a different syscall and we already told the user space that we "got it". Allocate a single dummy page and use it as fallback. Found by code inspection, and proven by forcing allocation failures. Fixes: e7b159a48ba6 ("net/tls: remove the record tail optimization") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_device.c | 64 +++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 2021fe557e50..529101eb20bd 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,6 +52,8 @@ static LIST_HEAD(tls_device_list); static LIST_HEAD(tls_device_down_list); static DEFINE_SPINLOCK(tls_device_lock); +static struct page *dummy_page; + static void tls_device_free_ctx(struct tls_context *ctx) { if (ctx->tx_conf == TLS_HW) { @@ -312,36 +314,33 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_device_record_close(struct sock *sk, - struct tls_context *ctx, - struct tls_record_info *record, - struct page_frag *pfrag, - unsigned char record_type) +static void tls_device_record_close(struct sock *sk, + struct tls_context *ctx, + struct tls_record_info *record, + struct page_frag *pfrag, + unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; - int ret; + struct page_frag dummy_tag_frag; /* append tag * device will fill in the tag, we just need to append a placeholder * use socket memory to improve coalescing (re-using a single buffer * increases frag count) - * if we can't allocate memory now, steal some back from data + * if we can't allocate memory now use the dummy page */ - if (likely(skb_page_frag_refill(prot->tag_size, pfrag, - sk->sk_allocation))) { - ret = 0; - tls_append_frag(record, pfrag, prot->tag_size); - } else { - ret = prot->tag_size; - if (record->len <= prot->overhead_size) - return -ENOMEM; + if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && + !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { + dummy_tag_frag.page = dummy_page; + dummy_tag_frag.offset = 0; + pfrag = &dummy_tag_frag; } + tls_append_frag(record, pfrag, prot->tag_size); /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, record_type); - return ret; } static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, @@ -541,18 +540,8 @@ last_record: if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { - rc = tls_device_record_close(sk, tls_ctx, record, - pfrag, record_type); - if (rc) { - if (rc > 0) { - size += rc; - } else { - size = orig_size; - destroy_record(record); - ctx->open_record = NULL; - break; - } - } + tls_device_record_close(sk, tls_ctx, record, + pfrag, record_type); rc = tls_push_record(sk, tls_ctx, @@ -1450,14 +1439,26 @@ int __init tls_device_init(void) { int err; - destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); - if (!destruct_wq) + dummy_page = alloc_page(GFP_KERNEL); + if (!dummy_page) return -ENOMEM; + destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); + if (!destruct_wq) { + err = -ENOMEM; + goto err_free_dummy; + } + err = register_netdevice_notifier(&tls_dev_notifier); if (err) - destroy_workqueue(destruct_wq); + goto err_destroy_wq; + return 0; + +err_destroy_wq: + destroy_workqueue(destruct_wq); +err_free_dummy: + put_page(dummy_page); return err; } @@ -1466,4 +1467,5 @@ void __exit tls_device_cleanup(void) unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); + put_page(dummy_page); } From 32d0a49d36a2a306c2e47fe5659361e424f0ed3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 17:26:52 +0000 Subject: [PATCH 126/230] macsec: use DEV_STATS_INC() syzbot/KCSAN reported data-races in macsec whenever dev->stats fields are updated. It appears all of these updates can happen from multiple cpus. Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/macsec.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 984dfa5d6c11..144ec756c796 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -743,7 +743,7 @@ static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); - secy->netdev->stats.rx_dropped++; + DEV_STATS_INC(secy->netdev, rx_dropped); return false; } @@ -767,7 +767,7 @@ static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u rxsc_stats->stats.InPktsNotValid++; u64_stats_update_end(&rxsc_stats->syncp); this_cpu_inc(rx_sa->stats->InPktsNotValid); - secy->netdev->stats.rx_errors++; + DEV_STATS_INC(secy->netdev, rx_errors); return false; } @@ -1069,7 +1069,7 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoTag++; u64_stats_update_end(&secy_stats->syncp); - macsec->secy.netdev->stats.rx_dropped++; + DEV_STATS_INC(macsec->secy.netdev, rx_dropped); continue; } @@ -1179,7 +1179,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsBadTag++; u64_stats_update_end(&secy_stats->syncp); - secy->netdev->stats.rx_errors++; + DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } @@ -1196,7 +1196,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotUsingSA++; u64_stats_update_end(&rxsc_stats->syncp); - secy->netdev->stats.rx_errors++; + DEV_STATS_INC(secy->netdev, rx_errors); if (active_rx_sa) this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA); goto drop_nosa; @@ -1230,7 +1230,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); - macsec->secy.netdev->stats.rx_dropped++; + DEV_STATS_INC(macsec->secy.netdev, rx_dropped); goto drop; } } @@ -1271,7 +1271,7 @@ deliver: if (ret == NET_RX_SUCCESS) count_rx(dev, len); else - macsec->secy.netdev->stats.rx_dropped++; + DEV_STATS_INC(macsec->secy.netdev, rx_dropped); rcu_read_unlock(); @@ -1308,7 +1308,7 @@ nosci: u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoSCI++; u64_stats_update_end(&secy_stats->syncp); - macsec->secy.netdev->stats.rx_errors++; + DEV_STATS_INC(macsec->secy.netdev, rx_errors); continue; } @@ -1327,7 +1327,7 @@ nosci: secy_stats->stats.InPktsUnknownSCI++; u64_stats_update_end(&secy_stats->syncp); } else { - macsec->secy.netdev->stats.rx_dropped++; + DEV_STATS_INC(macsec->secy.netdev, rx_dropped); } } @@ -3422,7 +3422,7 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, if (!secy->operational) { kfree_skb(skb); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } @@ -3430,7 +3430,7 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, skb = macsec_encrypt(skb, dev); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EINPROGRESS) - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } @@ -3667,9 +3667,9 @@ static void macsec_get_stats64(struct net_device *dev, dev_fetch_sw_netstats(s, dev->tstats); - s->rx_dropped = dev->stats.rx_dropped; - s->tx_dropped = dev->stats.tx_dropped; - s->rx_errors = dev->stats.rx_errors; + s->rx_dropped = atomic_long_read(&dev->stats.__rx_dropped); + s->tx_dropped = atomic_long_read(&dev->stats.__tx_dropped); + s->rx_errors = atomic_long_read(&dev->stats.__rx_errors); } static int macsec_get_iflink(const struct net_device *dev) From a0fc452a5d7fed986205539259df1d60546f536c Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 6 Aug 2023 02:11:58 +1000 Subject: [PATCH 127/230] open: make RESOLVE_CACHED correctly test for O_TMPFILE O_TMPFILE is actually __O_TMPFILE|O_DIRECTORY. This means that the old fast-path check for RESOLVE_CACHED would reject all users passing O_DIRECTORY with -EAGAIN, when in fact the intended test was to check for __O_TMPFILE. Cc: stable@vger.kernel.org # v5.12+ Fixes: 99668f618062 ("fs: expose LOOKUP_CACHED through openat2() RESOLVE_CACHED") Signed-off-by: Aleksa Sarai Message-Id: <20230806-resolve_cached-o_tmpfile-v1-1-7ba16308465e@cyphar.com> Signed-off-by: Christian Brauner --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 0c55c8e7f837..e6ead0f19964 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1322,7 +1322,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ - if (flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } From 0a2c2baafa312ac4cec4f0bababedab3f971f224 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Aug 2023 10:49:31 -0700 Subject: [PATCH 128/230] proc: fix missing conversion to 'iterate_shared' I'm looking at the directory handling due to the discussion about f_pos locking (see commit 797964253d35: "file: reinstate f_pos locking optimization for regular files"), and wanting to clean that up. And one source of ugliness is how we were supposed to move filesystems over to the '->iterate_shared()' function that only takes the inode lock for reading many many years ago, but several filesystems still use the bad old '->iterate()' that takes the inode lock for exclusive access. See commit 6192269444eb ("introduce a parallel variant of ->iterate()") that also added some documentation stating Old method is only used if the new one is absent; eventually it will be removed. Switch while you still can; the old one won't stay. and that was back in April 2016. Here we are, many years later, and the old version is still clearly sadly alive and well. Now, some of those old style iterators are probably just because the filesystem may end up having per-inode mutable data that it uses for iterating a directory, but at least one case is just a mistake. Al switched over most filesystems to use '->iterate_shared()' back when it was introduced. In particular, the /proc filesystem was converted as one of the first ones in commit f50752eaa0b0 ("switch all procfs directories ->iterate_shared()"). But then later one new user of '->iterate()' was then re-introduced by commit 6d9c939dbe4d ("procfs: add smack subdir to attrs"). And that's clearly not what we wanted, since that new case just uses the same 'proc_pident_readdir()' and 'proc_pident_lookup()' helper functions that other /proc pident directories use, and they are most definitely safe to use with the inode lock held shared. So just fix it. This still leaves a fair number of oddball filesystems using the old-style directory iterator (ceph, coda, exfat, jfs, ntfs, ocfs2, overlayfs, and vboxsf), but at least we don't have any remaining in the core filesystems. I'm going to add a wrapper function that just drops the read-lock and takes it as a write lock, so that we can clean up the core vfs layer and make all the ugly 'this filesystem needs exclusive inode locking' be just filesystem-internal warts. I just didn't want to make that conversion when we still had a core user left. Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 05452c3b9872..9df3f4839662 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2817,7 +2817,7 @@ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ - .iterate = proc_##LSM##_attr_dir_iterate, \ + .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ From 3e3271549670783be20e233a2b78a87a0b04c715 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Aug 2023 12:25:01 -0700 Subject: [PATCH 129/230] vfs: get rid of old '->iterate' directory operation All users now just use '->iterate_shared()', which only takes the directory inode lock for reading. Filesystems that never got convered to shared mode now instead use a wrapper that drops the lock, re-takes it in write mode, calls the old function, and then downgrades the lock back to read mode. This way the VFS layer and other callers no longer need to care about filesystems that never got converted to the modern era. The filesystems that use the new wrapper are ceph, coda, exfat, jfs, ntfs, ocfs2, overlayfs, and vboxsf. Honestly, several of them look like they really could just iterate their directories in shared mode and skip the wrapper entirely, but the point of this change is to not change semantics or fix filesystems that haven't been fixed in the last 7+ years, but to finally get rid of the dual iterators. Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 5 +- Documentation/filesystems/porting.rst | 25 ++++------ fs/ceph/dir.c | 5 +- fs/coda/dir.c | 20 +++----- fs/exfat/dir.c | 3 +- fs/exportfs/expfs.c | 2 +- fs/jfs/namei.c | 3 +- fs/ntfs/dir.c | 3 +- fs/ocfs2/file.c | 5 +- fs/overlayfs/readdir.c | 3 +- fs/readdir.c | 68 ++++++++++++++++++++------- fs/vboxsf/dir.c | 3 +- include/linux/fs.h | 8 +++- 13 files changed, 95 insertions(+), 58 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index ed148919e11a..0ca479dbb1cd 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -551,9 +551,8 @@ mutex or just to use i_size_read() instead. Note: this does not protect the file->f_pos against concurrent modifications since this is something the userspace has to take care about. -->iterate() is called with i_rwsem exclusive. - -->iterate_shared() is called with i_rwsem at least shared. +->iterate_shared() is called with i_rwsem held for reading, and with the +file f_pos_lock held exclusively ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. Most instances call fasync_helper(), which does that maintenance, so it's diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index d2d684ae7798..0f5da78ef4f9 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -537,7 +537,7 @@ vfs_readdir() is gone; switch to iterate_dir() instead **mandatory** -->readdir() is gone now; switch to ->iterate() +->readdir() is gone now; switch to ->iterate_shared() **mandatory** @@ -693,24 +693,19 @@ parallel now. --- -**recommended** +**mandatory** -->iterate_shared() is added; it's a parallel variant of ->iterate(). +->iterate_shared() is added. Exclusion on struct file level is still provided (as well as that between it and lseek on the same struct file), but if your directory has been opened several times, you can get these called in parallel. Exclusion between that method and all directory-modifying ones is still provided, of course. -Often enough ->iterate() can serve as ->iterate_shared() without any -changes - it is a read-only operation, after all. If you have any -per-inode or per-dentry in-core data structures modified by ->iterate(), -you might need something to serialize the access to them. If you -do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for -that; look for in-tree examples. - -Old method is only used if the new one is absent; eventually it will -be removed. Switch while you still can; the old one won't stay. +If you have any per-inode or per-dentry in-core data structures modified +by ->iterate_shared(), you might need something to serialize the access +to them. If you do dcache pre-seeding, you'll need to switch to +d_alloc_parallel() for that; look for in-tree examples. --- @@ -930,9 +925,9 @@ should be done by looking at FMODE_LSEEK in file->f_mode. filldir_t (readdir callbacks) calling conventions have changed. Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). Rationale: -callers never looked at specific -E... values anyway. ->iterate() and -->iterate_shared() instance require no changes at all, all filldir_t ones in -the tree converted. +callers never looked at specific -E... values anyway. -> iterate_shared() +instances require no changes at all, all filldir_t ones in the tree +converted. --- diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4a2b39d9a61a..bdcffb04513f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2019,9 +2019,10 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) } } +WRAP_DIR_ITER(ceph_readdir) // FIXME! const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, @@ -2033,7 +2034,7 @@ const struct file_operations ceph_dir_fops = { }; const struct file_operations ceph_snapdir_fops = { - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 8450b1bd354b..1b960de2bf39 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -429,21 +429,14 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; - if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { + if (host_file->f_op->iterate_shared) { struct inode *host_inode = file_inode(host_file); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - if (host_file->f_op->iterate_shared) { - inode_lock_shared(host_inode); - ret = host_file->f_op->iterate_shared(host_file, ctx); - file_accessed(host_file); - inode_unlock_shared(host_inode); - } else { - inode_lock(host_inode); - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); - inode_unlock(host_inode); - } + inode_lock_shared(host_inode); + ret = host_file->f_op->iterate_shared(host_file, ctx); + file_accessed(host_file); + inode_unlock_shared(host_inode); } return ret; } @@ -585,10 +578,11 @@ const struct inode_operations coda_dir_inode_operations = { .setattr = coda_setattr, }; +WRAP_DIR_ITER(coda_readdir) // FIXME! const struct file_operations coda_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = coda_readdir, + .iterate_shared = shared_coda_readdir, .open = coda_open, .release = coda_release, .fsync = coda_fsync, diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 598081d0d059..e1586bba6d86 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -306,10 +306,11 @@ out: return err; } +WRAP_DIR_ITER(exfat_iterate) // FIXME! const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = exfat_iterate, + .iterate_shared = shared_exfat_iterate, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 40e624cf7e92..d1dbe47c7975 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -315,7 +315,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->iterate && !file->f_op->iterate_shared) + if (!file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9b030297aa64..e98ddb2b1cf2 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1535,9 +1535,10 @@ const struct inode_operations jfs_dir_inode_operations = { #endif }; +WRAP_DIR_ITER(jfs_readdir) // FIXME! const struct file_operations jfs_dir_operations = { .read = generic_read_dir, - .iterate = jfs_readdir, + .iterate_shared = shared_jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, .compat_ioctl = compat_ptr_ioctl, diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 518c3a21a556..4596c90e7b7c 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1525,10 +1525,11 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, #endif /* NTFS_RW */ +WRAP_DIR_ITER(ntfs_readdir) // FIXME! const struct file_operations ntfs_dir_ops = { .llseek = generic_file_llseek, /* Seek inside directory. */ .read = generic_read_dir, /* Return -EISDIR. */ - .iterate = ntfs_readdir, /* Read directory contents. */ + .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ #ifdef NTFS_RW .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ #endif /* NTFS_RW */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 91a194596552..bf2c17ea96a0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2793,10 +2793,11 @@ const struct file_operations ocfs2_fops = { .remap_file_range = ocfs2_remap_file_range, }; +WRAP_DIR_ITER(ocfs2_readdir) // FIXME! const struct file_operations ocfs2_dops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2842,7 +2843,7 @@ const struct file_operations ocfs2_fops_no_plocks = { const struct file_operations ocfs2_dops_no_plocks = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index ee5c4736480f..de39e067ae65 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -954,10 +954,11 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return 0; } +WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, - .iterate = ovl_iterate, + .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, diff --git a/fs/readdir.c b/fs/readdir.c index b264ce60114d..c8c46e294431 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -24,6 +24,53 @@ #include +/* + * Some filesystems were never converted to '->iterate_shared()' + * and their directory iterators want the inode lock held for + * writing. This wrapper allows for converting from the shared + * semantics to the exclusive inode use. + */ +int wrap_directory_iterator(struct file *file, + struct dir_context *ctx, + int (*iter)(struct file *, struct dir_context *)) +{ + struct inode *inode = file_inode(file); + int ret; + + /* + * We'd love to have an 'inode_upgrade_trylock()' operation, + * see the comment in mmap_upgrade_trylock() in mm/memory.c. + * + * But considering this is for "filesystems that never got + * converted", it really doesn't matter. + * + * Also note that since we have to return with the lock held + * for reading, we can't use the "killable()" locking here, + * since we do need to get the lock even if we're dying. + * + * We could do the write part killably and then get the read + * lock unconditionally if it mattered, but see above on why + * this does the very simplistic conversion. + */ + up_read(&inode->i_rwsem); + down_write(&inode->i_rwsem); + + /* + * Since we dropped the inode lock, we should do the + * DEADDIR test again. See 'iterate_dir()' below. + * + * Note that we don't need to re-do the f_pos games, + * since the file must be locked wrt f_pos anyway. + */ + ret = -ENOENT; + if (!IS_DEADDIR(inode)) + ret = iter(file, ctx); + + downgrade_write(&inode->i_rwsem); + return ret; +} +EXPORT_SYMBOL(wrap_directory_iterator); + /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. @@ -40,39 +87,28 @@ int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - bool shared = false; int res = -ENOTDIR; - if (file->f_op->iterate_shared) - shared = true; - else if (!file->f_op->iterate) + + if (!file->f_op->iterate_shared) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; - if (shared) - res = down_read_killable(&inode->i_rwsem); - else - res = down_write_killable(&inode->i_rwsem); + res = down_read_killable(&inode->i_rwsem); if (res) goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; - if (shared) - res = file->f_op->iterate_shared(file, ctx); - else - res = file->f_op->iterate(file, ctx); + res = file->f_op->iterate_shared(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } - if (shared) - inode_unlock_shared(inode); - else - inode_unlock(inode); + inode_unlock_shared(inode); out: return res; } diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 075f15c43c78..5f1a14d5b927 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -179,9 +179,10 @@ static int vboxsf_dir_iterate(struct file *dir, struct dir_context *ctx) return 0; } +WRAP_DIR_ITER(vboxsf_dir_iterate) // FIXME! const struct file_operations vboxsf_dir_fops = { .open = vboxsf_dir_open, - .iterate = vboxsf_dir_iterate, + .iterate_shared = shared_vboxsf_dir_iterate, .release = vboxsf_dir_release, .read = generic_read_dir, .llseek = generic_file_llseek, diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..562f2623c9c9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1780,7 +1780,6 @@ struct file_operations { ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); - int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); @@ -1817,6 +1816,13 @@ struct file_operations { unsigned int poll_flags); } __randomize_layout; +/* Wrap a directory iterator that needs exclusive inode access */ +int wrap_directory_iterator(struct file *, struct dir_context *, + int (*) (struct file *, struct dir_context *)); +#define WRAP_DIR_ITER(x) \ + static int shared_##x(struct file *file , struct dir_context *ctx) \ + { return wrap_directory_iterator(file, ctx, x); } + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); From 7d84d1b9af6366aa9df1b523bdb7e002372e38d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 6 Aug 2023 14:49:35 +0200 Subject: [PATCH 130/230] fs: rely on ->iterate_shared to determine f_pos locking Now that we removed ->iterate we don't need to check for either ->iterate or ->iterate_shared in file_needs_f_pos_lock(). Simply check for ->iterate_shared instead. This will tell us whether we need to unconditionally take the lock. Not just does it allow us to avoid checking f_inode's mode it also actually clearly shows that we're locking because of readdir. Signed-off-by: Christian Brauner --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index dbca26ef7a01..3fd003a8604f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1049,7 +1049,7 @@ unsigned long __fdget_raw(unsigned int fd) static inline bool file_needs_f_pos_lock(struct file *file) { return (file->f_mode & FMODE_ATOMIC_POS) && - (file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode)); + (file_count(file) > 1 || file->f_op->iterate_shared); } unsigned long __fdget_pos(unsigned int fd) From b1c936e9af5dd08636d568736fc6075ed9d1d529 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Fri, 4 Aug 2023 18:53:36 +0300 Subject: [PATCH 131/230] drivers: vxlan: vnifilter: free percpu vni stats on error path In case rhashtable_lookup_insert_fast() fails inside vxlan_vni_add(), the allocated percpu vni stats are not freed on the error path. Introduce vxlan_vni_free() which would work as a nice wrapper to free vxlan_vni_node resources properly. Found by Linux Verification Center (linuxtesting.org). Fixes: 4095e0e1328a ("drivers: vxlan: vnifilter: per vni stats") Suggested-by: Ido Schimmel Signed-off-by: Fedor Pchelkin Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_vnifilter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index a3de081cda5e..c3ff30ab782e 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -713,6 +713,12 @@ static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan, return vninode; } +static void vxlan_vni_free(struct vxlan_vni_node *vninode) +{ + free_percpu(vninode->stats); + kfree(vninode); +} + static int vxlan_vni_add(struct vxlan_dev *vxlan, struct vxlan_vni_group *vg, u32 vni, union vxlan_addr *group, @@ -740,7 +746,7 @@ static int vxlan_vni_add(struct vxlan_dev *vxlan, &vninode->vnode, vxlan_vni_rht_params); if (err) { - kfree(vninode); + vxlan_vni_free(vninode); return err; } @@ -763,8 +769,7 @@ static void vxlan_vni_node_rcu_free(struct rcu_head *rcu) struct vxlan_vni_node *v; v = container_of(rcu, struct vxlan_vni_node, rcu); - free_percpu(v->stats); - kfree(v); + vxlan_vni_free(v); } static int vxlan_vni_del(struct vxlan_dev *vxlan, From 52417a95ff2d810dc31a68ae71102e741efea772 Mon Sep 17 00:00:00 2001 From: Nitya Sunkad Date: Fri, 4 Aug 2023 13:56:22 -0700 Subject: [PATCH 132/230] ionic: Add missing err handling for queue reconfig ionic_start_queues_reconfig returns an error code if txrx_init fails. Handle this error code in the relevant places. This fixes a corner case where the device could get left in a detached state if the CMB reconfig fails and the attempt to clean up the mess also fails. Note that calling netif_device_attach when the netdev is already attached does not lead to unexpected behavior. Change goto name "errout" to "err_out" to maintain consistency across goto statements. Fixes: 40bc471dc714 ("ionic: add tx/rx-push support with device Component Memory Buffers") Fixes: 6f7d6f0fd7a3 ("ionic: pull reset_queues into tx_timeout handler") Signed-off-by: Nitya Sunkad Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- .../net/ethernet/pensando/ionic/ionic_lif.c | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 612b0015dc43..432fb93aa801 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1817,6 +1817,7 @@ static int ionic_change_mtu(struct net_device *netdev, int new_mtu) static void ionic_tx_timeout_work(struct work_struct *ws) { struct ionic_lif *lif = container_of(ws, struct ionic_lif, tx_timeout_work); + int err; if (test_bit(IONIC_LIF_F_FW_RESET, lif->state)) return; @@ -1829,8 +1830,11 @@ static void ionic_tx_timeout_work(struct work_struct *ws) mutex_lock(&lif->queue_lock); ionic_stop_queues_reconfig(lif); - ionic_start_queues_reconfig(lif); + err = ionic_start_queues_reconfig(lif); mutex_unlock(&lif->queue_lock); + + if (err) + dev_err(lif->ionic->dev, "%s: Restarting queues failed\n", __func__); } static void ionic_tx_timeout(struct net_device *netdev, unsigned int txqueue) @@ -2800,17 +2804,22 @@ static int ionic_cmb_reconfig(struct ionic_lif *lif, if (err) { dev_err(lif->ionic->dev, "CMB restore failed: %d\n", err); - goto errout; + goto err_out; } } - ionic_start_queues_reconfig(lif); - } else { - /* This was detached in ionic_stop_queues_reconfig() */ - netif_device_attach(lif->netdev); + err = ionic_start_queues_reconfig(lif); + if (err) { + dev_err(lif->ionic->dev, + "CMB reconfig failed: %d\n", err); + goto err_out; + } } -errout: +err_out: + /* This was detached in ionic_stop_queues_reconfig() */ + netif_device_attach(lif->netdev); + return err; } From 0a46781c89dece85386885a407244ca26e5c1c44 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 12 Jul 2023 18:26:45 +0530 Subject: [PATCH 133/230] dmaengine: mcf-edma: Fix a potential un-allocated memory access When 'mcf_edma' is allocated, some space is allocated for a flexible array at the end of the struct. 'chans' item are allocated, that is to say 'pdata->dma_channels'. Then, this number of item is stored in 'mcf_edma->n_chans'. A few lines later, if 'mcf_edma->n_chans' is 0, then a default value of 64 is set. This ends to no space allocated by devm_kzalloc() because chans was 0, but 64 items are read and/or written in some not allocated memory. Change the logic to define a default value before allocating the memory. Fixes: e7a3ff92eaf1 ("dmaengine: fsl-edma: add ColdFire mcf5441x edma support") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/f55d914407c900828f6fad3ea5fa791a5f17b9a4.1685172449.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/dma/mcf-edma.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/dma/mcf-edma.c b/drivers/dma/mcf-edma.c index ebd8733f72ad..9413fad08a60 100644 --- a/drivers/dma/mcf-edma.c +++ b/drivers/dma/mcf-edma.c @@ -190,7 +190,13 @@ static int mcf_edma_probe(struct platform_device *pdev) return -EINVAL; } - chans = pdata->dma_channels; + if (!pdata->dma_channels) { + dev_info(&pdev->dev, "setting default channel number to 64"); + chans = 64; + } else { + chans = pdata->dma_channels; + } + len = sizeof(*mcf_edma) + sizeof(*mcf_chan) * chans; mcf_edma = devm_kzalloc(&pdev->dev, len, GFP_KERNEL); if (!mcf_edma) @@ -202,11 +208,6 @@ static int mcf_edma_probe(struct platform_device *pdev) mcf_edma->drvdata = &mcf_data; mcf_edma->big_endian = 1; - if (!mcf_edma->n_chans) { - dev_info(&pdev->dev, "setting default channel number to 64"); - mcf_edma->n_chans = 64; - } - mutex_init(&mcf_edma->fsl_edma_mutex); mcf_edma->membase = devm_platform_ioremap_resource(pdev, 0); From e2dcbc330f46afb82fd49a6dcbb10f6cdcb466ec Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 7 Jul 2023 13:50:03 -0600 Subject: [PATCH 134/230] dmaengine: qcom_hidma: Update codeaurora email domain The codeaurora.org email domain is defunct and will bounce. Update entries to Sinan's kernel.org address which is the address in MAINTAINERS for this component. Signed-off-by: Jeffrey Hugo Acked-By: Sinan Kaya Link: https://lore.kernel.org/r/20230707195003.6619-1-quic_jhugo@quicinc.com Signed-off-by: Vinod Koul --- .../ABI/testing/sysfs-platform-hidma | 2 +- .../ABI/testing/sysfs-platform-hidma-mgmt | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-platform-hidma b/Documentation/ABI/testing/sysfs-platform-hidma index fca40a54df59..a80aeda85ef6 100644 --- a/Documentation/ABI/testing/sysfs-platform-hidma +++ b/Documentation/ABI/testing/sysfs-platform-hidma @@ -2,7 +2,7 @@ What: /sys/devices/platform/hidma-*/chid /sys/devices/platform/QCOM8061:*/chid Date: Dec 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains the ID of the channel within the HIDMA instance. It is used to associate a given HIDMA channel with the diff --git a/Documentation/ABI/testing/sysfs-platform-hidma-mgmt b/Documentation/ABI/testing/sysfs-platform-hidma-mgmt index 3b6c5c9eabdc..0373745b4e18 100644 --- a/Documentation/ABI/testing/sysfs-platform-hidma-mgmt +++ b/Documentation/ABI/testing/sysfs-platform-hidma-mgmt @@ -2,7 +2,7 @@ What: /sys/devices/platform/hidma-mgmt*/chanops/chan*/priority /sys/devices/platform/QCOM8060:*/chanops/chan*/priority Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains either 0 or 1 and indicates if the DMA channel is a low priority (0) or high priority (1) channel. @@ -11,7 +11,7 @@ What: /sys/devices/platform/hidma-mgmt*/chanops/chan*/weight /sys/devices/platform/QCOM8060:*/chanops/chan*/weight Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains 0..15 and indicates the weight of the channel among equal priority channels during round robin scheduling. @@ -20,7 +20,7 @@ What: /sys/devices/platform/hidma-mgmt*/chreset_timeout_cycles /sys/devices/platform/QCOM8060:*/chreset_timeout_cycles Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains the platform specific cycle value to wait after a reset command is issued. If the value is chosen too short, @@ -32,7 +32,7 @@ What: /sys/devices/platform/hidma-mgmt*/dma_channels /sys/devices/platform/QCOM8060:*/dma_channels Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains the number of dma channels supported by one instance of HIDMA hardware. The value may change from chip to chip. @@ -41,7 +41,7 @@ What: /sys/devices/platform/hidma-mgmt*/hw_version_major /sys/devices/platform/QCOM8060:*/hw_version_major Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Version number major for the hardware. @@ -49,7 +49,7 @@ What: /sys/devices/platform/hidma-mgmt*/hw_version_minor /sys/devices/platform/QCOM8060:*/hw_version_minor Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Version number minor for the hardware. @@ -57,7 +57,7 @@ What: /sys/devices/platform/hidma-mgmt*/max_rd_xactions /sys/devices/platform/QCOM8060:*/max_rd_xactions Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains a value between 0 and 31. Maximum number of read transactions that can be issued back to back. @@ -69,7 +69,7 @@ What: /sys/devices/platform/hidma-mgmt*/max_read_request /sys/devices/platform/QCOM8060:*/max_read_request Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Size of each read request. The value needs to be a power of two and can be between 128 and 1024. @@ -78,7 +78,7 @@ What: /sys/devices/platform/hidma-mgmt*/max_wr_xactions /sys/devices/platform/QCOM8060:*/max_wr_xactions Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Contains a value between 0 and 31. Maximum number of write transactions that can be issued back to back. @@ -91,7 +91,7 @@ What: /sys/devices/platform/hidma-mgmt*/max_write_request /sys/devices/platform/QCOM8060:*/max_write_request Date: Nov 2015 KernelVersion: 4.4 -Contact: "Sinan Kaya " +Contact: "Sinan Kaya " Description: Size of each write request. The value needs to be a power of two and can be between 128 and 1024. From 8cda3ececf07d374774f6a13e5a94bc2dc04c26c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 26 May 2023 13:54:34 +0300 Subject: [PATCH 135/230] dmaengine: pl330: Return DMA_PAUSED when transaction is paused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pl330_pause() does not set anything to indicate paused condition which causes pl330_tx_status() to return DMA_IN_PROGRESS. This breaks 8250 DMA flush after the fix in commit 57e9af7831dc ("serial: 8250_dma: Fix DMA Rx rearm race"). The function comment for pl330_pause() claims pause is supported but resume is not which is enough for 8250 DMA flush to work as long as DMA status reports DMA_PAUSED when appropriate. Add PAUSED state for descriptor and mark BUSY descriptors with PAUSED in pl330_pause(). Return DMA_PAUSED from pl330_tx_status() when the descriptor is PAUSED. Reported-by: Richard Tresidder Tested-by: Richard Tresidder Fixes: 88987d2c7534 ("dmaengine: pl330: add DMA_PAUSE feature") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-serial/f8a86ecd-64b1-573f-c2fa-59f541083f1a@electromag.com.au/ Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20230526105434.14959-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Vinod Koul --- drivers/dma/pl330.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index b4731fe6bbc1..3cf0b38387ae 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -403,6 +403,12 @@ enum desc_status { * of a channel can be BUSY at any time. */ BUSY, + /* + * Pause was called while descriptor was BUSY. Due to hardware + * limitations, only termination is possible for descriptors + * that have been paused. + */ + PAUSED, /* * Sitting on the channel work_list but xfer done * by PL330 core @@ -2041,7 +2047,7 @@ static inline void fill_queue(struct dma_pl330_chan *pch) list_for_each_entry(desc, &pch->work_list, node) { /* If already submitted */ - if (desc->status == BUSY) + if (desc->status == BUSY || desc->status == PAUSED) continue; ret = pl330_submit_req(pch->thread, desc); @@ -2326,6 +2332,7 @@ static int pl330_pause(struct dma_chan *chan) { struct dma_pl330_chan *pch = to_pchan(chan); struct pl330_dmac *pl330 = pch->dmac; + struct dma_pl330_desc *desc; unsigned long flags; pm_runtime_get_sync(pl330->ddma.dev); @@ -2335,6 +2342,10 @@ static int pl330_pause(struct dma_chan *chan) _stop(pch->thread); spin_unlock(&pl330->lock); + list_for_each_entry(desc, &pch->work_list, node) { + if (desc->status == BUSY) + desc->status = PAUSED; + } spin_unlock_irqrestore(&pch->lock, flags); pm_runtime_mark_last_busy(pl330->ddma.dev); pm_runtime_put_autosuspend(pl330->ddma.dev); @@ -2425,7 +2436,7 @@ pl330_tx_status(struct dma_chan *chan, dma_cookie_t cookie, else if (running && desc == running) transferred = pl330_get_current_xferred_count(pch, desc); - else if (desc->status == BUSY) + else if (desc->status == BUSY || desc->status == PAUSED) /* * Busy but not running means either just enqueued, * or finished and not yet marked done @@ -2442,6 +2453,9 @@ pl330_tx_status(struct dma_chan *chan, dma_cookie_t cookie, case DONE: ret = DMA_COMPLETE; break; + case PAUSED: + ret = DMA_PAUSED; + break; case PREP: case BUSY: ret = DMA_IN_PROGRESS; From 863676fe1ac1b82fc9eb56c242e80acfbfc18b76 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 12 Jul 2023 12:35:05 -0700 Subject: [PATCH 136/230] dmaengine: idxd: Clear PRS disable flag when disabling IDXD device Disabling IDXD device doesn't reset Page Request Service (PRS) disable flag to its initial value 0. This may cause user confusion because once PRS is disabled user will see PRS still remains the previous setting (i.e. disabled) via sysfs interface even after the device is disabled. To eliminate user confusion, reset PRS disable flag to ensure that the PRS flag bit reflects correct state after the device is disabled. Additionally, simplify the code by setting wq->flags to 0, which clears all flag bits, including any future additions. Fixes: f2dc327131b5 ("dmaengine: idxd: add per wq PRS disable") Tested-by: Tony Zhu Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230712193505.3440752-1-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 5abbcc61c528..9a15f0d12c79 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -384,9 +384,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->threshold = 0; wq->priority = 0; wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; - clear_bit(WQ_FLAG_DEDICATED, &wq->flags); - clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); - clear_bit(WQ_FLAG_ATS_DISABLE, &wq->flags); + wq->flags = 0; memset(wq->name, 0, WQ_NAME_SIZE); wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER; idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH); From 74d7221c1f9c9f3a8c316a3557ca7dca8b99d14c Mon Sep 17 00:00:00 2001 From: Zhang Jianhua Date: Sat, 22 Jul 2023 15:32:44 +0000 Subject: [PATCH 137/230] dmaengine: owl-dma: Modify mismatched function name No functional modification involved. drivers/dma/owl-dma.c:208: warning: expecting prototype for struct owl_dma_pchan. Prototype was for struct owl_dma_vchan instead HDRTEST usr/include/sound/asequencer.h Fixes: 47e20577c24d ("dmaengine: Add Actions Semi Owl family S900 DMA driver") Signed-off-by: Zhang Jianhua Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20230722153244.2086949-1-chris.zjh@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/owl-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/owl-dma.c b/drivers/dma/owl-dma.c index 95a462a1f511..b6e0ac8314e5 100644 --- a/drivers/dma/owl-dma.c +++ b/drivers/dma/owl-dma.c @@ -192,7 +192,7 @@ struct owl_dma_pchan { }; /** - * struct owl_dma_pchan - Wrapper for DMA ENGINE channel + * struct owl_dma_vchan - Wrapper for DMA ENGINE channel * @vc: wrapped virtual channel * @pchan: the physical channel utilized by this channel * @txd: active transaction on this channel From 96891e90d1256b569b1c183e7c9a0cfc568fa3b0 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 31 Jul 2023 12:14:39 +0200 Subject: [PATCH 138/230] dmaengine: xilinx: xdma: Fix interrupt vector setting A couple of hardware registers need to be set to reflect which interrupts have been allocated to the device. Each register is 32-bit wide and can receive four 8-bit values. If we provide any other interrupt number than four, the irq_num variable will never be 0 within the while check and the while block will loop forever. There is an easy way to prevent this: just break the for loop when we reach "irq_num == 0", which anyway means all interrupts have been processed. Cc: stable@vger.kernel.org Fixes: 17ce252266c7 ("dmaengine: xilinx: xdma: Add xilinx xdma driver") Signed-off-by: Miquel Raynal Acked-by: Lizhi Hou Link: https://lore.kernel.org/r/20230731101442.792514-2-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index ad5ff63354cf..5116188b9977 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -668,6 +668,8 @@ static int xdma_set_vector_reg(struct xdma_device *xdev, u32 vec_tbl_start, val |= irq_start << shift; irq_start++; irq_num--; + if (!irq_num) + break; } /* write IRQ register */ From 422dbc66b7702ae797326d5480c3c9b6467053da Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 31 Jul 2023 12:14:40 +0200 Subject: [PATCH 139/230] dmaengine: xilinx: xdma: Fix typo Probably a copy/paste error with the previous block, here we are actually managing C2H IRQs. Fixes: 17ce252266c7 ("dmaengine: xilinx: xdma: Add xilinx xdma driver") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20230731101442.792514-3-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 5116188b9977..e0bfd129d563 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -717,7 +717,7 @@ static int xdma_irq_init(struct xdma_device *xdev) ret = request_irq(irq, xdma_channel_isr, 0, "xdma-c2h-channel", &xdev->c2h_chans[j]); if (ret) { - xdma_err(xdev, "H2C channel%d request irq%d failed: %d", + xdma_err(xdev, "C2H channel%d request irq%d failed: %d", j, irq, ret); goto failed_init_c2h; } From 52a93d39b17dc7eb98b6aa3edb93943248e03b2f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Aug 2023 15:07:51 -0700 Subject: [PATCH 140/230] Linux 6.5-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 653238528aac..6bbf9db6b414 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From 5a15d8348881e9371afdf9f5357a135489496955 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 7 Aug 2023 10:46:04 +0200 Subject: [PATCH 141/230] x86/srso: Tie SBPB bit setting to microcode patch detection The SBPB bit in MSR_IA32_PRED_CMD is supported only after a microcode patch has been applied so set X86_FEATURE_SBPB only then. Otherwise, guests would attempt to set that bit and #GP on the MSR write. While at it, make SMT detection more robust as some guests - depending on how and what CPUID leafs their report - lead to cpu_smt_control getting set to CPU_SMT_NOT_SUPPORTED but SRSO_NO should be set for any guest incarnation where one simply cannot do SMT, for whatever reason. Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Reported-by: Konrad Rzeszutek Wilk Reported-by: Salvatore Bonaccorso Signed-off-by: Borislav Petkov (AMD) --- arch/x86/kernel/cpu/amd.c | 19 ++++++++++++------- arch/x86/kernel/cpu/bugs.c | 7 +++---- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 834f310b2f1a..41e10c26efb5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1238,14 +1238,19 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf); bool cpu_has_ibpb_brtype_microcode(void) { - u8 fam = boot_cpu_data.x86; - + switch (boot_cpu_data.x86) { /* Zen1/2 IBPB flushes branch type predictions too. */ - if (fam == 0x17) + case 0x17: return boot_cpu_has(X86_FEATURE_AMD_IBPB); - /* Poke the MSR bit on Zen3/4 to check its presence. */ - else if (fam == 0x19) - return !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB); - else + case 0x19: + /* Poke the MSR bit on Zen3/4 to check its presence. */ + if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + setup_force_cpu_cap(X86_FEATURE_SBPB); + return true; + } else { + return false; + } + default: return false; + } } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7314a6bdc862..2afe69bff218 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2265,14 +2265,13 @@ static void __init srso_select_mitigation(void) * flags for guests. */ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - setup_force_cpu_cap(X86_FEATURE_SBPB); /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. */ if ((boot_cpu_data.x86 < 0x19) && - (cpu_smt_control == CPU_SMT_DISABLED)) + (!cpu_smt_possible() || (cpu_smt_control == CPU_SMT_DISABLED))) setup_force_cpu_cap(X86_FEATURE_SRSO_NO); } @@ -2345,8 +2344,8 @@ static void __init srso_select_mitigation(void) pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); pred_cmd: - if (boot_cpu_has(X86_FEATURE_SRSO_NO) || - srso_cmd == SRSO_CMD_OFF) + if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) && + boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; } From 2cbd80642b76480c9b0697297af917d9388a0b46 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 4 Aug 2023 22:17:32 +0200 Subject: [PATCH 142/230] gfs2: Fix freeze consistency check in gfs2_trans_add_meta Function gfs2_trans_add_meta() checks for the SDF_FROZEN flag to make sure that no buffers are added to a transaction while the filesystem is frozen. With the recent freeze/thaw rework, the SDF_FROZEN flag is cleared after thaw_super() is called, which is sufficient for serializing freeze/thaw. However, other filesystem operations started after thaw_super() may now be calling gfs2_trans_add_meta() before the SDF_FROZEN flag is cleared, which will trigger the SDF_FROZEN check in gfs2_trans_add_meta(). Fix that by checking the s_writers.frozen state instead. In addition, make sure not to call gfs2_assert_withdraw() with the sd_log_lock spin lock held. Check for a withdrawn filesystem before checking for a frozen filesystem, and don't pin/add buffers to the current transaction in case of a failure in either case. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/trans.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ec1631257978..7e835be7032d 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -230,9 +230,11 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct super_block *sb = sdp->sd_vfs; struct gfs2_bufdata *bd; struct gfs2_meta_header *mh; struct gfs2_trans *tr = current->journal_info; + bool withdraw = false; lock_buffer(bh); if (buffer_pinned(bh)) { @@ -266,13 +268,15 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } - if (unlikely(test_bit(SDF_FROZEN, &sdp->sd_flags))) { - fs_info(sdp, "GFS2:adding buf while frozen\n"); - gfs2_assert_withdraw(sdp, 0); - } if (unlikely(gfs2_withdrawn(sdp))) { fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n", (unsigned long long)bd->bd_bh->b_blocknr); + goto out_unlock; + } + if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) { + fs_info(sdp, "GFS2:adding buf while frozen\n"); + withdraw = true; + goto out_unlock; } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); @@ -281,6 +285,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) tr->tr_num_buf_new++; out_unlock: gfs2_log_unlock(sdp); + if (withdraw) + gfs2_assert_withdraw(sdp, 0); out: unlock_buffer(bh); } From 0be8432166a61abc537e1247e530f4b85970b56b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 2 Aug 2023 09:24:12 -0500 Subject: [PATCH 143/230] gfs2: Don't use filemap_splice_read Starting with patch 2cb1e08985, gfs2 started using the new function filemap_splice_read rather than the old (and subsequently deleted) function generic_file_splice_read. filemap_splice_read works by taking references to a number of folios in the page cache and splicing those folios into a pipe. The folios are then read from the pipe and the folio references are dropped. This can take an arbitrary amount of time. We cannot allow that in gfs2 because those folio references will pin the inode glock to the node and prevent it from being demoted, which can lead to cluster-wide deadlocks. Instead, use copy_splice_read. (In addition, the old generic_file_splice_read called into ->read_iter, which called gfs2_file_read_iter, which took the inode glock during the operation. The new filemap_splice_read interface does not take the inode glock anymore. This is fixable, but it still wouldn't prevent cluster-wide deadlocks.) Fixes: 2cb1e08985e3 ("splice: Use filemap_splice_read() instead of generic_file_splice_read()") Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 1bf3c4453516..b43fa8b8fc05 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1578,7 +1578,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1609,7 +1609,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, From 0b15afc9038146bb2009e7924b1ead2e919b2a56 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 26 Jul 2023 20:00:35 +0200 Subject: [PATCH 144/230] tpm/tpm_tis: Disable interrupts for TUXEDO InfinityBook S 15/17 Gen7 TUXEDO InfinityBook S 15/17 Gen7 suffers from an IRQ problem on tpm_tis like a few other laptops. Add an entry for the workaround. Cc: stable@vger.kernel.org Fixes: e644b2f498d2 ("tpm, tpm_tis: Enable interrupt test") Link: https://bugzilla.suse.com/show_bug.cgi?id=1213645 Signed-off-by: Takashi Iwai Acked-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index cc42cf3de960..a98773ac2e55 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -162,6 +162,14 @@ static const struct dmi_system_id tpm_tis_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad L590"), }, }, + { + .callback = tpm_tis_disable_irq, + .ident = "TUXEDO InfinityBook S 15/17 Gen7", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"), + DMI_MATCH(DMI_PRODUCT_NAME, "TUXEDO InfinityBook S 15/17 Gen7"), + }, + }, { .callback = tpm_tis_disable_irq, .ident = "UPX-TGL", From 0de030b308236a1392f924f527cf74614d8b6aef Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 11 Jun 2023 07:32:10 -0400 Subject: [PATCH 145/230] sysctl: set variable key_sysctls storage-class-specifier to static smatch reports security/keys/sysctl.c:12:18: warning: symbol 'key_sysctls' was not declared. Should it be static? This variable is only used in its defining file, so it should be static. Signed-off-by: Tom Rix Signed-off-by: Jarkko Sakkinen --- security/keys/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index b72b82bb20c6..b348e1679d5d 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -9,7 +9,7 @@ #include #include "internal.h" -struct ctl_table key_sysctls[] = { +static struct ctl_table key_sysctls[] = { { .procname = "maxkeys", .data = &key_quota_maxkeys, From 554b841d470338a3b1d6335b14ee1cd0c8f5d754 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 2 Aug 2023 07:25:33 -0500 Subject: [PATCH 146/230] tpm: Disable RNG for all AMD fTPMs The TPM RNG functionality is not necessary for entropy when the CPU already supports the RDRAND instruction. The TPM RNG functionality was previously disabled on a subset of AMD fTPM series, but reports continue to show problems on some systems causing stutter root caused to TPM RNG functionality. Expand disabling TPM RNG use for all AMD fTPMs whether they have versions that claim to have fixed or not. To accomplish this, move the detection into part of the TPM CRB registration and add a flag indicating that the TPM should opt-out of registration to hwrng. Cc: stable@vger.kernel.org # 6.1.y+ Fixes: b006c439d58d ("hwrng: core - start hwrng kthread also for untrusted sources") Fixes: f1324bbc4011 ("tpm: disable hwrng for fTPM on some AMD designs") Reported-by: daniil.stas@posteo.net Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217719 Reported-by: bitlord0xff@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217212 Signed-off-by: Mario Limonciello Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-chip.c | 68 ++----------------------------------- drivers/char/tpm/tpm_crb.c | 30 ++++++++++++++++ include/linux/tpm.h | 1 + 3 files changed, 33 insertions(+), 66 deletions(-) diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index cf5499e51999..e904aae9771b 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -510,70 +510,6 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) return 0; } -/* - * Some AMD fTPM versions may cause stutter - * https://www.amd.com/en/support/kb/faq/pa-410 - * - * Fixes are available in two series of fTPM firmware: - * 6.x.y.z series: 6.0.18.6 + - * 3.x.y.z series: 3.57.y.5 + - */ -#ifdef CONFIG_X86 -static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) -{ - u32 val1, val2; - u64 version; - int ret; - - if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) - return false; - - ret = tpm_request_locality(chip); - if (ret) - return false; - - ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); - if (ret) - goto release; - if (val1 != 0x414D4400U /* AMD */) { - ret = -ENODEV; - goto release; - } - ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL); - if (ret) - goto release; - ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); - -release: - tpm_relinquish_locality(chip); - - if (ret) - return false; - - version = ((u64)val1 << 32) | val2; - if ((version >> 48) == 6) { - if (version >= 0x0006000000180006ULL) - return false; - } else if ((version >> 48) == 3) { - if (version >= 0x0003005700000005ULL) - return false; - } else { - return false; - } - - dev_warn(&chip->dev, - "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", - version); - - return true; -} -#else -static inline bool tpm_amd_is_rng_defective(struct tpm_chip *chip) -{ - return false; -} -#endif /* CONFIG_X86 */ - static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) { struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); @@ -588,7 +524,7 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) static int tpm_add_hwrng(struct tpm_chip *chip) { if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || - tpm_amd_is_rng_defective(chip)) + chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED) return 0; snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), @@ -719,7 +655,7 @@ void tpm_chip_unregister(struct tpm_chip *chip) { tpm_del_legacy_sysfs(chip); if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip) && - !tpm_amd_is_rng_defective(chip)) + !(chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED)) hwrng_unregister(&chip->hwrng); tpm_bios_log_teardown(chip); if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip)) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index 1a5d09b18513..9eb1a1859012 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -463,6 +463,28 @@ static bool crb_req_canceled(struct tpm_chip *chip, u8 status) return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE; } +static int crb_check_flags(struct tpm_chip *chip) +{ + u32 val; + int ret; + + ret = crb_request_locality(chip, 0); + if (ret) + return ret; + + ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL); + if (ret) + goto release; + + if (val == 0x414D4400U /* AMD */) + chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; + +release: + crb_relinquish_locality(chip, 0); + + return ret; +} + static const struct tpm_class_ops tpm_crb = { .flags = TPM_OPS_AUTO_STARTUP, .status = crb_status, @@ -800,6 +822,14 @@ static int crb_acpi_add(struct acpi_device *device) chip->acpi_dev_handle = device->handle; chip->flags = TPM_CHIP_FLAG_TPM2; + rc = tpm_chip_bootstrap(chip); + if (rc) + goto out; + + rc = crb_check_flags(chip); + if (rc) + goto out; + rc = tpm_chip_register(chip); out: diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 6a1e8f157255..4ee9d13749ad 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -283,6 +283,7 @@ enum tpm_chip_flags { TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7), TPM_CHIP_FLAG_SUSPENDED = BIT(8), + TPM_CHIP_FLAG_HWRNG_DISABLED = BIT(9), }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) From e117e7adc637e364b599dc766f1d740698e7e027 Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Fri, 4 Aug 2023 17:08:40 +0000 Subject: [PATCH 147/230] tpm/tpm_tis: Disable interrupts for Lenovo P620 devices The Lenovo ThinkStation P620 suffers from an irq storm issue like various other Lenovo machines, so add an entry for it to tpm_tis_dmi_table and force polling. It is worth noting that 481c2d14627d (tpm,tpm_tis: Disable interrupts after 1000 unhandled IRQs) does not seem to fix the problem on this machine, but setting 'tpm_tis.interrupts=0' on the kernel command line does. [jarkko@kernel.org: truncated the commit ID in the description to 12 characters] Cc: stable@vger.kernel.org # v6.4+ Fixes: e644b2f498d2 ("tpm, tpm_tis: Enable interrupt test") Signed-off-by: Jonathan McDowell Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index a98773ac2e55..ac4daaf294a3 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -162,6 +162,14 @@ static const struct dmi_system_id tpm_tis_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad L590"), }, }, + { + .callback = tpm_tis_disable_irq, + .ident = "ThinkStation P620", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkStation P620"), + }, + }, { .callback = tpm_tis_disable_irq, .ident = "TUXEDO InfinityBook S 15/17 Gen7", From 72cc654970658e88a1cdea08f06b11c218efa4da Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 16 Jul 2023 14:28:10 +0300 Subject: [PATCH 148/230] net/mlx5e: Take RTNL lock when needed before calling xdp_set_features() Hold RTNL lock when calling xdp_set_features() with a registered netdev, as the call triggers the netdev notifiers. This could happen when switching from uplink rep to nic profile for example. This resolves the following call trace: RTNL: assertion failed at net/core/dev.c (1953) WARNING: CPU: 6 PID: 112670 at net/core/dev.c:1953 call_netdevice_notifiers_info+0x7c/0x80 Modules linked in: sch_mqprio sch_mqprio_lib act_tunnel_key act_mirred act_skbedit cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress bonding ib_umad ip_gre rdma_ucm mlx5_vfio_pci ipip tunnel4 ip6_gre gre mlx5_ib vfio_pci vfio_pci_core vfio_iommu_type1 ib_uverbs vfio mlx5_core ib_ipoib geneve nf_tables ip6_tunnel tunnel6 iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs] CPU: 6 PID: 112670 Comm: devlink Not tainted 6.4.0-rc7_for_upstream_min_debug_2023_06_28_17_02 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:call_netdevice_notifiers_info+0x7c/0x80 Code: 90 ff 80 3d 2d 6b f7 00 00 75 c5 ba a1 07 00 00 48 c7 c6 e4 ce 0b 82 48 c7 c7 c8 f4 04 82 c6 05 11 6b f7 00 01 e8 a4 7c 8e ff <0f> 0b eb a2 0f 1f 44 00 00 55 48 89 e5 41 54 48 83 e4 f0 48 83 ec RSP: 0018:ffff8882a21c3948 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff82e6f880 RCX: 0000000000000027 RDX: ffff88885f99b5c8 RSI: 0000000000000001 RDI: ffff88885f99b5c0 RBP: 0000000000000028 R08: ffff88887ffabaa8 R09: 0000000000000003 R10: ffff88887fecbac0 R11: ffff88887ff7bac0 R12: ffff8882a21c3968 R13: ffff88811c018940 R14: 0000000000000000 R15: ffff8881274401a0 FS: 00007fe141c81800(0000) GS:ffff88885f980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f787c28b948 CR3: 000000014bcf3005 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn+0x79/0x120 ? call_netdevice_notifiers_info+0x7c/0x80 ? report_bug+0x17c/0x190 ? handle_bug+0x3c/0x60 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? call_netdevice_notifiers_info+0x7c/0x80 ? call_netdevice_notifiers_info+0x7c/0x80 call_netdevice_notifiers+0x2e/0x50 mlx5e_set_xdp_feature+0x21/0x50 [mlx5_core] mlx5e_nic_init+0xf1/0x1a0 [mlx5_core] mlx5e_netdev_init_profile+0x76/0x110 [mlx5_core] mlx5e_netdev_attach_profile+0x1f/0x90 [mlx5_core] mlx5e_netdev_change_profile+0x92/0x160 [mlx5_core] mlx5e_netdev_attach_nic_profile+0x1b/0x30 [mlx5_core] mlx5e_vport_rep_unload+0xaa/0xc0 [mlx5_core] __esw_offloads_unload_rep+0x52/0x60 [mlx5_core] mlx5_esw_offloads_rep_unload+0x52/0x70 [mlx5_core] esw_offloads_unload_rep+0x34/0x70 [mlx5_core] esw_offloads_disable+0x2b/0x90 [mlx5_core] mlx5_eswitch_disable_locked+0x1b9/0x210 [mlx5_core] mlx5_devlink_eswitch_mode_set+0xf5/0x630 [mlx5_core] ? devlink_get_from_attrs_lock+0x9e/0x110 devlink_nl_cmd_eswitch_set_doit+0x60/0xe0 genl_family_rcv_msg_doit.isra.0+0xc2/0x110 genl_rcv_msg+0x17d/0x2b0 ? devlink_get_from_attrs_lock+0x110/0x110 ? devlink_nl_cmd_eswitch_get_doit+0x290/0x290 ? devlink_pernet_pre_exit+0xf0/0xf0 ? genl_family_rcv_msg_doit.isra.0+0x110/0x110 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1f6/0x2c0 netlink_sendmsg+0x232/0x4a0 sock_sendmsg+0x38/0x60 ? _copy_from_user+0x2a/0x60 __sys_sendto+0x110/0x160 ? __count_memcg_events+0x48/0x90 ? handle_mm_fault+0x161/0x260 ? do_user_addr_fault+0x278/0x6e0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fe141b1340a Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 RSP: 002b:00007fff61d03de8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000afab00 RCX: 00007fe141b1340a RDX: 0000000000000038 RSI: 0000000000afab00 RDI: 0000000000000003 RBP: 0000000000afa910 R08: 00007fe141d80200 R09: 000000000000000c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 Fixes: 4d5ab0ad964d ("net/mlx5e: take into account device reconfiguration for xdp_features flag") Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1c820119e438..c27df14df145 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5266,6 +5266,7 @@ void mlx5e_destroy_q_counters(struct mlx5e_priv *priv) static int mlx5e_nic_init(struct mlx5_core_dev *mdev, struct net_device *netdev) { + const bool take_rtnl = netdev->reg_state == NETREG_REGISTERED; struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5e_flow_steering *fs; int err; @@ -5294,9 +5295,19 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); mlx5e_health_create_reporters(priv); + + /* If netdev is already registered (e.g. move from uplink to nic profile), + * RTNL lock must be held before triggering netdev notifiers. + */ + if (take_rtnl) + rtnl_lock(); + /* update XDP supported features */ mlx5e_set_xdp_feature(netdev); + if (take_rtnl) + rtnl_unlock(); + return 0; } From ac5da544a3c2047cbfd715acd9cec8380d7fe5c6 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 14 Apr 2023 08:48:20 +0000 Subject: [PATCH 149/230] net/mlx5e: TC, Fix internal port memory leak The flow rule can be splited, and the extra post_act rules are added to post_act table. It's possible to trigger memleak when the rule forwards packets from internal port and over tunnel, in the case that, for example, CT 'new' state offload is allowed. As int_port object is assigned to the flow attribute of post_act rule, and its refcnt is incremented by mlx5e_tc_int_port_get(), but mlx5e_tc_int_port_put() is not called, the refcnt is never decremented, then int_port is never freed. The kmemleak reports the following error: unreferenced object 0xffff888128204b80 (size 64): comm "handler20", pid 50121, jiffies 4296973009 (age 642.932s) hex dump (first 32 bytes): 01 00 00 00 19 00 00 00 03 f0 00 00 04 00 00 00 ................ 98 77 67 41 81 88 ff ff 98 77 67 41 81 88 ff ff .wgA.....wgA.... backtrace: [<00000000e992680d>] kmalloc_trace+0x27/0x120 [<000000009e945a98>] mlx5e_tc_int_port_get+0x3f3/0xe20 [mlx5_core] [<0000000035a537f0>] mlx5e_tc_add_fdb_flow+0x473/0xcf0 [mlx5_core] [<0000000070c2cec6>] __mlx5e_add_fdb_flow+0x7cf/0xe90 [mlx5_core] [<000000005cc84048>] mlx5e_configure_flower+0xd40/0x4c40 [mlx5_core] [<000000004f8a2031>] mlx5e_rep_indr_offload.isra.0+0x10e/0x1c0 [mlx5_core] [<000000007df797dc>] mlx5e_rep_indr_setup_tc_cb+0x90/0x130 [mlx5_core] [<0000000016c15cc3>] tc_setup_cb_add+0x1cf/0x410 [<00000000a63305b4>] fl_hw_replace_filter+0x38f/0x670 [cls_flower] [<000000008bc9e77c>] fl_change+0x1fd5/0x4430 [cls_flower] [<00000000e7f766e4>] tc_new_tfilter+0x867/0x2010 [<00000000e101c0ef>] rtnetlink_rcv_msg+0x6fc/0x9f0 [<00000000e1111d44>] netlink_rcv_skb+0x12c/0x360 [<0000000082dd6c8b>] netlink_unicast+0x438/0x710 [<00000000fc568f70>] netlink_sendmsg+0x794/0xc50 [<0000000016e92590>] sock_sendmsg+0xc5/0x190 So fix this by moving int_port cleanup code to the flow attribute free helper, which is used by all the attribute free cases. Fixes: 8300f225268b ("net/mlx5e: Create new flow attr for multi table actions") Signed-off-by: Jianbo Liu Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 92377632f9e0..31708d5aa608 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1943,9 +1943,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_flow_attr *attr = flow->attr; - struct mlx5_esw_flow_attr *esw_attr; - esw_attr = attr->esw_attr; mlx5e_put_flow_tunnel_id(flow); remove_unready_flow(flow); @@ -1966,12 +1964,6 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr); - if (esw_attr->int_port) - mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->int_port); - - if (esw_attr->dest_int_port) - mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->dest_int_port); - if (flow_flag_test(flow, L3_TO_L2_DECAP)) mlx5e_detach_decap(priv, flow); @@ -4268,6 +4260,7 @@ static void mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr) { struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow); + struct mlx5_esw_flow_attr *esw_attr; if (!attr) return; @@ -4285,6 +4278,18 @@ mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *a mlx5e_tc_detach_mod_hdr(flow->priv, flow, attr); } + if (mlx5e_is_eswitch_flow(flow)) { + esw_attr = attr->esw_attr; + + if (esw_attr->int_port) + mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv), + esw_attr->int_port); + + if (esw_attr->dest_int_port) + mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv), + esw_attr->dest_int_port); + } + mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr); free_branch_attr(flow, attr->branch_true); From 8bfe1e19fb96d89fce14302e35cba0cd9f39d0a1 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 26 Jul 2023 14:38:03 +0300 Subject: [PATCH 150/230] net/mlx5: DR, Fix wrong allocation of modify hdr pattern Fixing wrong calculation of the modify hdr pattern size, where the previously calculated number would not be enough to accommodate the required number of actions. Fixes: da5d0027d666 ("net/mlx5: DR, Add cache for modify header pattern") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c index d6947fe13d56..8ca534ef5d03 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c @@ -82,7 +82,7 @@ dr_ptrn_alloc_pattern(struct mlx5dr_ptrn_mgr *mgr, u32 chunk_size; u32 index; - chunk_size = ilog2(num_of_actions); + chunk_size = ilog2(roundup_pow_of_two(num_of_actions)); /* HW modify action index granularity is at least 64B */ chunk_size = max_t(u32, chunk_size, DR_CHUNK_SIZE_8); From 06c868fde61fd0bbf9a7c7405f6eb9925bf0c2ed Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Mon, 26 Jun 2023 22:55:03 +0300 Subject: [PATCH 151/230] net/mlx5: Return correct EC_VF function ID The ECVF function ID range is 1..max_ec_vfs. Currently mlx5_vport_to_func_id returns 0..max_ec_vfs - 1. Which results in a syndrome when querying the caps with more recent firmware, or reading incorrect caps with older firmware that supports EC VFs. Fixes: 9ac0b128248e ("net/mlx5: Update vport caps query/set for EC VFs") Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index c4be257c043d..682d3dc00dd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -361,7 +361,7 @@ static inline bool mlx5_core_is_ec_vf_vport(const struct mlx5_core_dev *dev, u16 static inline int mlx5_vport_to_func_id(const struct mlx5_core_dev *dev, u16 vport, bool ec_vf_func) { - return ec_vf_func ? vport - mlx5_core_ec_vf_vport_base(dev) + return ec_vf_func ? vport - mlx5_core_ec_vf_vport_base(dev) + 1 : vport; } From 2dc2b3922d3c0f52d3a792d15dcacfbc4cc76b8f Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 11 Jul 2023 00:28:10 +0300 Subject: [PATCH 152/230] net/mlx5: Allow 0 for total host VFs When querying eswitch functions 0 is a valid number of host VFs. After introducing ARM SRIOV falling through to getting the max value from PCI results in using the total VFs allowed on the ARM for the host. Fixes: 86eec50beaf3 ("net/mlx5: Support querying max VFs from device"); Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 4e42a3b9b8ee..a2fc937d5461 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -285,8 +285,7 @@ static u16 mlx5_get_max_vfs(struct mlx5_core_dev *dev) host_total_vfs = MLX5_GET(query_esw_functions_out, out, host_params_context.host_total_vfs); kvfree(out); - if (host_total_vfs) - return host_total_vfs; + return host_total_vfs; } done: From 2d691c90f45ad2d0eafdd24c6abb0973a831c1d2 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 1 Aug 2023 21:46:00 +0300 Subject: [PATCH 153/230] net/mlx5: Fix devlink controller number for ECVF The controller number for ECVFs is always 0, because the ECPF must be the eswitch owner for EC VFs to be enabled. Fixes: dc13180824b7 ("net/mlx5: Enable devlink port for embedded cpu VF vports") Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index af779c700278..fdf2be548e85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -60,7 +60,7 @@ static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16 } else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) { memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, + devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum, vport_num - 1, false); } return dl_port; From 6b5926eb1c034affff3fb44a98cb8c67153847d8 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 26 Jul 2023 09:06:33 +0300 Subject: [PATCH 154/230] net/mlx5e: Unoffload post act rule when handling FIB events If having the following tc rule on stack device: filter parent ffff: protocol ip pref 3 flower chain 1 filter parent ffff: protocol ip pref 3 flower chain 1 handle 0x1 dst_mac 24:25:d0:e1:00:00 src_mac 02:25:d0:25:01:02 eth_type ipv4 ct_state +trk+new in_hw in_hw_count 1 action order 1: ct commit zone 0 pipe index 2 ref 1 bind 1 installed 3807 sec used 3779 sec firstused 3800 sec Action statistics: Sent 120 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 used_hw_stats delayed action order 2: tunnel_key set src_ip 192.168.1.25 dst_ip 192.168.1.26 key_id 4 dst_port 4789 csum pipe index 3 ref 1 bind 1 installed 3807 sec used 3779 sec firstused 3800 sec Action statistics: Sent 120 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 used_hw_stats delayed action order 3: mirred (Egress Redirect to device vxlan1) stolen index 9 ref 1 bind 1 installed 3807 sec used 3779 sec firstused 3800 sec Action statistics: Sent 120 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 used_hw_stats delayed When handling FIB events, the rule in post act will not be deleted. And because the post act rule has packet reformat and modify header actions, also will hit the following syndromes: mlx5_core 0000:08:00.0: mlx5_cmd_out_err:829:(pid 11613): DEALLOC_MODIFY_HEADER_CONTEXT(0x941) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0x1ab444), err(-22) mlx5_core 0000:08:00.0: mlx5_cmd_out_err:829:(pid 11613): DEALLOC_PACKET_REFORMAT_CONTEXT(0x93e) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0x179e84), err(-22) Fix it by unoffloading post act rule when handling FIB events. Fixes: 314e1105831b ("net/mlx5e: Add post act offload/unoffload API") Signed-off-by: Chris Mi Reviewed-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 0c88cf47af01..1730f6a716ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -1461,10 +1461,12 @@ static void mlx5e_invalidate_encap(struct mlx5e_priv *priv, attr = mlx5e_tc_get_encap_attr(flow); esw_attr = attr->esw_attr; - if (flow_flag_test(flow, SLOW)) + if (flow_flag_test(flow, SLOW)) { mlx5e_tc_unoffload_from_slow_path(esw, flow); - else + } else { mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr); + mlx5e_tc_unoffload_flow_post_acts(flow); + } mlx5e_tc_detach_mod_hdr(priv, flow, attr); attr->modify_hdr = NULL; From 86ed7b773c01ba71617538b3b107c33fd9cf90b8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 30 Jul 2023 09:26:27 +0300 Subject: [PATCH 155/230] net/mlx5: LAG, Check correct bucket when modifying LAG Cited patch introduced buckets in hash mode, but missed to update the ports/bucket check when modifying LAG. Fix the check. Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") Signed-off-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index d3a3fe4ce670..7d9bbb494d95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -574,7 +574,7 @@ static int __mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, for (i = 0; i < ldev->ports; i++) { for (j = 0; j < ldev->buckets; j++) { idx = i * ldev->buckets + j; - if (ldev->v2p_map[i] == ports[i]) + if (ldev->v2p_map[idx] == ports[idx]) continue; dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev, From d006207625657322ba8251b6e7e829f9659755dc Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 19 Jul 2023 11:33:44 +0300 Subject: [PATCH 156/230] net/mlx5: Skip clock update work when device is in error state When device is in error state, marked by the flag MLX5_DEVICE_STATE_INTERNAL_ERROR, the HW and PCI may not be accessible and so clock update work should be skipped. Furthermore, such access through PCI in error state, after calling mlx5_pci_disable_device() can result in failing to recover from pci errors. Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support") Reported-and-tested-by: Ganesh G R Closes: https://lore.kernel.org/netdev/9bdb9b9d-140a-7a28-f0de-2e64e873c068@nvidia.com Signed-off-by: Moshe Shemesh Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 973babfaff25..377372f0578a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -227,10 +227,15 @@ static void mlx5_timestamp_overflow(struct work_struct *work) clock = container_of(timer, struct mlx5_clock, timer); mdev = container_of(clock, struct mlx5_core_dev, clock); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto out; + write_seqlock_irqsave(&clock->lock, flags); timecounter_read(&timer->tc); mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); + +out: schedule_delayed_work(&timer->overflow_work, timer->overflow_period); } From aab8e1a200b926147db51e3f82fd07bb9edf6a98 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 23 Jul 2023 11:03:01 +0300 Subject: [PATCH 157/230] net/mlx5: Reload auxiliary devices in pci error handlers Handling pci errors should fully teardown and load back auxiliary devices, same as done through mlx5 health recovery flow. Fixes: 72ed5d5624af ("net/mlx5: Suspend auxiliary devices only in case of PCI device suspend") Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f42abc2ea73c..72ae560a1c68 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1989,7 +1989,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_enter_error_state(dev, false); mlx5_error_sw_reset(dev); - mlx5_unload_one(dev, true); + mlx5_unload_one(dev, false); mlx5_drain_health_wq(dev); mlx5_pci_disable_device(dev); From 548ee049b19fb9a3d0a4335314d0d1217a521bc5 Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Sun, 11 Jun 2023 16:29:13 +0300 Subject: [PATCH 158/230] net/mlx5e: Add capability check for vnic counters Add missing capability check for each of the vnic counters exposed by devlink health reporter, and thus avoid unexpected behavior due to invalid access to registers. While at it, read only the exact number of bits for each counter whether it was 32 bits or 64 bits. Fixes: b0bc615df488 ("net/mlx5: Add vnic devlink health reporter to PFs/VFs") Fixes: a33682e4e78e ("net/mlx5e: Expose catastrophic steering error counters") Signed-off-by: Lama Kayal Reviewed-by: Gal Pressman Reviewed-by: Rahul Rameshbabu Reviewed-by: Maher Sanalla Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/diag/reporter_vnic.c | 98 +++++++++++-------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index b0128336ff01..e869c65d8e90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */ #include "reporter_vnic.h" +#include "en_stats.h" #include "devlink.h" #define VNIC_ENV_GET64(vnic_env_stats, c) \ @@ -36,55 +37,72 @@ int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev, if (err) return err; - err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues", - VNIC_ENV_GET64(&vnic, total_error_queues)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, vnic_env_queue_counters)) { + err = devlink_fmsg_u32_pair_put(fmsg, "total_error_queues", + VNIC_ENV_GET(&vnic, total_error_queues)); + if (err) + return err; - err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow", - VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow)); - if (err) - return err; + err = devlink_fmsg_u32_pair_put(fmsg, "send_queue_priority_update_flow", + VNIC_ENV_GET(&vnic, + send_queue_priority_update_flow)); + if (err) + return err; + } - err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun", - VNIC_ENV_GET64(&vnic, comp_eq_overrun)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, eq_overrun_count)) { + err = devlink_fmsg_u32_pair_put(fmsg, "comp_eq_overrun", + VNIC_ENV_GET(&vnic, comp_eq_overrun)); + if (err) + return err; - err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun", - VNIC_ENV_GET64(&vnic, async_eq_overrun)); - if (err) - return err; + err = devlink_fmsg_u32_pair_put(fmsg, "async_eq_overrun", + VNIC_ENV_GET(&vnic, async_eq_overrun)); + if (err) + return err; + } - err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun", - VNIC_ENV_GET64(&vnic, cq_overrun)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, vnic_env_cq_overrun)) { + err = devlink_fmsg_u32_pair_put(fmsg, "cq_overrun", + VNIC_ENV_GET(&vnic, cq_overrun)); + if (err) + return err; + } - err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command", - VNIC_ENV_GET64(&vnic, invalid_command)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, invalid_command_count)) { + err = devlink_fmsg_u32_pair_put(fmsg, "invalid_command", + VNIC_ENV_GET(&vnic, invalid_command)); + if (err) + return err; + } - err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command", - VNIC_ENV_GET64(&vnic, quota_exceeded_command)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, quota_exceeded_count)) { + err = devlink_fmsg_u32_pair_put(fmsg, "quota_exceeded_command", + VNIC_ENV_GET(&vnic, quota_exceeded_command)); + if (err) + return err; + } - err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard", - VNIC_ENV_GET64(&vnic, nic_receive_steering_discard)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, nic_receive_steering_discard)) { + err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard", + VNIC_ENV_GET64(&vnic, + nic_receive_steering_discard)); + if (err) + return err; + } - err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail", - VNIC_ENV_GET64(&vnic, generated_pkt_steering_fail)); - if (err) - return err; + if (MLX5_CAP_GEN(dev, vnic_env_cnt_steering_fail)) { + err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail", + VNIC_ENV_GET64(&vnic, + generated_pkt_steering_fail)); + if (err) + return err; - err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail", - VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail)); - if (err) - return err; + err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail", + VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail)); + if (err) + return err; + } err = devlink_fmsg_obj_nest_end(fmsg); if (err) From 01f4fd27087078c90a0e22860d1dfa2cd0510791 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Wed, 2 Aug 2023 19:43:20 +0800 Subject: [PATCH 159/230] bonding: Fix incorrect deletion of ETH_P_8021AD protocol vid from slaves BUG_ON(!vlan_info) is triggered in unregister_vlan_dev() with following testcase: # ip netns add ns1 # ip netns exec ns1 ip link add bond0 type bond mode 0 # ip netns exec ns1 ip link add bond_slave_1 type veth peer veth2 # ip netns exec ns1 ip link set bond_slave_1 master bond0 # ip netns exec ns1 ip link add link bond_slave_1 name vlan10 type vlan id 10 protocol 802.1ad # ip netns exec ns1 ip link add link bond0 name bond0_vlan10 type vlan id 10 protocol 802.1ad # ip netns exec ns1 ip link set bond_slave_1 nomaster # ip netns del ns1 The logical analysis of the problem is as follows: 1. create ETH_P_8021AD protocol vlan10 for bond_slave_1: register_vlan_dev() vlan_vid_add() vlan_info_alloc() __vlan_vid_add() // add [ETH_P_8021AD, 10] vid to bond_slave_1 2. create ETH_P_8021AD protocol bond0_vlan10 for bond0: register_vlan_dev() vlan_vid_add() __vlan_vid_add() vlan_add_rx_filter_info() if (!vlan_hw_filter_capable(dev, proto)) // condition established because bond0 without NETIF_F_HW_VLAN_STAG_FILTER return 0; if (netif_device_present(dev)) return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid); // will be never called // The slaves of bond0 will not refer to the [ETH_P_8021AD, 10] vid. 3. detach bond_slave_1 from bond0: __bond_release_one() vlan_vids_del_by_dev() list_for_each_entry(vid_info, &vlan_info->vid_list, list) vlan_vid_del(dev, vid_info->proto, vid_info->vid); // bond_slave_1 [ETH_P_8021AD, 10] vid will be deleted. // bond_slave_1->vlan_info will be assigned NULL. 4. delete vlan10 during delete ns1: default_device_exit_batch() dev->rtnl_link_ops->dellink() // unregister_vlan_dev() for vlan10 vlan_info = rtnl_dereference(real_dev->vlan_info); // real_dev of vlan10 is bond_slave_1 BUG_ON(!vlan_info); // bond_slave_1->vlan_info is NULL now, bug is triggered!!! Add S-VLAN tag related features support to bond driver. So the bond driver will always propagate the VLAN info to its slaves. Fixes: 8ad227ff89a7 ("net: vlan: add 802.1ad support") Suggested-by: Ido Schimmel Signed-off-by: Ziyang Xuan Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20230802114320.4156068-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 484c9e3e5e82..447b06ea4fc9 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5901,7 +5901,9 @@ void bond_setup(struct net_device *bond_dev) bond_dev->hw_features = BOND_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_VLAN_CTAG_FILTER; + NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_FILTER; bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; From 46622219aae2b67813fe31a7b8cb7da5baff5c8a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 7 Aug 2023 15:21:27 +0200 Subject: [PATCH 160/230] wireguard: allowedips: expand maximum node depth In the allowedips self-test, nodes are inserted into the tree, but it generated an even amount of nodes, but for checking maximum node depth, there is of course the root node, which makes the total number necessarily odd. With two few nodes added, it never triggered the maximum depth check like it should have. So, add 129 nodes instead of 128 nodes, and do so with a more straightforward scheme, starting with all the bits set, and shifting over one each time. Then increase the maximum depth to 129, and choose a better name for that variable to make it clear that it represents depth as opposed to bits. Cc: stable@vger.kernel.org Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Link: https://lore.kernel.org/r/20230807132146.2191597-2-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/allowedips.c | 8 ++++---- drivers/net/wireguard/selftest/allowedips.c | 16 ++++++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c index 5bf7822c53f1..0ba714ca5185 100644 --- a/drivers/net/wireguard/allowedips.c +++ b/drivers/net/wireguard/allowedips.c @@ -6,7 +6,7 @@ #include "allowedips.h" #include "peer.h" -enum { MAX_ALLOWEDIPS_BITS = 128 }; +enum { MAX_ALLOWEDIPS_DEPTH = 129 }; static struct kmem_cache *node_cache; @@ -42,7 +42,7 @@ static void push_rcu(struct allowedips_node **stack, struct allowedips_node __rcu *p, unsigned int *len) { if (rcu_access_pointer(p)) { - if (WARN_ON(IS_ENABLED(DEBUG) && *len >= MAX_ALLOWEDIPS_BITS)) + if (WARN_ON(IS_ENABLED(DEBUG) && *len >= MAX_ALLOWEDIPS_DEPTH)) return; stack[(*len)++] = rcu_dereference_raw(p); } @@ -55,7 +55,7 @@ static void node_free_rcu(struct rcu_head *rcu) static void root_free_rcu(struct rcu_head *rcu) { - struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_BITS] = { + struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = { container_of(rcu, struct allowedips_node, rcu) }; unsigned int len = 1; @@ -68,7 +68,7 @@ static void root_free_rcu(struct rcu_head *rcu) static void root_remove_peer_lists(struct allowedips_node *root) { - struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_BITS] = { root }; + struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = { root }; unsigned int len = 1; while (len > 0 && (node = stack[--len])) { diff --git a/drivers/net/wireguard/selftest/allowedips.c b/drivers/net/wireguard/selftest/allowedips.c index 78ebe2892a78..3d1f64ff2e12 100644 --- a/drivers/net/wireguard/selftest/allowedips.c +++ b/drivers/net/wireguard/selftest/allowedips.c @@ -593,16 +593,20 @@ bool __init wg_allowedips_selftest(void) wg_allowedips_remove_by_peer(&t, a, &mutex); test_negative(4, a, 192, 168, 0, 1); - /* These will hit the WARN_ON(len >= MAX_ALLOWEDIPS_BITS) in free_node + /* These will hit the WARN_ON(len >= MAX_ALLOWEDIPS_DEPTH) in free_node * if something goes wrong. */ - for (i = 0; i < MAX_ALLOWEDIPS_BITS; ++i) { - part = cpu_to_be64(~(1LLU << (i % 64))); - memset(&ip, 0xff, 16); - memcpy((u8 *)&ip + (i < 64) * 8, &part, 8); + for (i = 0; i < 64; ++i) { + part = cpu_to_be64(~0LLU << i); + memset(&ip, 0xff, 8); + memcpy((u8 *)&ip + 8, &part, 8); + wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex); + memcpy(&ip, &part, 8); + memset((u8 *)&ip + 8, 0, 8); wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex); } - + memset(&ip, 0, 16); + wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex); wg_allowedips_free(&t, &mutex); wg_allowedips_init(&t); From 59eeb232940515590de513b997539ef495faca9a Mon Sep 17 00:00:00 2001 From: Andrew Kanner Date: Thu, 3 Aug 2023 20:59:48 +0200 Subject: [PATCH 161/230] drivers: net: prevent tun_build_skb() to exceed the packet size limit Using the syzkaller repro with reduced packet size it was discovered that XDP_PACKET_HEADROOM is not checked in tun_can_build_skb(), although pad may be incremented in tun_build_skb(). This may end up with exceeding the PAGE_SIZE limit in tun_build_skb(). Jason Wang proposed to count XDP_PACKET_HEADROOM always (e.g. without rcu_access_pointer(tun->xdp_prog)) in tun_can_build_skb() since there's a window during which XDP program might be attached between tun_can_build_skb() and tun_build_skb(). Fixes: 7df13219d757 ("tun: reserve extra headroom only when XDP is set") Link: https://syzkaller.appspot.com/bug?extid=f817490f5bd20541b90a Signed-off-by: Andrew Kanner Link: https://lore.kernel.org/r/20230803185947.2379988-1-andrew.kanner@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 25f0191df00b..100339bc8b04 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1594,7 +1594,7 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, if (zerocopy) return false; - if (SKB_DATA_ALIGN(len + TUN_RX_PAD) + + if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; From d14eea09edf427fa36bd446f4a3271f99164202f Mon Sep 17 00:00:00 2001 From: Andrew Kanner Date: Thu, 3 Aug 2023 21:03:18 +0200 Subject: [PATCH 162/230] net: core: remove unnecessary frame_sz check in bpf_xdp_adjust_tail() Syzkaller reported the following issue: ======================================= Too BIG xdp->frame_sz = 131072 WARNING: CPU: 0 PID: 5020 at net/core/filter.c:4121 ____bpf_xdp_adjust_tail net/core/filter.c:4121 [inline] WARNING: CPU: 0 PID: 5020 at net/core/filter.c:4121 bpf_xdp_adjust_tail+0x466/0xa10 net/core/filter.c:4103 ... Call Trace: bpf_prog_4add87e5301a4105+0x1a/0x1c __bpf_prog_run include/linux/filter.h:600 [inline] bpf_prog_run_xdp include/linux/filter.h:775 [inline] bpf_prog_run_generic_xdp+0x57e/0x11e0 net/core/dev.c:4721 netif_receive_generic_xdp net/core/dev.c:4807 [inline] do_xdp_generic+0x35c/0x770 net/core/dev.c:4866 tun_get_user+0x2340/0x3ca0 drivers/net/tun.c:1919 tun_chr_write_iter+0xe8/0x210 drivers/net/tun.c:2043 call_write_iter include/linux/fs.h:1871 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x650/0xe40 fs/read_write.c:584 ksys_write+0x12f/0x250 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd xdp->frame_sz > PAGE_SIZE check was introduced in commit c8741e2bfe87 ("xdp: Allow bpf_xdp_adjust_tail() to grow packet size"). But Jesper Dangaard Brouer noted that after introducing the xdp_init_buff() which all XDP driver use - it's safe to remove this check. The original intend was to catch cases where XDP drivers have not been updated to use xdp.frame_sz, but that is not longer a concern (since xdp_init_buff). Running the initial syzkaller repro it was discovered that the contiguous physical memory allocation is used for both xdp paths in tun_get_user(), e.g. tun_build_skb() and tun_alloc_skb(). It was also stated by Jesper Dangaard Brouer that XDP can work on higher order pages, as long as this is contiguous physical memory (e.g. a page). Reported-and-tested-by: syzbot+f817490f5bd20541b90a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000774b9205f1d8a80d@google.com/T/ Link: https://syzkaller.appspot.com/bug?extid=f817490f5bd20541b90a Link: https://lore.kernel.org/all/20230725155403.796-1-andrew.kanner@gmail.com/T/ Fixes: 43b5169d8355 ("net, xdp: Introduce xdp_init_buff utility routine") Signed-off-by: Andrew Kanner Acked-by: Jesper Dangaard Brouer Acked-by: Jason Wang Link: https://lore.kernel.org/r/20230803190316.2380231-1-andrew.kanner@gmail.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 06ba0e56e369..28a59596987a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4116,12 +4116,6 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end > data_hard_end)) return -EINVAL; - /* ALL drivers MUST init xdp->frame_sz, chicken check below */ - if (unlikely(xdp->frame_sz > PAGE_SIZE)) { - WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); - return -EINVAL; - } - if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; From 2aa71b4b294ee2c3041d085404cea914be9b3225 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 4 Aug 2023 12:12:20 +0200 Subject: [PATCH 163/230] net: marvell: prestera: fix handling IPv4 routes with nhid Fix handling IPv4 routes referencing a nexthop via its id by replacing calls to fib_info_nh() with fib_info_nhc(). Trying to add an IPv4 route referencing a nextop via nhid: $ ip link set up swp5 $ ip a a 10.0.0.1/24 dev swp5 $ ip nexthop add dev swp5 id 20 via 10.0.0.2 $ ip route add 10.0.1.0/24 nhid 20 triggers warnings when trying to handle the route: [ 528.805763] ------------[ cut here ]------------ [ 528.810437] WARNING: CPU: 3 PID: 53 at include/net/nexthop.h:468 __prestera_fi_is_direct+0x2c/0x68 [prestera] [ 528.820434] Modules linked in: prestera_pci act_gact act_police sch_ingress cls_u32 cls_flower prestera arm64_delta_tn48m_dn_led(O) arm64_delta_tn48m_dn_cpld(O) [last unloaded: prestera_pci] [ 528.837485] CPU: 3 PID: 53 Comm: kworker/u8:3 Tainted: G O 6.4.5 #1 [ 528.845178] Hardware name: delta,tn48m-dn (DT) [ 528.849641] Workqueue: prestera_ordered __prestera_router_fib_event_work [prestera] [ 528.857352] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 528.864347] pc : __prestera_fi_is_direct+0x2c/0x68 [prestera] [ 528.870135] lr : prestera_k_arb_fib_evt+0xb20/0xd50 [prestera] [ 528.876007] sp : ffff80000b20bc90 [ 528.879336] x29: ffff80000b20bc90 x28: 0000000000000000 x27: ffff0001374d3a48 [ 528.886510] x26: ffff000105604000 x25: ffff000134af8a28 x24: ffff0001374d3800 [ 528.893683] x23: ffff000101c89148 x22: ffff000101c89000 x21: ffff000101c89200 [ 528.900855] x20: ffff00013641fda0 x19: ffff800009d01088 x18: 0000000000000059 [ 528.908027] x17: 0000000000000277 x16: 0000000000000000 x15: 0000000000000000 [ 528.915198] x14: 0000000000000003 x13: 00000000000fe400 x12: 0000000000000000 [ 528.922371] x11: 0000000000000002 x10: 0000000000000aa0 x9 : ffff8000013d2020 [ 528.929543] x8 : 0000000000000018 x7 : 000000007b1703f8 x6 : 000000001ca72f86 [ 528.936715] x5 : 0000000033399ea7 x4 : 0000000000000000 x3 : ffff0001374d3acc [ 528.943886] x2 : 0000000000000000 x1 : ffff00010200de00 x0 : ffff000134ae3f80 [ 528.951058] Call trace: [ 528.953516] __prestera_fi_is_direct+0x2c/0x68 [prestera] [ 528.958952] __prestera_router_fib_event_work+0x100/0x158 [prestera] [ 528.965348] process_one_work+0x208/0x488 [ 528.969387] worker_thread+0x4c/0x430 [ 528.973068] kthread+0x120/0x138 [ 528.976313] ret_from_fork+0x10/0x20 [ 528.979909] ---[ end trace 0000000000000000 ]--- [ 528.984998] ------------[ cut here ]------------ [ 528.989645] WARNING: CPU: 3 PID: 53 at include/net/nexthop.h:468 __prestera_fi_is_direct+0x2c/0x68 [prestera] [ 528.999628] Modules linked in: prestera_pci act_gact act_police sch_ingress cls_u32 cls_flower prestera arm64_delta_tn48m_dn_led(O) arm64_delta_tn48m_dn_cpld(O) [last unloaded: prestera_pci] [ 529.016676] CPU: 3 PID: 53 Comm: kworker/u8:3 Tainted: G W O 6.4.5 #1 [ 529.024368] Hardware name: delta,tn48m-dn (DT) [ 529.028830] Workqueue: prestera_ordered __prestera_router_fib_event_work [prestera] [ 529.036539] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 529.043533] pc : __prestera_fi_is_direct+0x2c/0x68 [prestera] [ 529.049318] lr : __prestera_k_arb_fc_apply+0x280/0x2f8 [prestera] [ 529.055452] sp : ffff80000b20bc60 [ 529.058781] x29: ffff80000b20bc60 x28: 0000000000000000 x27: ffff0001374d3a48 [ 529.065953] x26: ffff000105604000 x25: ffff000134af8a28 x24: ffff0001374d3800 [ 529.073126] x23: ffff000101c89148 x22: ffff000101c89148 x21: ffff00013641fda0 [ 529.080299] x20: ffff000101c89000 x19: ffff000101c89020 x18: 0000000000000059 [ 529.087471] x17: 0000000000000277 x16: 0000000000000000 x15: 0000000000000000 [ 529.094642] x14: 0000000000000003 x13: 00000000000fe400 x12: 0000000000000000 [ 529.101814] x11: 0000000000000002 x10: 0000000000000aa0 x9 : ffff8000013cee80 [ 529.108985] x8 : 0000000000000018 x7 : 000000007b1703f8 x6 : 0000000000000018 [ 529.116157] x5 : 00000000d3497eb6 x4 : ffff000105604081 x3 : 000000008e979557 [ 529.123329] x2 : 0000000000000000 x1 : ffff00010200de00 x0 : ffff000134ae3f80 [ 529.130501] Call trace: [ 529.132958] __prestera_fi_is_direct+0x2c/0x68 [prestera] [ 529.138394] prestera_k_arb_fib_evt+0x6b8/0xd50 [prestera] [ 529.143918] __prestera_router_fib_event_work+0x100/0x158 [prestera] [ 529.150313] process_one_work+0x208/0x488 [ 529.154348] worker_thread+0x4c/0x430 [ 529.158030] kthread+0x120/0x138 [ 529.161274] ret_from_fork+0x10/0x20 [ 529.164867] ---[ end trace 0000000000000000 ]--- and results in a non offloaded route: $ ip route 10.0.0.0/24 dev swp5 proto kernel scope link src 10.0.0.1 rt_trap 10.0.1.0/24 nhid 20 via 10.0.0.2 dev swp5 rt_trap When creating a route referencing a nexthop via its ID, the nexthop will be stored in a separate nh pointer instead of the array of nexthops in the fib_info struct. This causes issues since fib_info_nh() only handles the nexthops array, but not the separate nh pointer, and will loudly WARN about it. In contrast fib_info_nhc() handles both, but returns a fib_nh_common pointer instead of a fib_nh pointer. Luckily we only ever access fields from the fib_nh_common parts, so we can just replace all instances of fib_info_nh() with fib_info_nhc() and access the fields via their fib_nh_common names. This allows handling IPv4 routes with an external nexthop, and they now get offloaded as expected: $ ip route 10.0.0.0/24 dev swp5 proto kernel scope link src 10.0.0.1 rt_trap 10.0.1.0/24 nhid 20 via 10.0.0.2 dev swp5 offload rt_offload Fixes: 396b80cb5cc8 ("net: marvell: prestera: Add neighbour cache accounting") Signed-off-by: Jonas Gorski Acked-by: Elad Nachman Link: https://lore.kernel.org/r/20230804101220.247515-1-jonas.gorski@bisdn.de Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/prestera/prestera_router.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c index a9a1028cb17b..de317179a7dc 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_router.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c @@ -166,11 +166,11 @@ prestera_util_neigh2nc_key(struct prestera_switch *sw, struct neighbour *n, static bool __prestera_fi_is_direct(struct fib_info *fi) { - struct fib_nh *fib_nh; + struct fib_nh_common *fib_nhc; if (fib_info_num_path(fi) == 1) { - fib_nh = fib_info_nh(fi, 0); - if (fib_nh->fib_nh_gw_family == AF_UNSPEC) + fib_nhc = fib_info_nhc(fi, 0); + if (fib_nhc->nhc_gw_family == AF_UNSPEC) return true; } @@ -261,7 +261,7 @@ static bool __prestera_util_kern_n_is_reachable_v4(u32 tb_id, __be32 *addr, struct net_device *dev) { - struct fib_nh *fib_nh; + struct fib_nh_common *fib_nhc; struct fib_result res; bool reachable; @@ -269,8 +269,8 @@ __prestera_util_kern_n_is_reachable_v4(u32 tb_id, __be32 *addr, if (!prestera_util_kern_get_route(&res, tb_id, addr)) if (prestera_fi_is_direct(res.fi)) { - fib_nh = fib_info_nh(res.fi, 0); - if (dev == fib_nh->fib_nh_dev) + fib_nhc = fib_info_nhc(res.fi, 0); + if (dev == fib_nhc->nhc_dev) reachable = true; } @@ -324,7 +324,7 @@ prestera_kern_fib_info_nhc(struct fib_notifier_info *info, int n) if (info->family == AF_INET) { fen4_info = container_of(info, struct fib_entry_notifier_info, info); - return &fib_info_nh(fen4_info->fi, n)->nh_common; + return fib_info_nhc(fen4_info->fi, n); } else if (info->family == AF_INET6) { fen6_info = container_of(info, struct fib6_entry_notifier_info, info); From 07d698324110339b420deebab7a7805815340b4f Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 2 Aug 2023 18:34:30 +0200 Subject: [PATCH 164/230] wifi: brcm80211: handle params_v1 allocation failure Return -ENOMEM from brcmf_run_escan() if kzalloc() fails for v1 params. Fixes: 398ce273d6b1 ("wifi: brcmfmac: cfg80211: Add support for scan params v2") Signed-off-by: Petr Tesarik Link: https://lore.kernel.org/r/20230802163430.1656-1-petrtesarik@huaweicloud.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index de8a2e27f49c..2a90bb24ba77 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -1456,6 +1456,10 @@ brcmf_run_escan(struct brcmf_cfg80211_info *cfg, struct brcmf_if *ifp, params_size -= BRCMF_SCAN_PARAMS_V2_FIXED_SIZE; params_size += BRCMF_SCAN_PARAMS_FIXED_SIZE; params_v1 = kzalloc(params_size, GFP_KERNEL); + if (!params_v1) { + err = -ENOMEM; + goto exit_params; + } params_v1->version = cpu_to_le32(BRCMF_ESCAN_REQ_VERSION); brcmf_scan_params_v2_to_v1(¶ms->params_v2_le, ¶ms_v1->params_le); kfree(params); @@ -1473,6 +1477,7 @@ brcmf_run_escan(struct brcmf_cfg80211_info *cfg, struct brcmf_if *ifp, bphy_err(drvr, "error (%d)\n", err); } +exit_params: kfree(params); exit: return err; From 6a67fe45fe3fffb0721ba068e21103b94a1e57a0 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Fri, 4 Aug 2023 17:24:37 -0500 Subject: [PATCH 165/230] MAINTAINERS: Update entry for rtl8187 As Herton Ronaldo Krzesinski is no longer active, remove him as maintainer for rtl8187. The git tree entry is also removed. Signed-off-by: Larry Finger Link: https://lore.kernel.org/r/20230804222438.16076-2-Larry.Finger@lwfinger.net Signed-off-by: Johannes Berg --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 71edae152e3a..af4913fbade4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18498,13 +18498,11 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.g F: drivers/net/wireless/realtek/rtl818x/rtl8180/ RTL8187 WIRELESS DRIVER -M: Herton Ronaldo Krzesinski -M: Hin-Tak Leung +M: Hin-Tak Leung M: Larry Finger L: linux-wireless@vger.kernel.org S: Maintained W: https://wireless.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git F: drivers/net/wireless/realtek/rtl818x/rtl8187/ RTL8XXXU WIRELESS DRIVER (rtl8xxxu) From 017e9420c1ca19bc169e20fa749709723eaf1eb7 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Fri, 4 Aug 2023 17:24:38 -0500 Subject: [PATCH 166/230] MAINTAINERS: Remove tree entry for rtl8180 This entry is not needed. Remove it. Signed-off-by: Larry Finger Link: https://lore.kernel.org/r/20230804222438.16076-3-Larry.Finger@lwfinger.net Signed-off-by: Johannes Berg --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index af4913fbade4..08396bdd41b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18494,7 +18494,6 @@ RTL8180 WIRELESS DRIVER L: linux-wireless@vger.kernel.org S: Orphan W: https://wireless.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git F: drivers/net/wireless/realtek/rtl818x/rtl8180/ RTL8187 WIRELESS DRIVER From b74bb07cdab6859e1a3fc9fe7351052176322ddf Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Tue, 8 Aug 2023 08:54:26 +0800 Subject: [PATCH 167/230] wifi: rtw89: fix 8852AE disconnection caused by RX full flags RX full flags are raised if certain types of RX FIFO are full, and then drop all following MPDU of AMPDU. In order to resume to receive MPDU when RX FIFO becomes available, we clear the register bits by the commit a0d99ebb3ecd ("wifi: rtw89: initialize DMA of CMAC"). But, 8852AE needs more settings to support this. To quickly fix disconnection problem, revert the behavior as before. Fixes: a0d99ebb3ecd ("wifi: rtw89: initialize DMA of CMAC") Reported-by: Damian B Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217710 Cc: Signed-off-by: Ping-Ke Shih Tested-by: Damian B Link: https://lore.kernel.org/r/20230808005426.5327-1-pkshih@realtek.com Signed-off-by: Johannes Berg --- drivers/net/wireless/realtek/rtw89/mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw89/mac.c b/drivers/net/wireless/realtek/rtw89/mac.c index b114babec698..c93e6250cb8b 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.c +++ b/drivers/net/wireless/realtek/rtw89/mac.c @@ -2524,7 +2524,7 @@ static int cmac_dma_init(struct rtw89_dev *rtwdev, u8 mac_idx) u32 reg; int ret; - if (chip_id != RTL8852A && chip_id != RTL8852B) + if (chip_id != RTL8852B) return 0; ret = rtw89_mac_check_mac_en(rtwdev, mac_idx, RTW89_CMAC_SEL); From 5fb9a9fb71a33be61d7d8e8ba4597bfb18d604d0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 22 Jun 2023 18:59:19 +0200 Subject: [PATCH 168/230] wifi: cfg80211: fix sband iftype data lookup for AP_VLAN AP_VLAN interfaces are virtual, so doesn't really exist as a type for capabilities. When passed in as a type, AP is the one that's really intended. Fixes: c4cbaf7973a7 ("cfg80211: Add support for HE") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20230622165919.46841-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7c7d03aa9d06..d6fa7c8767ad 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -562,6 +562,9 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) return NULL; + if (iftype == NL80211_IFTYPE_AP_VLAN) + iftype = NL80211_IFTYPE_AP; + for (i = 0; i < sband->n_iftype_data; i++) { const struct ieee80211_sband_iftype_data *data = &sband->iftype_data[i]; From c0b067588a4836b762cfc6a4c83f122ca1dbb93a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 1 Aug 2023 18:42:47 -0300 Subject: [PATCH 169/230] Revert "perf report: Append inlines to non-DWARF callchains" This reverts commit 46d21ec067490ab9cdcc89b9de5aae28786a8b8e. The tests were made with a specific workload, further tests on a recently updated fedora 38 system with a system wide perf.data file shows 'perf report' taking excessive time resolving inlines in vmlinux, so lets revert this until a full investigation and improvement on the addr2line support code is made. Reported-by: Jesper Dangaard Brouer Acked-by: Artem Savkov Tested-by: Jesper Dangaard Brouer Cc: Andrii Nakryiko Cc: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Peter Zijlstra Link: https://lore.kernel.org/r/ZMl8VyhdwhClTM5g@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 4e62843d51b7..f4cb41ee23cd 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -45,7 +45,6 @@ static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd, struct thread *th, bool lock); -static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip); static struct dso *machine__kernel_dso(struct machine *machine) { @@ -2385,10 +2384,6 @@ static int add_callchain_ip(struct thread *thread, ms.maps = maps__get(al.maps); ms.map = map__get(al.map); ms.sym = al.sym; - - if (!branch && append_inlines(cursor, &ms, ip) == 0) - goto out; - srcline = callchain_srcline(&ms, al.addr); err = callchain_cursor_append(cursor, ip, &ms, branch, flags, nr_loop_iter, From 8cdd4aeff2e858c95bb088409028893cfb4e53d4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 7 Aug 2023 10:41:42 -0300 Subject: [PATCH 170/230] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: 522b1d69219d8f08 ("x86/cpu/amd: Add a Zenbleed fix") That cause no changes to tooling: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after $ Just silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Adrian Hunter Cc: Borislav Petkov (AMD) Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZND17H7BI4ariERn@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 3aedae61af4f..a00a53e15ab7 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -545,6 +545,7 @@ #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) +#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9 #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 From 487ae3b42d1040b4cd5ff9754e7516b409204029 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 1 Aug 2023 13:54:52 -0700 Subject: [PATCH 171/230] perf stat: Don't display zero tool counts Andi reported (see link below) a regression when printing the 'duration_time' tool event, where it gets printed as "not counted" for most of the CPUs, fix it by skipping zero counts for tool events. Reported-by: Andi Kleen Signed-off-by: Ian Rogers Tested-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Rajeev Cc: Claire Jensen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/all/ZMlrzcVrVi1lTDmn@tassilo/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 7329b3340f88..d45d5dcb0e2b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -931,6 +931,11 @@ static bool should_skip_zero_counter(struct perf_stat_config *config, */ if (config->aggr_mode == AGGR_THREAD && config->system_wide) return true; + + /* Tool events have the software PMU but are only gathered on 1. */ + if (evsel__is_tool(counter)) + return true; + /* * Skip value 0 when it's an uncore event and the given aggr id * does not belong to the PMU cpumask. From fa40ea27ede397cb19b8cb264f136db9c43c6f7e Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 4 Aug 2023 08:00:07 +0300 Subject: [PATCH 172/230] MAINTAINERS: update Claudiu Beznea's email address Update MAINTAINERS entries with a valid email address as the Microchip one is no longer valid. Acked-by: Conor Dooley Acked-by: Nicolas Ferre Signed-off-by: Claudiu Beznea Acked-by: Sebastian Reichel Link: https://lore.kernel.org/r/20230804050007.235799-1-claudiu.beznea@tuxon.dev Signed-off-by: Jakub Kicinski --- MAINTAINERS | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 92d23da63638..fa3e8ff79bf0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2339,7 +2339,7 @@ F: drivers/phy/mediatek/ ARM/MICROCHIP (ARM64) SoC support M: Conor Dooley M: Nicolas Ferre -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported T: git https://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git @@ -2348,7 +2348,7 @@ F: arch/arm64/boot/dts/microchip/ ARM/Microchip (AT91) SoC support M: Nicolas Ferre M: Alexandre Belloni -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported W: http://www.linux4sam.org @@ -3250,7 +3250,7 @@ F: include/uapi/linux/atm* ATMEL MACB ETHERNET DRIVER M: Nicolas Ferre -M: Claudiu Beznea +M: Claudiu Beznea S: Supported F: drivers/net/ethernet/cadence/ @@ -13785,7 +13785,7 @@ F: Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml F: drivers/spi/spi-at91-usart.c MICROCHIP AUDIO ASOC DRIVERS -M: Claudiu Beznea +M: Claudiu Beznea L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/sound/atmel* @@ -13808,7 +13808,7 @@ S: Maintained F: drivers/crypto/atmel-ecc.* MICROCHIP EIC DRIVER -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/interrupt-controller/microchip,sama7g5-eic.yaml @@ -13881,7 +13881,7 @@ F: drivers/video/fbdev/atmel_lcdfb.c F: include/video/atmel_lcdc.h MICROCHIP MCP16502 PMIC DRIVER -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/regulator/mcp16502-regulator.txt @@ -13908,7 +13908,7 @@ F: Documentation/devicetree/bindings/mtd/atmel-nand.txt F: drivers/mtd/nand/raw/atmel/* MICROCHIP OTPC DRIVER -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml @@ -13947,7 +13947,7 @@ F: Documentation/devicetree/bindings/fpga/microchip,mpf-spi-fpga-mgr.yaml F: drivers/fpga/microchip-spi.c MICROCHIP PWM DRIVER -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-pwm@vger.kernel.org S: Supported @@ -13963,7 +13963,7 @@ F: drivers/iio/adc/at91-sama5d2_adc.c F: include/dt-bindings/iio/adc/at91-sama5d2_adc.h MICROCHIP SAMA5D2-COMPATIBLE SHUTDOWN CONTROLLER -M: Claudiu Beznea +M: Claudiu Beznea S: Supported F: Documentation/devicetree/bindings/power/reset/atmel,sama5d2-shdwc.yaml F: drivers/power/reset/at91-sama5d2_shdwc.c @@ -13980,7 +13980,7 @@ S: Supported F: drivers/spi/spi-atmel.* MICROCHIP SSC DRIVER -M: Claudiu Beznea +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/misc/atmel-ssc.txt @@ -14009,7 +14009,7 @@ F: drivers/usb/gadget/udc/atmel_usba_udc.* MICROCHIP WILC1000 WIFI DRIVER M: Ajay Singh -M: Claudiu Beznea +M: Claudiu Beznea L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/microchip/wilc1000/ From b6f79e826fbd26e91e2fb28070484634cacdeb26 Mon Sep 17 00:00:00 2001 From: David Rheinsberg Date: Mon, 7 Aug 2023 10:12:25 +0200 Subject: [PATCH 173/230] net/unix: use consistent error code in SO_PEERPIDFD Change the new (unreleased) SO_PEERPIDFD sockopt to return ENODATA rather than ESRCH if a socket type does not support remote peer-PID queries. Currently, SO_PEERPIDFD returns ESRCH when the socket in question is not an AF_UNIX socket. This is quite unexpected, given that one would assume ESRCH means the peer process already exited and thus cannot be found. However, in that case the sockopt actually returns EINVAL (via pidfd_prepare()). This is rather inconsistent with other syscalls, which usually return ESRCH if a given PID refers to a non-existant process. This changes SO_PEERPIDFD to return ENODATA instead. This is also what SO_PEERGROUPS returns, and thus keeps a consistent behavior across sockopts. Note that this code is returned in 2 cases: First, if the socket type is not AF_UNIX, and secondly if the socket was not yet connected. In both cases ENODATA seems suitable. Signed-off-by: David Rheinsberg Reviewed-by: Christian Brauner Acked-by: Luca Boccassi Fixes: 7b26952a91cf ("net: core: add getsockopt SO_PEERPIDFD") Link: https://lore.kernel.org/r/20230807081225.816199-1-david@readahead.eu Signed-off-by: Jakub Kicinski --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 6d4f28efe29a..732fc37a4771 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1778,7 +1778,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, spin_unlock(&sk->sk_peer_lock); if (!peer_pid) - return -ESRCH; + return -ENODATA; pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); put_pid(peer_pid); From 15159ec0c831b565820c2de05114ea1b4cf07681 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 7 Aug 2023 19:34:49 +0800 Subject: [PATCH 174/230] net: hns3: restore user pause configure when disable autoneg Restore the mac pause state to user configuration when autoneg is disabled Signed-off-by: Jian Shen Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230807113452.474224-2-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 5 ++++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index bf675c15fbb9..5594b8dd1e1d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10915,9 +10915,12 @@ int hclge_cfg_flowctrl(struct hclge_dev *hdev) u32 rx_pause, tx_pause; u8 flowctl; - if (!phydev->link || !phydev->autoneg) + if (!phydev->link) return 0; + if (!phydev->autoneg) + return hclge_mac_pause_setup_hw(hdev); + local_advertising = linkmode_adv_to_lcl_adv_t(phydev->advertising); if (phydev->pause) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index de509e5751a7..c58c31221762 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -1553,7 +1553,7 @@ static int hclge_bp_setup_hw(struct hclge_dev *hdev, u8 tc) return 0; } -static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev) +int hclge_mac_pause_setup_hw(struct hclge_dev *hdev) { bool tx_en, rx_en; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 45dcfef3f90c..53eec6df5194 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -245,6 +245,7 @@ int hclge_pfc_pause_en_cfg(struct hclge_dev *hdev, u8 tx_rx_bitmap, u8 pfc_bitmap); int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx); int hclge_pause_addr_cfg(struct hclge_dev *hdev, const u8 *mac_addr); +int hclge_mac_pause_setup_hw(struct hclge_dev *hdev); void hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats); void hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats); int hclge_tm_qs_shaper_cfg(struct hclge_vport *vport, int max_tx_rate); From 08469dacfad25428b66549716811807203744f4f Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 7 Aug 2023 19:34:50 +0800 Subject: [PATCH 175/230] net: hns3: refactor hclge_mac_link_status_wait for interface reuse Some nic configurations could only be performed after link is down. So this patch refactor this API for reuse. Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230807113452.474224-3-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 5594b8dd1e1d..b440e42e1d9c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -72,6 +72,8 @@ static void hclge_restore_hw_table(struct hclge_dev *hdev); static void hclge_sync_promisc_mode(struct hclge_dev *hdev); static void hclge_sync_fd_table(struct hclge_dev *hdev); static void hclge_update_fec_stats(struct hclge_dev *hdev); +static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, + int wait_cnt); static struct hnae3_ae_algo ae_algo; @@ -7647,10 +7649,9 @@ static void hclge_phy_link_status_wait(struct hclge_dev *hdev, } while (++i < HCLGE_PHY_LINK_STATUS_NUM); } -static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret) +static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, + int wait_cnt) { -#define HCLGE_MAC_LINK_STATUS_NUM 100 - int link_status; int i = 0; int ret; @@ -7663,13 +7664,15 @@ static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret) return 0; msleep(HCLGE_LINK_STATUS_MS); - } while (++i < HCLGE_MAC_LINK_STATUS_NUM); + } while (++i < wait_cnt); return -EBUSY; } static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en, bool is_phy) { +#define HCLGE_MAC_LINK_STATUS_NUM 100 + int link_ret; link_ret = en ? HCLGE_LINK_STATUS_UP : HCLGE_LINK_STATUS_DOWN; @@ -7677,7 +7680,8 @@ static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en, if (is_phy) hclge_phy_link_status_wait(hdev, link_ret); - return hclge_mac_link_status_wait(hdev, link_ret); + return hclge_mac_link_status_wait(hdev, link_ret, + HCLGE_MAC_LINK_STATUS_NUM); } static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en) From 6265e242f7b95f2c1195b42ec912b84ad161470e Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 7 Aug 2023 19:34:51 +0800 Subject: [PATCH 176/230] net: hns3: add wait until mac link down In some configure flow of hns3 driver, for example, change mtu, it will disable MAC through firmware before configuration. But firmware disables MAC asynchronously. The rx traffic may be not stopped in this case. So fixes it by waiting until mac link is down. Fixes: a9775bb64aa7 ("net: hns3: fix set and get link ksettings issue") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230807113452.474224-4-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b440e42e1d9c..a940e35aef29 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7560,6 +7560,8 @@ static void hclge_enable_fd(struct hnae3_handle *handle, bool enable) static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable) { +#define HCLGE_LINK_STATUS_WAIT_CNT 3 + struct hclge_desc desc; struct hclge_config_mac_mode_cmd *req = (struct hclge_config_mac_mode_cmd *)desc.data; @@ -7584,9 +7586,15 @@ static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable) req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en); ret = hclge_cmd_send(&hdev->hw, &desc, 1); - if (ret) + if (ret) { dev_err(&hdev->pdev->dev, "mac enable fail, ret =%d.\n", ret); + return; + } + + if (!enable) + hclge_mac_link_status_wait(hdev, HCLGE_LINK_STATUS_DOWN, + HCLGE_LINK_STATUS_WAIT_CNT); } static int hclge_config_switch_param(struct hclge_dev *hdev, int vfid, From ac6257a3ae5db5193b1f19c268e4f72d274ddb88 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Mon, 7 Aug 2023 19:34:52 +0800 Subject: [PATCH 177/230] net: hns3: fix deadlock issue when externel_lb and reset are executed together When externel_lb and reset are executed together, a deadlock may occur: [ 3147.217009] INFO: task kworker/u321:0:7 blocked for more than 120 seconds. [ 3147.230483] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 3147.238999] task:kworker/u321:0 state:D stack: 0 pid: 7 ppid: 2 flags:0x00000008 [ 3147.248045] Workqueue: hclge hclge_service_task [hclge] [ 3147.253957] Call trace: [ 3147.257093] __switch_to+0x7c/0xbc [ 3147.261183] __schedule+0x338/0x6f0 [ 3147.265357] schedule+0x50/0xe0 [ 3147.269185] schedule_preempt_disabled+0x18/0x24 [ 3147.274488] __mutex_lock.constprop.0+0x1d4/0x5dc [ 3147.279880] __mutex_lock_slowpath+0x1c/0x30 [ 3147.284839] mutex_lock+0x50/0x60 [ 3147.288841] rtnl_lock+0x20/0x2c [ 3147.292759] hclge_reset_prepare+0x68/0x90 [hclge] [ 3147.298239] hclge_reset_subtask+0x88/0xe0 [hclge] [ 3147.303718] hclge_reset_service_task+0x84/0x120 [hclge] [ 3147.309718] hclge_service_task+0x2c/0x70 [hclge] [ 3147.315109] process_one_work+0x1d0/0x490 [ 3147.319805] worker_thread+0x158/0x3d0 [ 3147.324240] kthread+0x108/0x13c [ 3147.328154] ret_from_fork+0x10/0x18 In externel_lb process, the hns3 driver call napi_disable() first, then the reset happen, then the restore process of the externel_lb will fail, and will not call napi_enable(). When doing externel_lb again, napi_disable() will be double call, cause a deadlock of rtnl_lock(). This patch use the HNS3_NIC_STATE_DOWN state to protect the calling of napi_disable() and napi_enable() in externel_lb process, just as the usage in ndo_stop() and ndo_start(). Fixes: 04b6ba143521 ("net: hns3: add support for external loopback test") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230807113452.474224-5-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 9f6890059666..b7b51e56b030 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5854,6 +5854,9 @@ void hns3_external_lb_prepare(struct net_device *ndev, bool if_running) if (!if_running) return; + if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state)) + return; + netif_carrier_off(ndev); netif_tx_disable(ndev); @@ -5882,7 +5885,16 @@ void hns3_external_lb_restore(struct net_device *ndev, bool if_running) if (!if_running) return; - hns3_nic_reset_all_ring(priv->ae_handle); + if (hns3_nic_resetting(ndev)) + return; + + if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state)) + return; + + if (hns3_nic_reset_all_ring(priv->ae_handle)) + return; + + clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); for (i = 0; i < priv->vector_num; i++) hns3_vector_enable(&priv->tqp_vector[i]); From 06b412589eef780b792e73df131d35dc43cc4a49 Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Mon, 7 Aug 2023 13:51:29 -0700 Subject: [PATCH 178/230] igc: Add lock to safeguard global Qbv variables Access to shared variables through hrtimer requires locking in order to protect the variables because actions to write into these variables (oper_gate_closed, admin_gate_closed, and qbv_transition) might potentially occur simultaneously. This patch provides a locking mechanisms to avoid such scenarios. Fixes: 175c241288c0 ("igc: Fix TX Hang issue when QBV Gate is closed") Suggested-by: Leon Romanovsky Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230807205129.3129346-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc.h | 4 +++ drivers/net/ethernet/intel/igc/igc_main.c | 34 +++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 9db384f66a8e..38901d2a4680 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -195,6 +195,10 @@ struct igc_adapter { u32 qbv_config_change_errors; bool qbv_transition; unsigned int qbv_count; + /* Access to oper_gate_closed, admin_gate_closed and qbv_transition + * are protected by the qbv_tx_lock. + */ + spinlock_t qbv_tx_lock; /* OS defined structs */ struct pci_dev *pdev; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index bdeb36790d77..6f557e843e49 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -4801,6 +4801,7 @@ static int igc_sw_init(struct igc_adapter *adapter) adapter->nfc_rule_count = 0; spin_lock_init(&adapter->stats64_lock); + spin_lock_init(&adapter->qbv_tx_lock); /* Assume MSI-X interrupts, will be checked during IRQ allocation */ adapter->flags |= IGC_FLAG_HAS_MSIX; @@ -6119,15 +6120,15 @@ static int igc_tsn_enable_launchtime(struct igc_adapter *adapter, return igc_tsn_offload_apply(adapter); } -static int igc_tsn_clear_schedule(struct igc_adapter *adapter) +static int igc_qbv_clear_schedule(struct igc_adapter *adapter) { + unsigned long flags; int i; adapter->base_time = 0; adapter->cycle_time = NSEC_PER_SEC; adapter->taprio_offload_enable = false; adapter->qbv_config_change_errors = 0; - adapter->qbv_transition = false; adapter->qbv_count = 0; for (i = 0; i < adapter->num_tx_queues; i++) { @@ -6136,10 +6137,28 @@ static int igc_tsn_clear_schedule(struct igc_adapter *adapter) ring->start_time = 0; ring->end_time = NSEC_PER_SEC; ring->max_sdu = 0; + } + + spin_lock_irqsave(&adapter->qbv_tx_lock, flags); + + adapter->qbv_transition = false; + + for (i = 0; i < adapter->num_tx_queues; i++) { + struct igc_ring *ring = adapter->tx_ring[i]; + ring->oper_gate_closed = false; ring->admin_gate_closed = false; } + spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags); + + return 0; +} + +static int igc_tsn_clear_schedule(struct igc_adapter *adapter) +{ + igc_qbv_clear_schedule(adapter); + return 0; } @@ -6150,6 +6169,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, struct igc_hw *hw = &adapter->hw; u32 start_time = 0, end_time = 0; struct timespec64 now; + unsigned long flags; size_t n; int i; @@ -6217,6 +6237,8 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, start_time += e->interval; } + spin_lock_irqsave(&adapter->qbv_tx_lock, flags); + /* Check whether a queue gets configured. * If not, set the start and end time to be end time. */ @@ -6241,6 +6263,8 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, } } + spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags); + for (i = 0; i < adapter->num_tx_queues; i++) { struct igc_ring *ring = adapter->tx_ring[i]; struct net_device *dev = adapter->netdev; @@ -6619,8 +6643,11 @@ static enum hrtimer_restart igc_qbv_scheduling_timer(struct hrtimer *timer) { struct igc_adapter *adapter = container_of(timer, struct igc_adapter, hrtimer); + unsigned long flags; unsigned int i; + spin_lock_irqsave(&adapter->qbv_tx_lock, flags); + adapter->qbv_transition = true; for (i = 0; i < adapter->num_tx_queues; i++) { struct igc_ring *tx_ring = adapter->tx_ring[i]; @@ -6633,6 +6660,9 @@ static enum hrtimer_restart igc_qbv_scheduling_timer(struct hrtimer *timer) } } adapter->qbv_transition = false; + + spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags); + return HRTIMER_NORESTART; } From 0fb1d8eb234b6979d4981d2d385780dd7d8d9771 Mon Sep 17 00:00:00 2001 From: Piotr Gardocki Date: Mon, 7 Aug 2023 13:50:11 -0700 Subject: [PATCH 179/230] iavf: fix potential races for FDIR filters Add fdir_fltr_lock locking in unprotected places. The change in iavf_fdir_is_dup_fltr adds a spinlock around a loop which iterates over all filters and looks for a duplicate. The filter can be removed from list and freed from memory at the same time it's being compared. All other places where filters are deleted are already protected with spinlock. The remaining changes protect adapter->fdir_active_fltr variable so now all its uses are under a spinlock. Fixes: 527691bf0682 ("iavf: Support IPv4 Flow Director filters") Signed-off-by: Piotr Gardocki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230807205011.3129224-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 5 ++++- drivers/net/ethernet/intel/iavf/iavf_fdir.c | 11 ++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 2f47cfa7f06e..460ca561819a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1401,14 +1401,15 @@ static int iavf_add_fdir_ethtool(struct iavf_adapter *adapter, struct ethtool_rx if (fsp->flow_type & FLOW_MAC_EXT) return -EINVAL; + spin_lock_bh(&adapter->fdir_fltr_lock); if (adapter->fdir_active_fltr >= IAVF_MAX_FDIR_FILTERS) { + spin_unlock_bh(&adapter->fdir_fltr_lock); dev_err(&adapter->pdev->dev, "Unable to add Flow Director filter because VF reached the limit of max allowed filters (%u)\n", IAVF_MAX_FDIR_FILTERS); return -ENOSPC; } - spin_lock_bh(&adapter->fdir_fltr_lock); if (iavf_find_fdir_fltr_by_loc(adapter, fsp->location)) { dev_err(&adapter->pdev->dev, "Failed to add Flow Director filter, it already exists\n"); spin_unlock_bh(&adapter->fdir_fltr_lock); @@ -1781,7 +1782,9 @@ static int iavf_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, case ETHTOOL_GRXCLSRLCNT: if (!FDIR_FLTR_SUPPORT(adapter)) break; + spin_lock_bh(&adapter->fdir_fltr_lock); cmd->rule_cnt = adapter->fdir_active_fltr; + spin_unlock_bh(&adapter->fdir_fltr_lock); cmd->data = IAVF_MAX_FDIR_FILTERS; ret = 0; break; diff --git a/drivers/net/ethernet/intel/iavf/iavf_fdir.c b/drivers/net/ethernet/intel/iavf/iavf_fdir.c index 6146203efd84..505e82ebafe4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_fdir.c +++ b/drivers/net/ethernet/intel/iavf/iavf_fdir.c @@ -722,7 +722,9 @@ void iavf_print_fdir_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *f bool iavf_fdir_is_dup_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *fltr) { struct iavf_fdir_fltr *tmp; + bool ret = false; + spin_lock_bh(&adapter->fdir_fltr_lock); list_for_each_entry(tmp, &adapter->fdir_list_head, list) { if (tmp->flow_type != fltr->flow_type) continue; @@ -732,11 +734,14 @@ bool iavf_fdir_is_dup_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr * !memcmp(&tmp->ip_data, &fltr->ip_data, sizeof(fltr->ip_data)) && !memcmp(&tmp->ext_data, &fltr->ext_data, - sizeof(fltr->ext_data))) - return true; + sizeof(fltr->ext_data))) { + ret = true; + break; + } } + spin_unlock_bh(&adapter->fdir_fltr_lock); - return false; + return ret; } /** From 1a8c251cff2052b60009a070173308322e9600d3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Aug 2023 16:58:56 +0300 Subject: [PATCH 180/230] PCI: move OF status = "disabled" detection to dev->match_driver The blamed commit has broken probing on arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi when &enetc_port0 (PCI function 0) has status = "disabled". Background: pci_scan_slot() has logic to say that if the function 0 of a device is absent, the entire device is absent and we can skip the other functions entirely. Traditionally, this has meant that pci_bus_read_dev_vendor_id() returns an error code for that function. However, since the blamed commit, there is an extra confounding condition: function 0 of the device exists and has a valid vendor id, but it is disabled in the device tree. In that case, pci_scan_slot() would incorrectly skip the entire device instead of just that function. In the case of NXP LS1028A, status = "disabled" does not mean that the PCI function's config space is not available for reading. It is, but the Ethernet port is just not functionally useful with a particular SerDes protocol configuration (0x9999) due to pinmuxing constraints of the Soc. So, pci_scan_slot() skips all other functions on the ENETC ECAM (enetc_port1, enetc_port2, enetc_mdio_pf3 etc) when just enetc_port0 had to not be probed. There is an additional regression introduced by the change, caused by its fundamental premise. The enetc driver needs to run code for all PCI functions, regardless of whether they're enabled or not in the device tree. That is no longer possible if the driver's probe function is no longer called. But Rob recommends that we move the of_device_is_available() detection to dev->match_driver, and this makes the PCI fixups still run on all functions, while just probing drivers for those functions that are enabled. So, a separate change in the enetc driver will have to move the workarounds to a PCI fixup. Fixes: 6fffbc7ae137 ("PCI: Honor firmware's device disabled status") Link: https://lore.kernel.org/netdev/CAL_JsqLsVYiPLx2kcHkDQ4t=hQVCR7NHziDwi9cCFUFhx48Qow@mail.gmail.com/ Suggested-by: Rob Herring Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/pci/bus.c | 4 +++- drivers/pci/of.c | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 5bc81cc0a2de..46b252bbe500 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -332,6 +333,7 @@ void __weak pcibios_bus_add_device(struct pci_dev *pdev) { } */ void pci_bus_add_device(struct pci_dev *dev) { + struct device_node *dn = dev->dev.of_node; int retval; /* @@ -344,7 +346,7 @@ void pci_bus_add_device(struct pci_dev *dev) pci_proc_attach_device(dev); pci_bridge_d3_update(dev); - dev->match_driver = true; + dev->match_driver = !dn || of_device_is_available(dn); retval = device_attach(&dev->dev); if (retval < 0 && retval != -EPROBE_DEFER) pci_warn(dev, "device attach failed (%d)\n", retval); diff --git a/drivers/pci/of.c b/drivers/pci/of.c index e51219f9f523..3c158b17dcb5 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -34,11 +34,6 @@ int pci_set_of_node(struct pci_dev *dev) if (!node) return 0; - if (!of_device_is_available(node)) { - of_node_put(node); - return -ENODEV; - } - device_set_node(&dev->dev, of_fwnode_handle(node)); return 0; } From f0168042a21292d20007d24ab2e4fc32f79ebf11 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Aug 2023 16:58:57 +0300 Subject: [PATCH 181/230] net: enetc: reimplement RFS/RSS memory clearing as PCI quirk The workaround implemented in commit 3222b5b613db ("net: enetc: initialize RFS/RSS memories for unused ports too") is no longer effective after commit 6fffbc7ae137 ("PCI: Honor firmware's device disabled status"). Thus, it has introduced a regression and we see AER errors being reported again: $ ip link set sw2p0 up && dhclient -i sw2p0 && ip addr show sw2p0 fsl_enetc 0000:00:00.2 eno2: configuring for fixed/internal link mode fsl_enetc 0000:00:00.2 eno2: Link is Up - 2.5Gbps/Full - flow control rx/tx mscc_felix 0000:00:00.5 swp2: configuring for fixed/sgmii link mode mscc_felix 0000:00:00.5 swp2: Link is Up - 1Gbps/Full - flow control off sja1105 spi2.2 sw2p0: configuring for phy/rgmii-id link mode sja1105 spi2.2 sw2p0: Link is Up - 1Gbps/Full - flow control off pcieport 0000:00:1f.0: AER: Multiple Corrected error received: 0000:00:00.0 pcieport 0000:00:1f.0: AER: can't find device of ID0000 Rob's suggestion is to reimplement the enetc driver workaround as a PCI fixup, and to modify the PCI core to run the fixups for all PCI functions. This change handles the first part. We refactor the common code in enetc_psi_create() and enetc_psi_destroy(), and use the PCI fixup only for those functions for which enetc_pf_probe() won't get called. This avoids some work being done twice for the PFs which are enabled. Fixes: 6fffbc7ae137 ("PCI: Honor firmware's device disabled status") Link: https://lore.kernel.org/netdev/CAL_JsqLsVYiPLx2kcHkDQ4t=hQVCR7NHziDwi9cCFUFhx48Qow@mail.gmail.com/ Suggested-by: Rob Herring Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/ethernet/freescale/enetc/enetc_pf.c | 113 ++++++++++++------ 1 file changed, 78 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 1416262d4296..d52ac0b6add6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -1208,6 +1208,59 @@ static int enetc_pf_register_with_ierb(struct pci_dev *pdev) return enetc_ierb_register_pf(ierb_pdev, pdev); } +static struct enetc_si *enetc_psi_create(struct pci_dev *pdev) +{ + struct enetc_si *si; + int err; + + err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(struct enetc_pf)); + if (err) { + dev_err_probe(&pdev->dev, err, "PCI probing failed\n"); + goto out; + } + + si = pci_get_drvdata(pdev); + if (!si->hw.port || !si->hw.global) { + err = -ENODEV; + dev_err(&pdev->dev, "could not map PF space, probing a VF?\n"); + goto out_pci_remove; + } + + err = enetc_setup_cbdr(&pdev->dev, &si->hw, ENETC_CBDR_DEFAULT_SIZE, + &si->cbd_ring); + if (err) + goto out_pci_remove; + + err = enetc_init_port_rfs_memory(si); + if (err) { + dev_err(&pdev->dev, "Failed to initialize RFS memory\n"); + goto out_teardown_cbdr; + } + + err = enetc_init_port_rss_memory(si); + if (err) { + dev_err(&pdev->dev, "Failed to initialize RSS memory\n"); + goto out_teardown_cbdr; + } + + return si; + +out_teardown_cbdr: + enetc_teardown_cbdr(&si->cbd_ring); +out_pci_remove: + enetc_pci_remove(pdev); +out: + return ERR_PTR(err); +} + +static void enetc_psi_destroy(struct pci_dev *pdev) +{ + struct enetc_si *si = pci_get_drvdata(pdev); + + enetc_teardown_cbdr(&si->cbd_ring); + enetc_pci_remove(pdev); +} + static int enetc_pf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -1226,32 +1279,10 @@ static int enetc_pf_probe(struct pci_dev *pdev, "Could not register with IERB driver: %pe, please update the device tree\n", ERR_PTR(err)); - err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(*pf)); - if (err) - return dev_err_probe(&pdev->dev, err, "PCI probing failed\n"); - - si = pci_get_drvdata(pdev); - if (!si->hw.port || !si->hw.global) { - err = -ENODEV; - dev_err(&pdev->dev, "could not map PF space, probing a VF?\n"); - goto err_map_pf_space; - } - - err = enetc_setup_cbdr(&pdev->dev, &si->hw, ENETC_CBDR_DEFAULT_SIZE, - &si->cbd_ring); - if (err) - goto err_setup_cbdr; - - err = enetc_init_port_rfs_memory(si); - if (err) { - dev_err(&pdev->dev, "Failed to initialize RFS memory\n"); - goto err_init_port_rfs; - } - - err = enetc_init_port_rss_memory(si); - if (err) { - dev_err(&pdev->dev, "Failed to initialize RSS memory\n"); - goto err_init_port_rss; + si = enetc_psi_create(pdev); + if (IS_ERR(si)) { + err = PTR_ERR(si); + goto err_psi_create; } if (node && !of_device_is_available(node)) { @@ -1339,15 +1370,10 @@ err_alloc_si_res: si->ndev = NULL; free_netdev(ndev); err_alloc_netdev: -err_init_port_rss: -err_init_port_rfs: err_device_disabled: err_setup_mac_addresses: - enetc_teardown_cbdr(&si->cbd_ring); -err_setup_cbdr: -err_map_pf_space: - enetc_pci_remove(pdev); - + enetc_psi_destroy(pdev); +err_psi_create: return err; } @@ -1370,13 +1396,30 @@ static void enetc_pf_remove(struct pci_dev *pdev) enetc_free_msix(priv); enetc_free_si_resources(priv); - enetc_teardown_cbdr(&si->cbd_ring); free_netdev(si->ndev); - enetc_pci_remove(pdev); + enetc_psi_destroy(pdev); } +static void enetc_fixup_clear_rss_rfs(struct pci_dev *pdev) +{ + struct device_node *node = pdev->dev.of_node; + struct enetc_si *si; + + /* Only apply quirk for disabled functions. For the ones + * that are enabled, enetc_pf_probe() will apply it. + */ + if (node && of_device_is_available(node)) + return; + + si = enetc_psi_create(pdev); + if (si) + enetc_psi_destroy(pdev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF, + enetc_fixup_clear_rss_rfs); + static const struct pci_device_id enetc_pf_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF) }, { 0, } /* End of table. */ From bfce089ddd0e440d799717cb7312b44faac47983 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Aug 2023 16:58:58 +0300 Subject: [PATCH 182/230] net: enetc: remove of_device_is_available() handling Since commit 6fffbc7ae137 ("PCI: Honor firmware's device disabled status"), this is redundant and does nothing, because enetc_pf_probe() no longer even gets called. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc_pf.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index d52ac0b6add6..e0a4cb7e3f50 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -1186,14 +1186,9 @@ static int enetc_init_port_rss_memory(struct enetc_si *si) static int enetc_pf_register_with_ierb(struct pci_dev *pdev) { - struct device_node *node = pdev->dev.of_node; struct platform_device *ierb_pdev; struct device_node *ierb_node; - /* Don't register with the IERB if the PF itself is disabled */ - if (!node || !of_device_is_available(node)) - return 0; - ierb_node = of_find_compatible_node(NULL, NULL, "fsl,ls1028a-enetc-ierb"); if (!ierb_node || !of_device_is_available(ierb_node)) @@ -1285,12 +1280,6 @@ static int enetc_pf_probe(struct pci_dev *pdev, goto err_psi_create; } - if (node && !of_device_is_available(node)) { - dev_info(&pdev->dev, "device is disabled, skipping\n"); - err = -ENODEV; - goto err_device_disabled; - } - pf = enetc_si_priv(si); pf->si = si; pf->total_vfs = pci_sriov_get_totalvfs(pdev); @@ -1370,7 +1359,6 @@ err_alloc_si_res: si->ndev = NULL; free_netdev(ndev); err_alloc_netdev: -err_device_disabled: err_setup_mac_addresses: enetc_psi_destroy(pdev); err_psi_create: From 833bac7ec392bf75053c8a4fa4c36d4148dac77d Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Fri, 4 Aug 2023 19:06:23 +0200 Subject: [PATCH 183/230] net/smc: Fix setsockopt and sysctl to specify same buffer size again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") introduced the net.smc.rmem and net.smc.wmem sysctls to specify the size of buffers to be used for SMC type connections. This created a regression for users that specified the buffer size via setsockopt() as the effective buffer size was now doubled. Re-introduce the division by 2 in the SMC buffer create code and level this out by duplicating the net.smc.[rw]mem values used for initializing sk_rcvbuf/sk_sndbuf at socket creation time. This gives users of both methods (setsockopt or sysctl) the effective buffer size that they expect. Initialize net.smc.[rw]mem from its own constant of 64kB, respectively. Internal performance tests show that this value is a good compromise between throughput/latency and memory consumption. Also, this decouples it from any tuning that was done to net.ipv4.tcp_[rw]mem[1] before the module for SMC protocol was loaded. Check that no more than INT_MAX / 2 is assigned to net.smc.[rw]mem, in order to avoid any overflow condition when that is doubled for use in sk_sndbuf or sk_rcvbuf. While at it, drop the confusing sk_buf_size variable from __smc_buf_create and name "compressed" buffer size variables more consistently. Background: Before the commit mentioned above, SMC's buffer allocator in __smc_buf_create() always used half of the sockets' sk_rcvbuf/sk_sndbuf value as initial value to search for appropriate buffers. If the search resorted to using a bigger buffer when all buffers of the specified size were busy, the duplicate of the used effective buffer size is stored back to sk_rcvbuf/sk_sndbuf. When available, buffers of exactly the size that a user had specified as input to setsockopt() were used, despite setsockopt()'s documentation in "man 7 socket" talking of a mandatory duplication: [...] SO_SNDBUF Sets or gets the maximum socket send buffer in bytes. The kernel doubles this value (to allow space for book‐ keeping overhead) when it is set using setsockopt(2), and this doubled value is returned by getsockopt(2). The default value is set by the /proc/sys/net/core/wmem_default file and the maximum allowed value is set by the /proc/sys/net/core/wmem_max file. The minimum (doubled) value for this option is 2048. [...] Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") Co-developed-by: Jan Karcher Signed-off-by: Jan Karcher Reviewed-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- net/smc/smc.h | 2 +- net/smc/smc_clc.c | 4 ++-- net/smc/smc_core.c | 25 ++++++++++++------------- net/smc/smc_sysctl.c | 10 ++++++++-- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0c013d2b5d8f..5b878e523abf 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -378,8 +378,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); - WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); + WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); diff --git a/net/smc/smc.h b/net/smc/smc.h index 2eeea4cdc718..1f2b912c43d1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -161,7 +161,7 @@ struct smc_connection { struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size_short;/* compressed notation */ + int rmbe_size_comp; /* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index b9b8b07aa702..c90d9e5dda54 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1007,7 +1007,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->d0.gid = conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd); clc->d0.token = conn->rmb_desc->token; - clc->d0.dmbe_size = conn->rmbe_size_short; + clc->d0.dmbe_size = conn->rmbe_size_comp; clc->d0.dmbe_idx = 0; memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); if (version == SMC_V1) { @@ -1050,7 +1050,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); break; } - clc->r0.rmbe_size = conn->rmbe_size_short; + clc->r0.rmbe_size = conn->rmbe_size_comp; clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : cpu_to_be64((u64)sg_dma_address diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3f465faf2b68..6b78075404d7 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2309,31 +2309,30 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; - int bufsize, bufsize_short; + int bufsize, bufsize_comp; struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; - int sk_buf_size; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf; + bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf; + bufsize = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); - bufsize_short >= 0; bufsize_short--) { + for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); + bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; - buf_list = &lgr->rmbs[bufsize_short]; + buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; - buf_list = &lgr->sndbufs[bufsize_short]; + buf_list = &lgr->sndbufs[bufsize_comp]; } - bufsize = smc_uncompress_bufsize(bufsize_short); + bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); @@ -2377,8 +2376,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize; + conn->rmbe_size_comp = bufsize_comp; + smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -2386,7 +2385,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize; + smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index b6f79fabb9d3..0b2a957ca5f5 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -21,6 +21,10 @@ static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int max_sndbuf = INT_MAX / 2; +static int max_rcvbuf = INT_MAX / 2; +static const int net_smc_wmem_init = (64 * 1024); +static const int net_smc_rmem_init = (64 * 1024); static struct ctl_table smc_table[] = { { @@ -53,6 +57,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, + .extra2 = &max_sndbuf, }, { .procname = "rmem", @@ -61,6 +66,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, + .extra2 = &max_rcvbuf, }, { } }; @@ -88,8 +94,8 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; - WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); - WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); + WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); return 0; From 30c3c4a4497c3765bf6b298f5072c8165aeaf7cc Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Fri, 4 Aug 2023 19:06:24 +0200 Subject: [PATCH 184/230] net/smc: Use correct buffer sizes when switching between TCP and SMC Tuning of the effective buffer size through setsockopts was working for SMC traffic only but not for TCP fall-back connections even before commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable"). That change made it apparent that TCP fall-back connections would use net.smc.[rw]mem as buffer size instead of net.ipv4_tcp_[rw]mem. Amend the code that copies attributes between the (TCP) clcsock and the SMC socket and adjust buffer sizes appropriately: - Copy over sk_userlocks so that both sockets agree on whether tuning via setsockopt is active. - When falling back to TCP use sk_sndbuf or sk_rcvbuf as specified with setsockopt. Otherwise, use the sysctl value for TCP/IPv4. - Likewise, use either values from setsockopt or from sysctl for SMC (duplicated) on successful SMC connect. In smc_tcp_listen_work() drop the explicit copy of buffer sizes as that is taken care of by the attribute copy. Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") Reviewed-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- net/smc/af_smc.c | 75 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5b878e523abf..f5834af5fad5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -436,24 +436,9 @@ out: return rc; } -static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, - unsigned long mask) -{ - /* options we don't get control via setsockopt for */ - nsk->sk_type = osk->sk_type; - nsk->sk_sndbuf = osk->sk_sndbuf; - nsk->sk_rcvbuf = osk->sk_rcvbuf; - nsk->sk_sndtimeo = osk->sk_sndtimeo; - nsk->sk_rcvtimeo = osk->sk_rcvtimeo; - nsk->sk_mark = READ_ONCE(osk->sk_mark); - nsk->sk_priority = osk->sk_priority; - nsk->sk_rcvlowat = osk->sk_rcvlowat; - nsk->sk_bound_dev_if = osk->sk_bound_dev_if; - nsk->sk_err = osk->sk_err; - - nsk->sk_flags &= ~mask; - nsk->sk_flags |= osk->sk_flags & mask; -} +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ @@ -470,9 +455,55 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, (1UL << SOCK_NOFCS) | \ (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) -/* copy only relevant settings and flags of SOL_SOCKET level from smc to - * clc socket (since smc is not called for these options from net/core) - */ + +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + struct net *nnet = sock_net(nsk); + + nsk->sk_userlocks = osk->sk_userlocks; + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + nsk->sk_sndbuf = osk->sk_sndbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_sndbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); + else + WRITE_ONCE(nsk->sk_sndbuf, + 2 * READ_ONCE(nnet->smc.sysctl_wmem)); + } + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + nsk->sk_rcvbuf = osk->sk_rcvbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_rcvbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); + else + WRITE_ONCE(nsk->sk_rcvbuf, + 2 * READ_ONCE(nnet->smc.sysctl_rmem)); + } +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndtimeo = osk->sk_sndtimeo; + nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_mark = READ_ONCE(osk->sk_mark); + nsk->sk_priority = osk->sk_priority; + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; + + smc_adjust_sock_bufsizes(nsk, osk, mask); +} + static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); @@ -2479,8 +2510,6 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; - new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); From 24138933b97b055d486e8064b4a1721702442a9b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Aug 2023 14:31:15 +0200 Subject: [PATCH 185/230] netfilter: nf_tables: don't skip expired elements during walk There is an asymmetry between commit/abort and preparation phase if the following conditions are met: 1. set is a verdict map ("1.2.3.4 : jump foo") 2. timeouts are enabled In this case, following sequence is problematic: 1. element E in set S refers to chain C 2. userspace requests removal of set S 3. kernel does a set walk to decrement chain->use count for all elements from preparation phase 4. kernel does another set walk to remove elements from the commit phase (or another walk to do a chain->use increment for all elements from abort phase) If E has already expired in 1), it will be ignored during list walk, so its use count won't have been changed. Then, when set is culled, ->destroy callback will zap the element via nf_tables_set_elem_destroy(), but this function is only safe for elements that have been deactivated earlier from the preparation phase: lack of earlier deactivate removes the element but leaks the chain use count, which results in a WARN splat when the chain gets removed later, plus a leak of the nft_chain structure. Update pipapo_get() not to skip expired elements, otherwise flush command reports bogus ENOENT errors. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++++ net/netfilter/nft_set_hash.c | 2 -- net/netfilter/nft_set_pipapo.c | 18 ++++++++++++------ net/netfilter/nft_set_rbtree.c | 2 -- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d3c6ecd1f5a6..b4321869e5c6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5602,8 +5602,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; + if (nft_set_elem_expired(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 0b73cb0e752f..24caa31fa231 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -278,8 +278,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 49915a2a58eb..d54784ea465b 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,8 +566,7 @@ next_match: goto out; if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext) || - (genmask && + if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -601,8 +600,17 @@ out: static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + struct nft_pipapo_elem *ret; + + ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); + if (IS_ERR(ret)) + return ret; + + if (nft_set_elem_expired(&ret->ext)) + return ERR_PTR(-ENOENT); + + return ret; } /** @@ -2005,8 +2013,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, goto cont; e = f->mt[r].e; - if (nft_set_elem_expired(&e->ext)) - goto cont; elem.priv = e; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 8d73fffd2d09..39956e5341c9 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -552,8 +552,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&rbe->ext)) - goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; From 6311071a056272e1e761de8d0305e87cc566f734 Mon Sep 17 00:00:00 2001 From: Keith Yeo Date: Mon, 31 Jul 2023 11:47:20 +0800 Subject: [PATCH 186/230] wifi: nl80211: fix integer overflow in nl80211_parse_mbssid_elems() nl80211_parse_mbssid_elems() uses a u8 variable num_elems to count the number of MBSSID elements in the nested netlink attribute attrs, which can lead to an integer overflow if a user of the nl80211 interface specifies 256 or more elements in the corresponding attribute in userspace. The integer overflow can lead to a heap buffer overflow as num_elems determines the size of the trailing array in elems, and this array is thereafter written to for each element in attrs. Note that this vulnerability only affects devices with the wiphy->mbssid_max_interfaces member set for the wireless physical device struct in the device driver, and can only be triggered by a process with CAP_NET_ADMIN capabilities. Fix this by checking for a maximum of 255 elements in attrs. Cc: stable@vger.kernel.org Fixes: dc1e3cb8da8b ("nl80211: MBSSID and EMA support in AP mode") Signed-off-by: Keith Yeo Link: https://lore.kernel.org/r/20230731034719.77206-1-keithyjy@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0da2e6a2a7ea..8bcf8e293308 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5430,8 +5430,11 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs) if (!wiphy->mbssid_max_interfaces) return ERR_PTR(-EINVAL); - nla_for_each_nested(nl_elems, attrs, rem_elems) + nla_for_each_nested(nl_elems, attrs, rem_elems) { + if (num_elems >= 255) + return ERR_PTR(-EINVAL); num_elems++; + } elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); if (!elems) From 06f2ab86a5b6ed55f013258de4be9319841853ea Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Wed, 9 Aug 2023 04:12:41 -0400 Subject: [PATCH 187/230] wifi: ath12k: Fix buffer overflow when scanning with extraie If cfg80211 is providing extraie's for a scanning process then ath12k will copy that over to the firmware. The extraie.len is a 32 bit value in struct element_info and describes the amount of bytes for the vendor information elements. The problem is the allocation of the buffer. It has to align the TLV sections by 4 bytes. But the code was using an u8 to store the newly calculated length of this section (with alignment). And the new calculated length was then used to allocate the skbuff. But the actual code to copy in the data is using the extraie.len and not the calculated "aligned" length. The length of extraie with IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS enabled was 264 bytes during tests with a wifi card. But it only allocated 8 bytes (264 bytes % 256) for it. As consequence, the code to memcpy the extraie into the skb was then just overwriting data after skb->end. Things like shinfo were therefore corrupted. This could usually be seen by a crash in skb_zcopy_clear which tried to call a ubuf_info callback (using a bogus address). Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20230809081241.32765-1-quic_wgong@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath12k/wmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 6512267ae4ca..4928e4e91660 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -2144,8 +2144,7 @@ int ath12k_wmi_send_scan_start_cmd(struct ath12k *ar, struct wmi_tlv *tlv; void *ptr; int i, ret, len; - u32 *tmp_ptr; - u8 extraie_len_with_pad = 0; + u32 *tmp_ptr, extraie_len_with_pad = 0; struct ath12k_wmi_hint_short_ssid_arg *s_ssid = NULL; struct ath12k_wmi_hint_bssid_arg *hint_bssid = NULL; From 77245f1c3c6495521f6a3af082696ee2f8ce3921 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 5 Aug 2023 00:06:43 +0200 Subject: [PATCH 188/230] x86/CPU/AMD: Do not leak quotient data after a division by 0 Under certain circumstances, an integer division by 0 which faults, can leave stale quotient data from a previous division operation on Zen1 microarchitectures. Do a dummy division 0/1 before returning from the #DE exception handler in order to avoid any leaks of potentially sensitive data. Signed-off-by: Borislav Petkov (AMD) Cc: Signed-off-by: Linus Torvalds --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/amd.c | 19 +++++++++++++++++++ arch/x86/kernel/traps.c | 2 ++ 4 files changed, 24 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 094f88fee536..b69b0d7756aa 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -495,4 +495,5 @@ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7c67db7c9f53..973db0406528 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -683,10 +683,12 @@ extern u16 get_llc_id(unsigned int cpu); extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); extern bool cpu_has_ibpb_brtype_microcode(void); +extern void amd_clear_divider(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } +static inline void amd_clear_divider(void) { } #endif extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1e1e253038ce..b55d8f82b621 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -75,6 +75,10 @@ static const int amd_zenbleed[] = AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); +static const int amd_div0[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), + AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); + static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { int osvw_id = *erratum++; @@ -1130,6 +1134,11 @@ static void init_amd(struct cpuinfo_x86 *c) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); zenbleed_check(c); + + if (cpu_has_amd_erratum(c, amd_div0)) { + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); + } } #ifdef CONFIG_X86_32 @@ -1309,3 +1318,13 @@ bool cpu_has_ibpb_brtype_microcode(void) return false; } } + +/* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. + */ +void noinstr amd_clear_divider(void) +{ + asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) + :: "a" (0), "d" (0), "r" (1)); +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4a817d20ce3b..1885326a8f65 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -206,6 +206,8 @@ DEFINE_IDTENTRY(exc_divide_error) { do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); + + amd_clear_divider(); } DEFINE_IDTENTRY(exc_overflow) From 22883973244b1caaa26f9c6171a41ba843c8d4bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 9 Aug 2023 17:46:00 +0300 Subject: [PATCH 189/230] mm: Fix access_remote_vm() regression on tagged addresses GDB uses /proc/PID/mem to access memory of the target process. GDB doesn't untag addresses manually, but relies on kernel to do the right thing. mem_rw() of procfs uses access_remote_vm() to get data from the target process. It worked fine until recent changes in __access_remote_vm() that now checks if there's VMA at target address using raw address. Untag the address before looking up the VMA. Signed-off-by: Kirill A. Shutemov Reported-by: Christina Schimpe Fixes: eee9c708cc89 ("gup: avoid stack expansion warning for known-good case") Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 603b2f419948..1ec1ef3418bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5705,6 +5705,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, if (mmap_read_lock_killable(mm)) return 0; + /* Untag the address before looking up the VMA */ + addr = untagged_addr_remote(mm, addr); + /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; From cacc6e22932f373a91d7be55a9b992dc77f4c59b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 7 Aug 2023 23:12:29 -0500 Subject: [PATCH 190/230] tpm: Add a helper for checking hwrng enabled The same checks are repeated in three places to decide whether to use hwrng. Consolidate these into a helper. Also this fixes a case that one of them was missing a check in the cleanup path. Fixes: 554b841d4703 ("tpm: Disable RNG for all AMD fTPMs") Signed-off-by: Mario Limonciello Signed-off-by: Linus Torvalds --- drivers/char/tpm/tpm-chip.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index e904aae9771b..ea6b4013bc38 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -521,10 +521,20 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) return tpm_get_random(chip, data, max); } +static bool tpm_is_hwrng_enabled(struct tpm_chip *chip) +{ + if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM)) + return false; + if (tpm_is_firmware_upgrade(chip)) + return false; + if (chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED) + return false; + return true; +} + static int tpm_add_hwrng(struct tpm_chip *chip) { - if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || - chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED) + if (!tpm_is_hwrng_enabled(chip)) return 0; snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), @@ -629,7 +639,7 @@ int tpm_chip_register(struct tpm_chip *chip) return 0; out_hwrng: - if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip)) + if (tpm_is_hwrng_enabled(chip)) hwrng_unregister(&chip->hwrng); out_ppi: tpm_bios_log_teardown(chip); @@ -654,8 +664,7 @@ EXPORT_SYMBOL_GPL(tpm_chip_register); void tpm_chip_unregister(struct tpm_chip *chip) { tpm_del_legacy_sysfs(chip); - if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip) && - !(chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED)) + if (tpm_is_hwrng_enabled(chip)) hwrng_unregister(&chip->hwrng); tpm_bios_log_teardown(chip); if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip)) From 718cb09aaa6fa78cc8124e9517efbc6c92665384 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 8 Aug 2023 11:35:21 +0200 Subject: [PATCH 191/230] vlan: Fix VLAN 0 memory leak The referenced commit intended to fix memleak of VLAN 0 that is implicitly created on devices with NETIF_F_HW_VLAN_CTAG_FILTER feature. However, it doesn't take into account that the feature can be re-set during the netdevice lifetime which will cause memory leak if feature is disabled during the device deletion as illustrated by [0]. Fix the leak by unconditionally deleting VLAN 0 on NETDEV_DOWN event. [0]: > modprobe 8021q > ip l set dev eth2 up > ethtool -K eth2 rx-vlan-filter off > modprobe -r mlx5_ib > modprobe -r mlx5_core > cat /sys/kernel/debug/kmemleak unreferenced object 0xffff888103dcd900 (size 256): comm "ip", pid 1490, jiffies 4294907305 (age 325.364s) hex dump (first 32 bytes): 00 80 5d 03 81 88 ff ff 00 00 00 00 00 00 00 00 ..]............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000899f3bb9>] kmalloc_trace+0x25/0x80 [<000000002889a7a2>] vlan_vid_add+0xa0/0x210 [<000000007177800e>] vlan_device_event+0x374/0x760 [8021q] [<000000009a0716b1>] notifier_call_chain+0x35/0xb0 [<00000000bbf3d162>] __dev_notify_flags+0x58/0xf0 [<0000000053d2b05d>] dev_change_flags+0x4d/0x60 [<00000000982807e9>] do_setlink+0x28d/0x10a0 [<0000000058c1be00>] __rtnl_newlink+0x545/0x980 [<00000000e66c3bd9>] rtnl_newlink+0x44/0x70 [<00000000a2cc5970>] rtnetlink_rcv_msg+0x29c/0x390 [<00000000d307d1e4>] netlink_rcv_skb+0x54/0x100 [<00000000259d16f9>] netlink_unicast+0x1f6/0x2c0 [<000000007ce2afa1>] netlink_sendmsg+0x232/0x4a0 [<00000000f3f4bb39>] sock_sendmsg+0x38/0x60 [<000000002f9c0624>] ____sys_sendmsg+0x1e3/0x200 [<00000000d6ff5520>] ___sys_sendmsg+0x80/0xc0 unreferenced object 0xffff88813354fde0 (size 32): comm "ip", pid 1490, jiffies 4294907305 (age 325.364s) hex dump (first 32 bytes): a0 d9 dc 03 81 88 ff ff a0 d9 dc 03 81 88 ff ff ................ 81 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000899f3bb9>] kmalloc_trace+0x25/0x80 [<000000002da64724>] vlan_vid_add+0xdf/0x210 [<000000007177800e>] vlan_device_event+0x374/0x760 [8021q] [<000000009a0716b1>] notifier_call_chain+0x35/0xb0 [<00000000bbf3d162>] __dev_notify_flags+0x58/0xf0 [<0000000053d2b05d>] dev_change_flags+0x4d/0x60 [<00000000982807e9>] do_setlink+0x28d/0x10a0 [<0000000058c1be00>] __rtnl_newlink+0x545/0x980 [<00000000e66c3bd9>] rtnl_newlink+0x44/0x70 [<00000000a2cc5970>] rtnetlink_rcv_msg+0x29c/0x390 [<00000000d307d1e4>] netlink_rcv_skb+0x54/0x100 [<00000000259d16f9>] netlink_unicast+0x1f6/0x2c0 [<000000007ce2afa1>] netlink_sendmsg+0x232/0x4a0 [<00000000f3f4bb39>] sock_sendmsg+0x38/0x60 [<000000002f9c0624>] ____sys_sendmsg+0x1e3/0x200 [<00000000d6ff5520>] ___sys_sendmsg+0x80/0xc0 Fixes: efc73f4bbc23 ("net: Fix memory leak - vlan_info struct") Reviewed-by: Ido Schimmel Signed-off-by: Vlad Buslov Link: https://lore.kernel.org/r/20230808093521.1468929-1-vladbu@nvidia.com Signed-off-by: Jakub Kicinski --- net/8021q/vlan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index e40aa3e3641c..b3662119ddbc 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -384,8 +384,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } - if (event == NETDEV_DOWN && - (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + if (event == NETDEV_DOWN) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); From 913f60cacda73ccac8eead94983e5884c03e04cd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 10:52:31 +0300 Subject: [PATCH 192/230] nexthop: Fix infinite nexthop dump when using maximum nexthop ID A netlink dump callback can return a positive number to signal that more information needs to be dumped or zero to signal that the dump is complete. In the second case, the core netlink code will append the NLMSG_DONE message to the skb in order to indicate to user space that the dump is complete. The nexthop dump callback always returns a positive number if nexthops were filled in the provided skb, even if the dump is complete. This means that a dump will span at least two recvmsg() calls as long as nexthops are present. In the last recvmsg() call the dump callback will not fill in any nexthops because the previous call indicated that the dump should restart from the last dumped nexthop ID plus one. # ip nexthop add id 1 blackhole # strace -e sendto,recvmsg -s 5 ip nexthop sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOP, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691394315, nlmsg_pid=0}, {nh_family=AF_UNSPEC, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 36 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=36, nlmsg_type=RTM_NEWNEXTHOP, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394315, nlmsg_pid=343}, {nh_family=AF_INET, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}, [[{nla_len=8, nla_type=NHA_ID}, 1], {nla_len=4, nla_type=NHA_BLACKHOLE}]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36 id 1 blackhole recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394315, nlmsg_pid=343}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20 +++ exited with 0 +++ This behavior is both inefficient and buggy. If the last nexthop to be dumped had the maximum ID of 0xffffffff, then the dump will restart from 0 (0xffffffff + 1) and never end: # ip nexthop add id $((2**32-1)) blackhole # ip nexthop id 4294967295 blackhole id 4294967295 blackhole [...] Fix by adjusting the dump callback to return zero when the dump is complete. After the fix only one recvmsg() call is made and the NLMSG_DONE message is appended to the RTM_NEWNEXTHOP response: # ip nexthop add id $((2**32-1)) blackhole # strace -e sendto,recvmsg -s 5 ip nexthop sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOP, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691394080, nlmsg_pid=0}, {nh_family=AF_UNSPEC, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 56 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=36, nlmsg_type=RTM_NEWNEXTHOP, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394080, nlmsg_pid=342}, {nh_family=AF_INET, nh_scope=RT_SCOPE_UNIVERSE, nh_protocol=RTPROT_UNSPEC, nh_flags=0}, [[{nla_len=8, nla_type=NHA_ID}, 4294967295], {nla_len=4, nla_type=NHA_BLACKHOLE}]], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691394080, nlmsg_pid=342}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 56 id 4294967295 blackhole +++ exited with 0 +++ Note that if the NLMSG_DONE message cannot be appended because of size limitations, then another recvmsg() will be needed, but the core netlink code will not invoke the dump callback and simply reply with a NLMSG_DONE message since it knows that the callback previously returned zero. Add a test that fails before the fix: # ./fib_nexthops.sh -t basic [...] TEST: Maximum nexthop ID dump [FAIL] [...] And passes after it: # ./fib_nexthops.sh -t basic [...] TEST: Maximum nexthop ID dump [ OK ] [...] Fixes: ab84be7e54fc ("net: Initial nexthop code") Reported-by: Petr Machata Closes: https://lore.kernel.org/netdev/87sf91enuf.fsf@nvidia.com/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230808075233.3337922-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 6 +----- tools/testing/selftests/net/fib_nexthops.sh | 5 +++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f95142e56da0..179e50d8fe07 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3221,13 +3221,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) &rtm_dump_nexthop_cb, &filter); if (err < 0) { if (likely(skb->len)) - goto out; - goto out_err; + err = skb->len; } -out: - err = skb->len; -out_err: cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 0f5e88c8f4ff..10aa059b9f06 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1981,6 +1981,11 @@ basic() run_cmd "$IP link set dev lo up" + # Dump should not loop endlessly when maximum nexthop ID is configured. + run_cmd "$IP nexthop add id $((2**32-1)) blackhole" + run_cmd "timeout 5 $IP nexthop" + log_test $? 0 "Maximum nexthop ID dump" + # # groups # From f10d3d9df49d9e6ee244fda6ca264f901a9c5d85 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 10:52:32 +0300 Subject: [PATCH 193/230] nexthop: Make nexthop bucket dump more efficient rtm_dump_nexthop_bucket_nh() is used to dump nexthop buckets belonging to a specific resilient nexthop group. The function returns a positive return code (the skb length) upon both success and failure. The above behavior is problematic. When a complete nexthop bucket dump is requested, the function that walks the different nexthops treats the non-zero return code as an error. This causes buckets belonging to different resilient nexthop groups to be dumped using different buffers even if they can all fit in the same buffer: # ip link add name dummy1 up type dummy # ip nexthop add id 1 dev dummy1 # ip nexthop add id 10 group 1 type resilient buckets 1 # ip nexthop add id 20 group 1 type resilient buckets 1 # strace -e recvmsg -s 0 ip nexthop bucket [...] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 64 id 10 index 0 idle_time 10.27 nhid 1 [...] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 64 id 20 index 0 idle_time 6.44 nhid 1 [...] Fix by only returning a non-zero return code when an error occurred and restarting the dump from the bucket index we failed to fill in. This allows buckets belonging to different resilient nexthop groups to be dumped using the same buffer: # ip link add name dummy1 up type dummy # ip nexthop add id 1 dev dummy1 # ip nexthop add id 10 group 1 type resilient buckets 1 # ip nexthop add id 20 group 1 type resilient buckets 1 # strace -e recvmsg -s 0 ip nexthop bucket [...] recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[...], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 128 id 10 index 0 idle_time 30.21 nhid 1 id 20 index 0 idle_time 26.7 nhid 1 [...] While this change is more of a performance improvement change than an actual bug fix, it is a prerequisite for a subsequent patch that does fix a bug. Fixes: 8a1bbabb034d ("nexthop: Add netlink handlers for bucket dump") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230808075233.3337922-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 179e50d8fe07..f365a4f63899 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3363,25 +3363,19 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, dd->filter.res_bucket_nh_id != nhge->nh->id) continue; + dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); - if (err < 0) { - if (likely(skb->len)) - goto out; - goto out_err; - } + if (err) + return err; } dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; - bucket_index = 0; + dd->ctx->bucket_index = 0; -out: - err = skb->len; -out_err: - dd->ctx->bucket_index = bucket_index; - return err; + return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, From 8743aeff5bc4dcb5b87b43765f48d5ac3ad7dd9f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 10:52:33 +0300 Subject: [PATCH 194/230] nexthop: Fix infinite nexthop bucket dump when using maximum nexthop ID A netlink dump callback can return a positive number to signal that more information needs to be dumped or zero to signal that the dump is complete. In the second case, the core netlink code will append the NLMSG_DONE message to the skb in order to indicate to user space that the dump is complete. The nexthop bucket dump callback always returns a positive number if nexthop buckets were filled in the provided skb, even if the dump is complete. This means that a dump will span at least two recvmsg() calls as long as nexthop buckets are present. In the last recvmsg() call the dump callback will not fill in any nexthop buckets because the previous call indicated that the dump should restart from the last dumped nexthop ID plus one. # ip link add name dummy1 up type dummy # ip nexthop add id 1 dev dummy1 # ip nexthop add id 10 group 1 type resilient buckets 2 # strace -e sendto,recvmsg -s 5 ip nexthop bucket sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOPBUCKET, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691396980, nlmsg_pid=0}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 128 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 128 id 10 index 0 idle_time 6.66 nhid 1 id 10 index 1 idle_time 6.66 nhid 1 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 20 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396980, nlmsg_pid=347}, 0], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20 +++ exited with 0 +++ This behavior is both inefficient and buggy. If the last nexthop to be dumped had the maximum ID of 0xffffffff, then the dump will restart from 0 (0xffffffff + 1) and never end: # ip link add name dummy1 up type dummy # ip nexthop add id 1 dev dummy1 # ip nexthop add id $((2**32-1)) group 1 type resilient buckets 2 # ip nexthop bucket id 4294967295 index 0 idle_time 5.55 nhid 1 id 4294967295 index 1 idle_time 5.55 nhid 1 id 4294967295 index 0 idle_time 5.55 nhid 1 id 4294967295 index 1 idle_time 5.55 nhid 1 [...] Fix by adjusting the dump callback to return zero when the dump is complete. After the fix only one recvmsg() call is made and the NLMSG_DONE message is appended to the RTM_NEWNEXTHOPBUCKET responses: # ip link add name dummy1 up type dummy # ip nexthop add id 1 dev dummy1 # ip nexthop add id $((2**32-1)) group 1 type resilient buckets 2 # strace -e sendto,recvmsg -s 5 ip nexthop bucket sendto(3, [[{nlmsg_len=24, nlmsg_type=RTM_GETNEXTHOPBUCKET, nlmsg_flags=NLM_F_REQUEST|NLM_F_DUMP, nlmsg_seq=1691396737, nlmsg_pid=0}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], {nlmsg_len=0, nlmsg_type=0 /* NLMSG_??? */, nlmsg_flags=0, nlmsg_seq=0, nlmsg_pid=0}], 152, 0, NULL, 0) = 152 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 148 recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[[{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=64, nlmsg_type=RTM_NEWNEXTHOPBUCKET, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, {family=AF_UNSPEC, data="\x00\x00\x00\x00\x00"...}], [{nlmsg_len=20, nlmsg_type=NLMSG_DONE, nlmsg_flags=NLM_F_MULTI, nlmsg_seq=1691396737, nlmsg_pid=350}, 0]], iov_len=32768}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 148 id 4294967295 index 0 idle_time 6.61 nhid 1 id 4294967295 index 1 idle_time 6.61 nhid 1 +++ exited with 0 +++ Note that if the NLMSG_DONE message cannot be appended because of size limitations, then another recvmsg() will be needed, but the core netlink code will not invoke the dump callback and simply reply with a NLMSG_DONE message since it knows that the callback previously returned zero. Add a test that fails before the fix: # ./fib_nexthops.sh -t basic_res [...] TEST: Maximum nexthop ID dump [FAIL] [...] And passes after it: # ./fib_nexthops.sh -t basic_res [...] TEST: Maximum nexthop ID dump [ OK ] [...] Fixes: 8a1bbabb034d ("nexthop: Add netlink handlers for bucket dump") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230808075233.3337922-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 6 +----- tools/testing/selftests/net/fib_nexthops.sh | 5 +++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f365a4f63899..be5498f5dd31 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3424,13 +3424,9 @@ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, if (err < 0) { if (likely(skb->len)) - goto out; - goto out_err; + err = skb->len; } -out: - err = skb->len; -out_err: cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 10aa059b9f06..df8d90b51867 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -2206,6 +2206,11 @@ basic_res() run_cmd "$IP nexthop bucket list fdb" log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword" + # Dump should not loop endlessly when maximum nexthop ID is configured. + run_cmd "$IP nexthop add id $((2**32-1)) group 1/2 type resilient buckets 4" + run_cmd "timeout 5 $IP nexthop bucket" + log_test $? 0 "Maximum nexthop ID dump" + # # resilient nexthop buckets get requests # From 8a70ed9520c5fafaac91053cacdd44625c39e188 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2023 08:49:23 +0000 Subject: [PATCH 195/230] tcp: add missing family to tcp_set_ca_state() tracepoint Before this code is copied, add the missing family, as we did in commit 3dd344ea84e1 ("net: tracepoint: exposing sk_family in all tcp:tracepoints") Fixes: 15fcdf6ae116 ("tcp: Add tracepoint for tcp_set_ca_state") Signed-off-by: Eric Dumazet Cc: Ping Gan Cc: Manjusaka Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230808084923.2239142-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index bf06db8d2046..7b1ddffa3dfc 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -381,6 +381,7 @@ TRACE_EVENT(tcp_cong_state_set, __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) @@ -396,6 +397,7 @@ TRACE_EVENT(tcp_cong_state_set, __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; @@ -409,7 +411,8 @@ TRACE_EVENT(tcp_cong_state_set, __entry->cong_state = ca_state; ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", + TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", + show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, From d72c83b1e4b4a36a38269c77a85ff52f95eb0d08 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:47 +0300 Subject: [PATCH 196/230] selftests: forwarding: Skip test when no interfaces are specified As explained in [1], the forwarding selftests are meant to be run with either physical loopbacks or veth pairs. The interfaces are expected to be specified in a user-provided forwarding.config file or as command line arguments. By default, this file is not present and the tests fail: # make -C tools/testing/selftests TARGETS=net/forwarding run_tests [...] TAP version 13 1..102 # timeout set to 45 # selftests: net/forwarding: bridge_igmp.sh # Command line is not complete. Try option "help" # Failed to create netif not ok 1 selftests: net/forwarding: bridge_igmp.sh # exit=1 [...] Fix by skipping a test if interfaces are not provided either via the configuration file or command line arguments. # make -C tools/testing/selftests TARGETS=net/forwarding run_tests [...] TAP version 13 1..102 # timeout set to 45 # selftests: net/forwarding: bridge_igmp.sh # SKIP: Cannot create interface. Name not specified ok 1 selftests: net/forwarding: bridge_igmp.sh # SKIP [1] tools/testing/selftests/net/forwarding/README Fixes: 81573b18f26d ("selftests/net/forwarding: add Makefile to install tests") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/856d454e-f83c-20cf-e166-6dc06cbc1543@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 9ddb68dd6a08..975fc5168c63 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -225,6 +225,11 @@ create_netif_veth() for ((i = 1; i <= NUM_NETIFS; ++i)); do local j=$((i+1)) + if [ -z ${NETIFS[p$i]} ]; then + echo "SKIP: Cannot create interface. Name not specified" + exit $ksft_skip + fi + ip link show dev ${NETIFS[p$i]} &> /dev/null if [[ $? -ne 0 ]]; then ip link add ${NETIFS[p$i]} type veth \ From 0529883ad102f6c04e19fb7018f31e1bda575bbe Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:48 +0300 Subject: [PATCH 197/230] selftests: forwarding: Switch off timeout The default timeout for selftests is 45 seconds, but it is not enough for forwarding selftests which can takes minutes to finish depending on the number of tests cases: # make -C tools/testing/selftests TARGETS=net/forwarding run_tests TAP version 13 1..102 # timeout set to 45 # selftests: net/forwarding: bridge_igmp.sh # TEST: IGMPv2 report 239.10.10.10 [ OK ] # TEST: IGMPv2 leave 239.10.10.10 [ OK ] # TEST: IGMPv3 report 239.10.10.10 is_include [ OK ] # TEST: IGMPv3 report 239.10.10.10 include -> allow [ OK ] # not ok 1 selftests: net/forwarding: bridge_igmp.sh # TIMEOUT 45 seconds Fix by switching off the timeout and setting it to 0. A similar change was done for BPF selftests in commit 6fc5916cc256 ("selftests: bpf: Switch off timeout"). Fixes: 81573b18f26d ("selftests/net/forwarding: add Makefile to install tests") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/8d149f8c-818e-d141-a0ce-a6bae606bc22@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/net/forwarding/settings diff --git a/tools/testing/selftests/net/forwarding/settings b/tools/testing/selftests/net/forwarding/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/net/forwarding/settings @@ -0,0 +1 @@ +timeout=0 From ab2eda04e2c2116910b9d77ccc82e727efa71d49 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:49 +0300 Subject: [PATCH 198/230] selftests: forwarding: bridge_mdb: Check iproute2 version The selftest relies on iproute2 changes present in version 6.3, but the test does not check for it, resulting in error: # ./bridge_mdb.sh INFO: # Host entries configuration tests TEST: Common host entries configuration tests (IPv4) [FAIL] Managed to add IPv4 host entry with a filter mode TEST: Common host entries configuration tests (IPv6) [FAIL] Managed to add IPv6 host entry with a filter mode TEST: Common host entries configuration tests (L2) [FAIL] Managed to add L2 host entry with a filter mode INFO: # Port group entries configuration tests - (*, G) Command "replace" is unknown, try "bridge mdb help". [...] Fix by skipping the test if iproute2 is too old. Fixes: b6d00da08610 ("selftests: forwarding: Add bridge MDB test") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/6b04b2ba-2372-6f6b-3ac8-b7cba1cfae83@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/bridge_mdb.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index ae3f9462a2b6..6f830b5f03c9 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -1206,6 +1206,11 @@ ctrl_test() ctrl_mldv2_is_in_test } +if ! bridge mdb help 2>&1 | grep -q "replace"; then + echo "SKIP: iproute2 too old, missing bridge mdb replace support" + exit $ksft_skip +fi + trap cleanup EXIT setup_prepare From 6bdf3d9765f4e0eebfd919e70acc65bce5daa9b9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:50 +0300 Subject: [PATCH 199/230] selftests: forwarding: bridge_mdb_max: Check iproute2 version The selftest relies on iproute2 changes present in version 6.3, but the test does not check for it, resulting in errors: # ./bridge_mdb_max.sh INFO: 802.1d tests TEST: cfg4: port: ngroups reporting [FAIL] Number of groups was null, now is null, but 5 expected TEST: ctl4: port: ngroups reporting [FAIL] Number of groups was null, now is null, but 5 expected TEST: cfg6: port: ngroups reporting [FAIL] Number of groups was null, now is null, but 5 expected [...] Fix by skipping the test if iproute2 is too old. Fixes: 3446dcd7df05 ("selftests: forwarding: bridge_mdb_max: Add a new selftest") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/6b04b2ba-2372-6f6b-3ac8-b7cba1cfae83@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-5-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/bridge_mdb_max.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh index ae255b662ba3..fa762b716288 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh @@ -1328,6 +1328,11 @@ test_8021qvs() switch_destroy } +if ! bridge link help 2>&1 | grep -q "mcast_max_groups"; then + echo "SKIP: iproute2 too old, missing bridge \"mcast_max_groups\" support" + exit $ksft_skip +fi + trap cleanup EXIT setup_prepare From 38f7c44d6e760a8513557e27340d61b820c91b8f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:51 +0300 Subject: [PATCH 200/230] selftests: forwarding: Set default IPv6 traceroute utility The test uses the 'TROUTE6' environment variable to encode the name of the IPv6 traceroute utility. By default (without a configuration file), this variable is not set, resulting in failures: # ./ip6_forward_instats_vrf.sh TEST: ping6 [ OK ] TEST: Ip6InTooBigErrors [ OK ] TEST: Ip6InHdrErrors [FAIL] TEST: Ip6InAddrErrors [ OK ] TEST: Ip6InDiscards [ OK ] Fix by setting a default utility name and skip the test if the utility is not present. Fixes: 0857d6f8c759 ("ipv6: When forwarding count rx stats on the orig netdev") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-6-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh | 2 ++ tools/testing/selftests/net/forwarding/lib.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh index 9f5b3e2e5e95..49fa94b53a1c 100755 --- a/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh +++ b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh @@ -14,6 +14,8 @@ ALL_TESTS=" NUM_NETIFS=4 source lib.sh +require_command $TROUTE6 + h1_create() { simple_if_init $h1 2001:1:1::2/64 diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 975fc5168c63..40a8c1541b7f 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -30,6 +30,7 @@ REQUIRE_MZ=${REQUIRE_MZ:=yes} REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no} STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no} TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=} +TROUTE6=${TROUTE6:=traceroute6} relative_path="${BASH_SOURCE%/*}" if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then From 66e131861ab7bf754b50813216f5c6885cd32d63 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:52 +0300 Subject: [PATCH 201/230] selftests: forwarding: Add a helper to skip test when using veth pairs A handful of tests require physical loopbacks to be used instead of veth pairs. Add a helper that these tests will invoke in order to be skipped when executed with veth pairs. Fixes: 64916b57c0b1 ("selftests: forwarding: Add speed and auto-negotiation test") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-7-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 40a8c1541b7f..f69015bf2dea 100755 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -164,6 +164,17 @@ check_port_mab_support() fi } +skip_on_veth() +{ + local kind=$(ip -j -d link show dev ${NETIFS[p1]} | + jq -r '.[].linkinfo.info_kind') + + if [[ $kind == veth ]]; then + echo "SKIP: Test cannot be run with veth pairs" + exit $ksft_skip + fi +} + if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" exit $ksft_skip From 60a36e21915c31c0375d9427be9406aa8ce2ec34 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:53 +0300 Subject: [PATCH 202/230] selftests: forwarding: ethtool: Skip when using veth pairs Auto-negotiation cannot be tested with veth pairs, resulting in failures: # ./ethtool.sh TEST: force of same speed autoneg off [FAIL] error in configuration. swp1 speed Not autoneg off [...] Fix by skipping the test when used with veth pairs. Fixes: 64916b57c0b1 ("selftests: forwarding: Add speed and auto-negotiation test") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-8-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/ethtool.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/net/forwarding/ethtool.sh index dbb9fcf759e0..aa2eafb7b243 100755 --- a/tools/testing/selftests/net/forwarding/ethtool.sh +++ b/tools/testing/selftests/net/forwarding/ethtool.sh @@ -286,6 +286,8 @@ different_speeds_autoneg_on() ethtool -s $h1 autoneg on } +skip_on_veth + trap cleanup EXIT setup_prepare From b3d9305e60d121dac20a77b6847c4cf14a4c0001 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:54 +0300 Subject: [PATCH 203/230] selftests: forwarding: ethtool_extended_state: Skip when using veth pairs Ethtool extended state cannot be tested with veth pairs, resulting in failures: # ./ethtool_extended_state.sh TEST: Autoneg, No partner detected [FAIL] Expected "Autoneg", got "Link detected: no" [...] Fix by skipping the test when used with veth pairs. Fixes: 7d10bcce98cd ("selftests: forwarding: Add tests for ethtool extended state") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-9-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/forwarding/ethtool_extended_state.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh index 072faa77f53b..17f89c3b7c02 100755 --- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh +++ b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh @@ -108,6 +108,8 @@ no_cable() ip link set dev $swp3 down } +skip_on_veth + setup_prepare tests_run From 9a711cde07c245a163d95eee5b42ed1871e73236 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:55 +0300 Subject: [PATCH 204/230] selftests: forwarding: hw_stats_l3_gre: Skip when using veth pairs Layer 3 hardware stats cannot be used when the underlying interfaces are veth pairs, resulting in failures: # ./hw_stats_l3_gre.sh TEST: ping gre flat [ OK ] TEST: Test rx packets: [FAIL] Traffic not reflected in the counter: 0 -> 0 TEST: Test tx packets: [FAIL] Traffic not reflected in the counter: 0 -> 0 Fix by skipping the test when used with veth pairs. Fixes: 813f97a26860 ("selftests: forwarding: Add a tunnel-based test for L3 HW stats") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-10-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh b/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh index eb9ec4a68f84..7594bbb49029 100755 --- a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh +++ b/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh @@ -99,6 +99,8 @@ test_stats_rx() test_stats g2a rx } +skip_on_veth + trap cleanup EXIT setup_prepare From 23fb886a1ced6f885ddd541cc86d45c415ce705c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:56 +0300 Subject: [PATCH 205/230] selftests: forwarding: ethtool_mm: Skip when MAC Merge is not supported MAC Merge cannot be tested with veth pairs, resulting in failures: # ./ethtool_mm.sh [...] TEST: Manual configuration with verification: swp1 to swp2 [FAIL] Verification did not succeed Fix by skipping the test when the interfaces do not support MAC Merge. Fixes: e6991384ace5 ("selftests: forwarding: add a test for MAC Merge layer") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-11-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/ethtool_mm.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/ethtool_mm.sh b/tools/testing/selftests/net/forwarding/ethtool_mm.sh index c580ad623848..39e736f30322 100755 --- a/tools/testing/selftests/net/forwarding/ethtool_mm.sh +++ b/tools/testing/selftests/net/forwarding/ethtool_mm.sh @@ -258,11 +258,6 @@ h2_destroy() setup_prepare() { - check_ethtool_mm_support - check_tc_fp_support - require_command lldptool - bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" - h1=${NETIFS[p1]} h2=${NETIFS[p2]} @@ -278,6 +273,19 @@ cleanup() h1_destroy } +check_ethtool_mm_support +check_tc_fp_support +require_command lldptool +bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually" + +for netif in ${NETIFS[@]}; do + ethtool --show-mm $netif 2>&1 &> /dev/null + if [[ $? -ne 0 ]]; then + echo "SKIP: $netif does not support MAC Merge" + exit $ksft_skip + fi +done + trap cleanup EXIT setup_prepare From 5e8670610b93158ffacc3241f835454ff26a3469 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:57 +0300 Subject: [PATCH 206/230] selftests: forwarding: tc_actions: Use ncat instead of nc The test relies on 'nc' being the netcat version from the nmap project. While this seems to be the case on Fedora, it is not the case on Ubuntu, resulting in failures such as [1]. Fix by explicitly using the 'ncat' utility from the nmap project and the skip the test in case it is not installed. [1] # timeout set to 0 # selftests: net/forwarding: tc_actions.sh # TEST: gact drop and ok (skip_hw) [ OK ] # TEST: mirred egress flower redirect (skip_hw) [ OK ] # TEST: mirred egress flower mirror (skip_hw) [ OK ] # TEST: mirred egress matchall mirror (skip_hw) [ OK ] # TEST: mirred_egress_to_ingress (skip_hw) [ OK ] # nc: invalid option -- '-' # usage: nc [-46CDdFhklNnrStUuvZz] [-I length] [-i interval] [-M ttl] # [-m minttl] [-O length] [-P proxy_username] [-p source_port] # [-q seconds] [-s sourceaddr] [-T keyword] [-V rtable] [-W recvlimit] # [-w timeout] [-X proxy_protocol] [-x proxy_address[:port]] # [destination] [port] # nc: invalid option -- '-' # usage: nc [-46CDdFhklNnrStUuvZz] [-I length] [-i interval] [-M ttl] # [-m minttl] [-O length] [-P proxy_username] [-p source_port] # [-q seconds] [-s sourceaddr] [-T keyword] [-V rtable] [-W recvlimit] # [-w timeout] [-X proxy_protocol] [-x proxy_address[:port]] # [destination] [port] # TEST: mirred_egress_to_ingress_tcp (skip_hw) [FAIL] # server output check failed # INFO: Could not test offloaded functionality not ok 80 selftests: net/forwarding: tc_actions.sh # exit=1 Fixes: ca22da2fbd69 ("act_mirred: use the backlog for nested calls to mirred ingress") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-12-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/tc_actions.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index a96cff8e7219..b0f5e55d2d0b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -9,6 +9,8 @@ NUM_NETIFS=4 source tc_common.sh source lib.sh +require_command ncat + tcflags="skip_hw" h1_create() @@ -220,9 +222,9 @@ mirred_egress_to_ingress_tcp_test() ip_proto icmp \ action drop - ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 & + ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 & local rpid=$! - ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1 + ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1 wait -n $rpid cmp -s $mirred_e2i_tf1 $mirred_e2i_tf2 check_err $? "server output check failed" From 9ee37e53e7687654b487fc94e82569377272a7a8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:58 +0300 Subject: [PATCH 207/230] selftests: forwarding: tc_flower: Relax success criterion The test checks that filters that match on source or destination MAC were only hit once. A host can send more than one packet with a given source or destination MAC, resulting in failures. Fix by relaxing the success criterion and instead check that the filters were not hit zero times. Using tc_check_at_least_x_packets() is also an option, but it is not available in older kernels. Fixes: 07e5c75184a1 ("selftests: forwarding: Introduce tc flower matching tests") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-13-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/tc_flower.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_flower.sh b/tools/testing/selftests/net/forwarding/tc_flower.sh index 683711f41aa9..b1daad19b01e 100755 --- a/tools/testing/selftests/net/forwarding/tc_flower.sh +++ b/tools/testing/selftests/net/forwarding/tc_flower.sh @@ -52,8 +52,8 @@ match_dst_mac_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched on a wrong filter" - tc_check_packets "dev $h2 ingress" 102 1 - check_err $? "Did not match on correct filter" + tc_check_packets "dev $h2 ingress" 102 0 + check_fail $? "Did not match on correct filter" tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower @@ -78,8 +78,8 @@ match_src_mac_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched on a wrong filter" - tc_check_packets "dev $h2 ingress" 102 1 - check_err $? "Did not match on correct filter" + tc_check_packets "dev $h2 ingress" 102 0 + check_fail $? "Did not match on correct filter" tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower From 11604178fdc3404d46518e5332f1fe865d30c4a1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:14:59 +0300 Subject: [PATCH 208/230] selftests: forwarding: tc_tunnel_key: Make filters more specific The test installs filters that match on various IP fragments (e.g., no fragment, first fragment) and expects a certain amount of packets to hit each filter. This is problematic as the filters are not specific enough and can match IP packets (e.g., IGMP) generated by the stack, resulting in failures [1]. Fix by making the filters more specific and match on more fields in the IP header: Source IP, destination IP and protocol. [1] # timeout set to 0 # selftests: net/forwarding: tc_tunnel_key.sh # TEST: tunnel_key nofrag (skip_hw) [FAIL] # packet smaller than MTU was not tunneled # INFO: Could not test offloaded functionality not ok 89 selftests: net/forwarding: tc_tunnel_key.sh # exit=1 Fixes: 533a89b1940f ("selftests: forwarding: add tunnel_key "nofrag" test case") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Tested-by: Mirsad Todorovac Acked-by: Davide Caratti Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-14-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/tc_tunnel_key.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh index 5ac184d51809..5a5dd9034819 100755 --- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh +++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh @@ -104,11 +104,14 @@ tunnel_key_nofrag_test() local i tc filter add dev $swp1 ingress protocol ip pref 100 handle 100 \ - flower ip_flags nofrag action drop + flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \ + ip_flags nofrag action drop tc filter add dev $swp1 ingress protocol ip pref 101 handle 101 \ - flower ip_flags firstfrag action drop + flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \ + ip_flags firstfrag action drop tc filter add dev $swp1 ingress protocol ip pref 102 handle 102 \ - flower ip_flags nofirstfrag action drop + flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \ + ip_flags nofirstfrag action drop # test 'nofrag' set tc filter add dev h1-et egress protocol all pref 1 handle 1 matchall $tcflags \ From 21a72166abb9c9d13fe24a07cef0a0b9f1e331b0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:15:00 +0300 Subject: [PATCH 209/230] selftests: forwarding: tc_flower_l2_miss: Fix failing test with old libnet As explained in commit 8bcfb4ae4d97 ("selftests: forwarding: Fix failing tests with old libnet"), old versions of libnet (used by mausezahn) do not use the "SO_BINDTODEVICE" socket option. For IP unicast packets, this can be solved by prefixing mausezahn invocations with "ip vrf exec". However, IP multicast packets do not perform routing and simply egress the bound device, which does not exist in this case. Fix by specifying the source and destination MAC of the packet which will cause mausezahn to use a packet socket instead of an IP socket. Fixes: 8c33266ae26a ("selftests: forwarding: Add layer 2 miss test cases") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-15-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/tc_flower_l2_miss.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_flower_l2_miss.sh b/tools/testing/selftests/net/forwarding/tc_flower_l2_miss.sh index e22c2d28b6eb..20a7cb7222b8 100755 --- a/tools/testing/selftests/net/forwarding/tc_flower_l2_miss.sh +++ b/tools/testing/selftests/net/forwarding/tc_flower_l2_miss.sh @@ -127,6 +127,7 @@ test_l2_miss_multicast_common() local proto=$1; shift local sip=$1; shift local dip=$1; shift + local dmac=$1; shift local mode=$1; shift local name=$1; shift @@ -142,7 +143,7 @@ test_l2_miss_multicast_common() action pass # Before adding MDB entry. - $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q + $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q tc_check_packets "dev $swp2 egress" 101 1 check_err $? "Unregistered multicast filter was not hit before adding MDB entry" @@ -153,7 +154,7 @@ test_l2_miss_multicast_common() # Adding MDB entry. bridge mdb replace dev br1 port $swp2 grp $dip permanent - $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q + $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q tc_check_packets "dev $swp2 egress" 101 1 check_err $? "Unregistered multicast filter was hit after adding MDB entry" @@ -164,7 +165,7 @@ test_l2_miss_multicast_common() # Deleting MDB entry. bridge mdb del dev br1 port $swp2 grp $dip - $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q + $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q tc_check_packets "dev $swp2 egress" 101 2 check_err $? "Unregistered multicast filter was not hit after deleting MDB entry" @@ -183,10 +184,11 @@ test_l2_miss_multicast_ipv4() local proto="ipv4" local sip=192.0.2.1 local dip=239.1.1.1 + local dmac=01:00:5e:01:01:01 local mode="-4" local name="IPv4" - test_l2_miss_multicast_common $proto $sip $dip $mode $name + test_l2_miss_multicast_common $proto $sip $dip $dmac $mode $name } test_l2_miss_multicast_ipv6() @@ -194,10 +196,11 @@ test_l2_miss_multicast_ipv6() local proto="ipv6" local sip=2001:db8:1::1 local dip=ff0e::1 + local dmac=33:33:00:00:00:01 local mode="-6" local name="IPv6" - test_l2_miss_multicast_common $proto $sip $dip $mode $name + test_l2_miss_multicast_common $proto $sip $dip $dmac $mode $name } test_l2_miss_multicast() From e98e195d90cc93b1bf2ad762c7c274a40dab7173 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:15:01 +0300 Subject: [PATCH 210/230] selftests: forwarding: bridge_mdb: Fix failing test with old libnet As explained in commit 8bcfb4ae4d97 ("selftests: forwarding: Fix failing tests with old libnet"), old versions of libnet (used by mausezahn) do not use the "SO_BINDTODEVICE" socket option. For IP unicast packets, this can be solved by prefixing mausezahn invocations with "ip vrf exec". However, IP multicast packets do not perform routing and simply egress the bound device, which does not exist in this case. Fix by specifying the source and destination MAC of the packet which will cause mausezahn to use a packet socket instead of an IP socket. Fixes: b6d00da08610 ("selftests: forwarding: Add bridge MDB test") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-16-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/bridge_mdb.sh | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index 6f830b5f03c9..4853b8e4f8d3 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -850,6 +850,7 @@ cfg_test() __fwd_test_host_ip() { local grp=$1; shift + local dmac=$1; shift local src=$1; shift local mode=$1; shift local name @@ -872,27 +873,27 @@ __fwd_test_host_ip() # Packet should only be flooded to multicast router ports when there is # no matching MDB entry. The bridge is not configured as a multicast # router port. - $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q tc_check_packets "dev br0 ingress" 1 0 check_err $? "Packet locally received after flood" # Install a regular port group entry and expect the packet to not be # locally received. bridge mdb add dev br0 port $swp2 grp $grp temp vid 10 - $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q tc_check_packets "dev br0 ingress" 1 0 check_err $? "Packet locally received after installing a regular entry" # Add a host entry and expect the packet to be locally received. bridge mdb add dev br0 port br0 grp $grp temp vid 10 - $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q tc_check_packets "dev br0 ingress" 1 1 check_err $? "Packet not locally received after adding a host entry" # Remove the host entry and expect the packet to not be locally # received. bridge mdb del dev br0 port br0 grp $grp vid 10 - $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q tc_check_packets "dev br0 ingress" 1 1 check_err $? "Packet locally received after removing a host entry" @@ -905,8 +906,8 @@ __fwd_test_host_ip() fwd_test_host_ip() { - __fwd_test_host_ip "239.1.1.1" "192.0.2.1" "-4" - __fwd_test_host_ip "ff0e::1" "2001:db8:1::1" "-6" + __fwd_test_host_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "-4" + __fwd_test_host_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "-6" } fwd_test_host_l2() @@ -966,6 +967,7 @@ fwd_test_host() __fwd_test_port_ip() { local grp=$1; shift + local dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mode=$1; shift @@ -999,43 +1001,43 @@ __fwd_test_port_ip() vlan_ethtype $eth_type vlan_id 10 dst_ip $grp \ src_ip $invalid_src action drop - $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 1 0 check_err $? "Packet from valid source received on H2 before adding entry" - $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 2 0 check_err $? "Packet from invalid source received on H2 before adding entry" bridge mdb add dev br0 port $swp2 grp $grp vid 10 \ filter_mode $filter_mode source_list $src_list - $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 1 1 check_err $? "Packet from valid source not received on H2 after adding entry" - $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 2 0 check_err $? "Packet from invalid source received on H2 after adding entry" bridge mdb replace dev br0 port $swp2 grp $grp vid 10 \ filter_mode exclude - $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 1 2 check_err $? "Packet from valid source not received on H2 after allowing all sources" - $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 2 1 check_err $? "Packet from invalid source not received on H2 after allowing all sources" bridge mdb del dev br0 port $swp2 grp $grp vid 10 - $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 1 2 check_err $? "Packet from valid source received on H2 after deleting entry" - $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q + $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q tc_check_packets "dev $h2 ingress" 2 1 check_err $? "Packet from invalid source received on H2 after deleting entry" @@ -1047,11 +1049,11 @@ __fwd_test_port_ip() fwd_test_port_ip() { - __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "exclude" - __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \ + __fwd_test_port_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "192.0.2.2" "-4" "exclude" + __fwd_test_port_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "2001:db8:1::2" "-6" \ "exclude" - __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "include" - __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \ + __fwd_test_port_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "192.0.2.2" "-4" "include" + __fwd_test_port_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "2001:db8:1::2" "-6" \ "include" } @@ -1127,7 +1129,7 @@ ctrl_igmpv3_is_in_test() filter_mode include source_list 192.0.2.1 # IS_IN ( 192.0.2.2 ) - $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \ + $MZ $h1.10 -c 1 -a own -b 01:00:5e:01:01:01 -A 192.0.2.1 -B 239.1.1.1 \ -t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -q 192.0.2.2 @@ -1140,7 +1142,7 @@ ctrl_igmpv3_is_in_test() filter_mode include source_list 192.0.2.1 # IS_IN ( 192.0.2.2 ) - $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \ + $MZ $h1.10 -a own -b 01:00:5e:01:01:01 -c 1 -A 192.0.2.1 -B 239.1.1.1 \ -t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -v "src" | \ @@ -1167,7 +1169,7 @@ ctrl_mldv2_is_in_test() # IS_IN ( 2001:db8:1::2 ) local p=$(mldv2_is_in_get fe80::1 ff0e::1 2001:db8:1::2) - $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \ + $MZ -6 $h1.10 -a own -b 33:33:00:00:00:01 -c 1 -A fe80::1 -B ff0e::1 \ -t ip hop=1,next=0,p="$p" -q bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | \ @@ -1181,7 +1183,7 @@ ctrl_mldv2_is_in_test() filter_mode include source_list 2001:db8:1::1 # IS_IN ( 2001:db8:1::2 ) - $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \ + $MZ -6 $h1.10 -a own -b 33:33:00:00:00:01 -c 1 -A fe80::1 -B ff0e::1 \ -t ip hop=1,next=0,p="$p" -q bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | grep -v "src" | \ From cb034948ac292da82cc0e6bc1340f81be36e117d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:15:02 +0300 Subject: [PATCH 211/230] selftests: forwarding: bridge_mdb_max: Fix failing test with old libnet As explained in commit 8bcfb4ae4d97 ("selftests: forwarding: Fix failing tests with old libnet"), old versions of libnet (used by mausezahn) do not use the "SO_BINDTODEVICE" socket option. For IP unicast packets, this can be solved by prefixing mausezahn invocations with "ip vrf exec". However, IP multicast packets do not perform routing and simply egress the bound device, which does not exist in this case. Fix by specifying the source and destination MAC of the packet which will cause mausezahn to use a packet socket instead of an IP socket. Fixes: 3446dcd7df05 ("selftests: forwarding: bridge_mdb_max: Add a new selftest") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-17-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/bridge_mdb_max.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh index fa762b716288..3da9d93ab36f 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh @@ -252,7 +252,8 @@ ctl4_entries_add() local IPs=$(seq -f 192.0.2.%g 1 $((n - 1))) local peer=$(locus_dev_peer $locus) local GRP=239.1.1.${grp} - $MZ $peer -c 1 -A 192.0.2.1 -B $GRP \ + local dmac=01:00:5e:01:01:$(printf "%02x" $grp) + $MZ $peer -a own -b $dmac -c 1 -A 192.0.2.1 -B $GRP \ -t ip proto=2,p=$(igmpv3_is_in_get $GRP $IPs) -q sleep 1 @@ -272,7 +273,8 @@ ctl4_entries_del() local peer=$(locus_dev_peer $locus) local GRP=239.1.1.${grp} - $MZ $peer -c 1 -A 192.0.2.1 -B 224.0.0.2 \ + local dmac=01:00:5e:00:00:02 + $MZ $peer -a own -b $dmac -c 1 -A 192.0.2.1 -B 224.0.0.2 \ -t ip proto=2,p=$(igmpv2_leave_get $GRP) -q sleep 1 ! bridge mdb show dev br0 | grep -q $GRP @@ -289,8 +291,10 @@ ctl6_entries_add() local peer=$(locus_dev_peer $locus) local SIP=fe80::1 local GRP=ff0e::${grp} + local dmac=33:33:00:00:00:$(printf "%02x" $grp) local p=$(mldv2_is_in_get $SIP $GRP $IPs) - $MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q + $MZ -6 $peer -a own -b $dmac -c 1 -A $SIP -B $GRP \ + -t ip hop=1,next=0,p="$p" -q sleep 1 local nn=$(bridge mdb show dev br0 | grep $GRP | wc -l) @@ -310,8 +314,10 @@ ctl6_entries_del() local peer=$(locus_dev_peer $locus) local SIP=fe80::1 local GRP=ff0e::${grp} + local dmac=33:33:00:00:00:$(printf "%02x" $grp) local p=$(mldv1_done_get $SIP $GRP) - $MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q + $MZ -6 $peer -a own -b $dmac -c 1 -A $SIP -B $GRP \ + -t ip hop=1,next=0,p="$p" -q sleep 1 ! bridge mdb show dev br0 | grep -q $GRP } From 8b5ff37097775cdbd447442603957066dd2e4d02 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Aug 2023 17:15:03 +0300 Subject: [PATCH 212/230] selftests: forwarding: bridge_mdb: Make test more robust Some test cases check that the group timer is (or isn't) 0. Instead of grepping for "0.00" grep for " 0.00" as the former can also match "260.00" which is the default group membership interval. Fixes: b6d00da08610 ("selftests: forwarding: Add bridge MDB test") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/adc5e40d-d040-a65e-eb26-edf47dac5b02@alu.unizg.hr/ Signed-off-by: Ido Schimmel Tested-by: Mirsad Todorovac Reviewed-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230808141503.4060661-18-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/bridge_mdb.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index 4853b8e4f8d3..d0c6c499d5da 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -617,7 +617,7 @@ __cfg_test_port_ip_sg() grep -q "permanent" check_err $? "Entry not added as \"permanent\" when should" bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \ - grep -q "0.00" + grep -q " 0.00" check_err $? "\"permanent\" entry has a pending group timer" bridge mdb del dev br0 port $swp1 $grp_key vid 10 @@ -626,7 +626,7 @@ __cfg_test_port_ip_sg() grep -q "temp" check_err $? "Entry not added as \"temp\" when should" bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \ - grep -q "0.00" + grep -q " 0.00" check_fail $? "\"temp\" entry has an unpending group timer" bridge mdb del dev br0 port $swp1 $grp_key vid 10 @@ -659,7 +659,7 @@ __cfg_test_port_ip_sg() grep -q "permanent" check_err $? "Entry not marked as \"permanent\" after replace" bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \ - grep -q "0.00" + grep -q " 0.00" check_err $? "Entry has a pending group timer after replace" bridge mdb replace dev br0 port $swp1 $grp_key vid 10 temp @@ -667,7 +667,7 @@ __cfg_test_port_ip_sg() grep -q "temp" check_err $? "Entry not marked as \"temp\" after replace" bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \ - grep -q "0.00" + grep -q " 0.00" check_fail $? "Entry has an unpending group timer after replace" bridge mdb del dev br0 port $swp1 $grp_key vid 10 From 048c796beb6eb4fa3a5a647ee1c81f5c6f0f6a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Mon, 7 Aug 2023 03:25:32 -0700 Subject: [PATCH 213/230] ipv6: adjust ndisc_is_useropt() to also return true for PIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upcoming (and nearly finalized): https://datatracker.ietf.org/doc/draft-collink-6man-pio-pflag/ will update the IPv6 RA to include a new flag in the PIO field, which will serve as a hint to perform DHCPv6-PD. As we don't want DHCPv6 related logic inside the kernel, this piece of information needs to be exposed to userspace. The simplest option is to simply expose the entire PIO through the already existing mechanism. Even without this new flag, the already existing PIO R (router address) flag (from RFC6275) cannot AFAICT be handled entirely in kernel, and provides useful information that should be exposed to userspace (the router's global address, for use by Mobile IPv6). Also cc'ing stable@ for inclusion in LTS, as while technically this is not quite a bugfix, and instead more of a feature, it is absolutely trivial and the alternative is manually cherrypicking into all Android Common Kernel trees - and I know Greg will ask for it to be sent in via LTS instead... Cc: Jen Linkova Cc: Lorenzo Colitti Cc: David Ahern Cc: YOSHIFUJI Hideaki / 吉藤英明 Cc: stable@vger.kernel.org Signed-off-by: Maciej Żenczykowski Link: https://lore.kernel.org/r/20230807102533.1147559-1-maze@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ndisc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 18634ebd20a4..a42be96ae209 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -197,7 +197,8 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { - return opt->nd_opt_type == ND_OPT_RDNSS || + return opt->nd_opt_type == ND_OPT_PREFIX_INFO || + opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64 || From 85c2c79a07302fe68a1ad5cc449458cc559e314d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 9 Aug 2023 16:28:43 +0200 Subject: [PATCH 214/230] xsk: fix refcount underflow in error path Fix a refcount underflow problem reported by syzbot that can happen when a system is running out of memory. If xp_alloc_tx_descs() fails, and it can only fail due to not having enough memory, then the error path is triggered. In this error path, the refcount of the pool is decremented as it has incremented before. However, the reference to the pool in the socket was not nulled. This means that when the socket is closed later, the socket teardown logic will think that there is a pool attached to the socket and try to decrease the refcount again, leading to a refcount underflow. I chose this fix as it involved adding just a single line. Another option would have been to move xp_get_pool() and the assignment of xs->pool to after the if-statement and using xs_umem->pool instead of xs->pool in the whole if-statement resulting in somewhat simpler code, but this would have led to much more churn in the code base perhaps making it harder to backport. Fixes: ba3beec2ec1d ("xsk: Fix possible crash when multiple sockets are created") Reported-by: syzbot+8ada0057e69293a05fd4@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Link: https://lore.kernel.org/r/20230809142843.13944-1-magnus.karlsson@gmail.com Signed-off-by: Martin KaFai Lau --- net/xdp/xsk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b89adb52a977..10ea85c03147 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -994,6 +994,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = xp_alloc_tx_descs(xs->pool, xs); if (err) { xp_put_pool(xs->pool); + xs->pool = NULL; sockfd_put(sock); goto out_unlock; } From 7e96ec0e6605b69bb21bbf6c0ff9051e656ec2b1 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 4 Aug 2023 03:37:37 -0400 Subject: [PATCH 215/230] bpf, sockmap: Fix map type error in sock_map_del_link sock_map_del_link() operates on both SOCKMAP and SOCKHASH, although both types have member named "progs", the offset of "progs" member in these two types is different, so "progs" should be accessed with the real map type. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Xu Kuohai Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20230804073740.194770-2-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau --- net/core/sock_map.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 08ab108206bf..8f07fea39d9e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -146,13 +146,13 @@ static void sock_map_del_link(struct sock *sk, list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->saved_data_ready && stab->progs.stream_parser) + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.stream_verdict) + if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); From 809e4dc71a0f2b8d2836035d98603694fff11d5d Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 4 Aug 2023 03:37:38 -0400 Subject: [PATCH 216/230] bpf, sockmap: Fix bug that strp_done cannot be called strp_done is only called when psock->progs.stream_parser is not NULL, but stream_parser was set to NULL by sk_psock_stop_strp(), called by sk_psock_drop() earlier. So, strp_done can never be called. Introduce SK_PSOCK_RX_ENABLED to mark whether there is strp on psock. Change the condition for calling strp_done from judging whether stream_parser is set to judging whether this flag is set. This flag is only set once when strp_init() succeeds, and will never be cleared later. Fixes: c0d95d3380ee ("bpf, sockmap: Re-evaluate proto ops when psock is removed from sockmap") Signed-off-by: Xu Kuohai Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20230804073740.194770-3-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau --- include/linux/skmsg.h | 1 + net/core/skmsg.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 054d7911bfc9..c1637515a8a4 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -62,6 +62,7 @@ struct sk_psock_progs { enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, + SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a29508e1ff35..ef1a2eb6520b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1120,13 +1120,19 @@ static void sk_psock_strp_data_ready(struct sock *sk) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { + int ret; + static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; - return strp_init(&psock->strp, sk, &cb); + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) @@ -1154,7 +1160,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.stream_parser) + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else From 90f0074cd9f9a24b7b6c4d5afffa676aee48c0e9 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 4 Aug 2023 03:37:39 -0400 Subject: [PATCH 217/230] selftests/bpf: fix a CI failure caused by vsock sockmap test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF CI has reported the following failure: Error: #200/79 sockmap_listen/sockmap VSOCK test_vsock_redir Error: #200/79 sockmap_listen/sockmap VSOCK test_vsock_redir ./test_progs:vsock_unix_redir_connectible:1506: egress: write: Transport endpoint is not connected vsock_unix_redir_connectible:FAIL:1506 ./test_progs:vsock_unix_redir_connectible:1506: ingress: write: Transport endpoint is not connected vsock_unix_redir_connectible:FAIL:1506 ./test_progs:vsock_unix_redir_connectible:1506: egress: write: Transport endpoint is not connected vsock_unix_redir_connectible:FAIL:1506 ./test_progs:vsock_unix_redir_connectible:1514: ingress: recv() err, errno=11 vsock_unix_redir_connectible:FAIL:1514 ./test_progs:vsock_unix_redir_connectible:1518: ingress: vsock socket map failed, a != b vsock_unix_redir_connectible:FAIL:1518 ./test_progs:vsock_unix_redir_connectible:1525: ingress: want pass count 1, have 0 It’s because the recv(... MSG_DONTWAIT) syscall in the test case is called before the queued work sk_psock_backlog() in the kernel finishes executing. So the data to be read is still queued in psock->ingress_skb and cannot be read by the user program. Therefore, the non-blocking recv() reads nothing and reports an EAGAIN error. So replace recv(... MSG_DONTWAIT) with xrecv_nonblock(), which calls select() to wait for data to be readable or timeout before calls recv(). Fixes: d61bd8c1fd02 ("selftests/bpf: add a test case for vsock sockmap") Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20230804073740.194770-4-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index b4f6f3a50ae5..ba35bcc66e7e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1432,7 +1432,7 @@ static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd, if (n < 1) goto out; - n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT); + n = xrecv_nonblock(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), 0); if (n < 0) FAIL("%s: recv() err, errno=%d", log_prefix, errno); if (n == 0) From a4b7193d8efdfde1ea89fe34e921ad031f79f993 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 4 Aug 2023 03:37:40 -0400 Subject: [PATCH 218/230] selftests/bpf: Add sockmap test for redirecting partial skb data Add a test case to check whether sockmap redirection works correctly when data length returned by stream_parser is less than skb->len. In addition, this test checks whether strp_done is called correctly. The reason is that we returns skb->len - 1 from the stream_parser, so the last byte in the skb will be held by strp->skb_head. Therefore, if strp_done is not called to free strp->skb_head, we'll get a memleak warning. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20230804073740.194770-5-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sockmap_listen.c | 72 +++++++++++++++++++ .../selftests/bpf/progs/test_sockmap_listen.c | 14 ++++ 2 files changed, 86 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index ba35bcc66e7e..5674a9d0cacf 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -869,6 +869,77 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); } +static void redir_partial(int family, int sotype, int sock_map, int parser_map) +{ + int s, c0, c1, p0, p1; + int err, n, key, value; + char buf[] = "abc"; + + key = 0; + value = sizeof(buf) - 1; + err = xbpf_map_update_elem(parser_map, &key, &value, 0); + if (err) + return; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + goto clean_parser_map; + + err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1); + if (err) + goto close_srv; + + err = add_to_sockmap(sock_map, p0, p1); + if (err) + goto close; + + n = xsend(c1, buf, sizeof(buf), 0); + if (n < sizeof(buf)) + FAIL("incomplete write"); + + n = xrecv_nonblock(c0, buf, sizeof(buf), 0); + if (n != sizeof(buf) - 1) + FAIL("expect %zu, received %d", sizeof(buf) - 1, n); + +close: + xclose(c0); + xclose(p0); + xclose(c1); + xclose(p1); +close_srv: + xclose(s); + +clean_parser_map: + key = 0; + value = 0; + xbpf_map_update_elem(parser_map, &key, &value, 0); +} + +static void test_skb_redir_partial(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); + int parser_map = bpf_map__fd(skel->maps.parser_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0); + if (err) + return; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) + goto detach; + + redir_partial(family, sotype, sock_map, parser_map); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT); +detach: + xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER); +} + static void test_reuseport_select_listening(int family, int sotype, int sock_map, int verd_map, int reuseport_prog) @@ -1243,6 +1314,7 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, } tests[] = { TEST(test_skb_redir_to_connected), TEST(test_skb_redir_to_listening), + TEST(test_skb_redir_partial), TEST(test_msg_redir_to_connected), TEST(test_msg_redir_to_listening), }; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index 325c9f193432..464d35bd57c7 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -28,12 +28,26 @@ struct { __type(value, unsigned int); } verdict_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} parser_map SEC(".maps"); + bool test_sockmap = false; /* toggled by user-space */ bool test_ingress = false; /* toggled by user-space */ SEC("sk_skb/stream_parser") int prog_stream_parser(struct __sk_buff *skb) { + int *value; + __u32 key = 0; + + value = bpf_map_lookup_elem(&parser_map, &key); + if (value && *value) + return *value; + return skb->len; } From 5f68718b34a531a556f2f50300ead2862278da26 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Aug 2023 14:31:54 +0200 Subject: [PATCH 219/230] netfilter: nf_tables: GC transaction API to avoid race with control plane The set types rhashtable and rbtree use a GC worker to reclaim memory. From system work queue, in periodic intervals, a scan of the table is done. The major caveat here is that the nft transaction mutex is not held. This causes a race between control plane and GC when they attempt to delete the same element. We cannot grab the netlink mutex from the work queue, because the control plane has to wait for the GC work queue in case the set is to be removed, so we get following deadlock: cpu 1 cpu2 GC work transaction comes in , lock nft mutex `acquire nft mutex // BLOCKS transaction asks to remove the set set destruction calls cancel_work_sync() cancel_work_sync will now block forever, because it is waiting for the mutex the caller already owns. This patch adds a new API that deals with garbage collection in two steps: 1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT so they are not visible via lookup. Annotate current GC sequence in the GC transaction. Enqueue GC transaction work as soon as it is full. If ruleset is updated, then GC transaction is aborted and retried later. 2) GC work grabs the mutex. If GC sequence has changed then this GC transaction lost race with control plane, abort it as it contains stale references to objects and let GC try again later. If the ruleset is intact, then this GC transaction deactivates and removes the elements and it uses call_rcu() to destroy elements. Note that no elements are removed from GC lockless path, the _DEAD bit is set and pointers are collected. GC catchall does not remove the elements anymore too. There is a new set->dead flag that is set on to abort the GC transaction to deal with set->ops->destroy() path which removes the remaining elements in the set from commit_release, where no mutex is held. To deal with GC when mutex is held, which allows safe deactivate and removal, add sync GC API which releases the set element object via call_rcu(). This is used by rbtree and pipapo backends which also perform garbage collection from control plane path. Since element removal from sets can happen from control plane and element garbage collection/timeout, it is necessary to keep the set structure alive until all elements have been deactivated and destroyed. We cannot do a cancel_work_sync or flush_work in nft_set_destroy because its called with the transaction mutex held, but the aforementioned async work queue might be blocked on the very mutex that nft_set_destroy() callchain is sitting on. This gives us the choice of ABBA deadlock or UaF. To avoid both, add set->refs refcount_t member. The GC API can then increment the set refcount and release it once the elements have been free'd. Set backends are adapted to use the GC transaction API in a follow up patch entitled: ("netfilter: nf_tables: use gc transaction API in set backends") This is joint work with Florian Westphal. Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 64 +++++++- net/netfilter/nf_tables_api.c | 248 ++++++++++++++++++++++++++++-- 2 files changed, 300 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 640441a2f926..7256e9c80477 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -512,6 +512,7 @@ struct nft_set_elem_expr { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -541,6 +542,7 @@ struct nft_set_elem_expr { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -562,7 +564,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -1592,6 +1595,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) clear_bit(NFT_SET_ELEM_BUSY_BIT, word); } +#define NFT_SET_ELEM_DEAD_MASK (1 << 3) + +#if defined(__LITTLE_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT 3 +#elif defined(__BIG_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#else +#error +#endif + +static inline void nft_set_elem_dead(struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + /** * struct nft_trans - nf_tables object update in transaction * @@ -1732,6 +1761,38 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1758,6 +1819,7 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; unsigned int base_seq; + unsigned int gc_seq; }; extern unsigned int nf_tables_net_id; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b4321869e5c6..c28bacb9479b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, @@ -120,6 +122,9 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -582,10 +587,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem); - static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -5055,6 +5056,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -5122,6 +5124,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, } } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -5134,8 +5144,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, @@ -6278,7 +6287,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask) && - !nft_set_elem_expired(ext)) + !nft_set_elem_expired(ext) && + !nft_set_elem_is_dead(ext)) return ext; } @@ -6933,9 +6943,9 @@ static void nft_setelem_data_activate(const struct net *net, nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -9418,6 +9428,207 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + nft_setelem_remove(ctx->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + if (!nft_setelem_is_catchall(trans->set, &elem)) + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = nft_pernet(trans->net); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. + */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_destroy_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + refcount_inc(&set->refs); + trans->set = set; + trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + if (nft_trans_gc_space(gc)) + return gc; + + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(gc->set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(gc->set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + struct nft_set_elem_catchall *catchall; + const struct nft_set *set = gc->set; + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + if (nft_set_elem_is_dead(ext)) + goto dead_elem; + + nft_set_elem_dead(ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + return NULL; + + nft_trans_gc_elem_add(gc, catchall->elem); + } + + return gc; +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -9580,11 +9791,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; + unsigned int base_seq, gc_seq; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; - unsigned int base_seq; LIST_HEAD(adl); int err; @@ -9661,6 +9872,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(nft_net->base_seq, base_seq); + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -9768,6 +9983,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), trans->msg_type, GFP_KERNEL); @@ -9870,6 +10086,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); + + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); nf_tables_commit_release(net); return 0; @@ -10919,6 +11137,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; + nft_net->gc_seq = 0; return 0; } @@ -10947,10 +11166,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; @@ -11022,6 +11247,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); From f6c383b8c31a93752a52697f8430a71dcbc46adf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Aug 2023 14:54:23 +0200 Subject: [PATCH 220/230] netfilter: nf_tables: adapt set backend to use GC transaction API Use the GC transaction API to replace the old and buggy gc API and the busy mark approach. No set elements are removed from async garbage collection anymore, instead the _DEAD bit is set on so the set element is not visible from lookup path anymore. Async GC enqueues transaction work that might be aborted and retried later. rbtree and pipapo set backends does not set on the _DEAD bit from the sync GC path since this runs in control plane path where mutex is held. In this case, set elements are deactivated, removed and then released via RCU callback, sync GC never fails. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +- net/netfilter/nft_set_hash.c | 77 +++++++++++------- net/netfilter/nft_set_pipapo.c | 50 +++++++++--- net/netfilter/nft_set_rbtree.c | 144 ++++++++++++++++++++------------- 4 files changed, 174 insertions(+), 104 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c28bacb9479b..fd4b5da7ac3c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6380,7 +6380,6 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, if (nft_setelem_is_catchall(set, elem)) { nft_set_elem_change_active(net, set, ext); - nft_set_elem_clear_busy(ext); } else { set->ops->activate(net, set, elem); } @@ -6395,8 +6394,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_is_active(net, ext) || - nft_set_elem_mark_busy(ext)) + if (!nft_is_active(net, ext)) continue; kfree(elem->priv); @@ -7109,8 +7107,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask) || - nft_set_elem_mark_busy(ext)) + if (!nft_set_elem_active(ext, genmask)) continue; elem.priv = catchall->elem; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 24caa31fa231..2f067e4596b0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -312,25 +309,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; + if (PTR_ERR(he) != -EAGAIN) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } continue; } + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && nft_rhash_expr_needs_gc_run(set, &he->ext)) goto needs_gc_run; @@ -338,26 +358,26 @@ static void nft_rhash_gc(struct work_struct *work) if (!nft_set_elem_expired(&he->ext)) continue; needs_gc_run: - if (nft_set_elem_mark_busy(&he->ext)) - continue; + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); + nft_trans_gc_elem_add(gc, he); } + + gc = nft_trans_gc_catchall(gc, gc_seq); + +try_later: + /* catchall list iteration requires rcu read side lock. */ rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - he = nft_set_catchall_gc(set); - if (he) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, he); - } - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); + +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -420,7 +440,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)&rhash_ctx); } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index d54784ea465b..a5b8301afe4a 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1536,16 +1536,34 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) + +{ + struct nft_set_elem elem = { + .priv = e, + }; + + nft_setelem_data_deactivate(net, set, &elem); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) { + struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; @@ -1569,13 +1587,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { - priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. + */ + if (nft_set_elem_expired(&e->ext)) { + priv->dirty = true; + + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) + break; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. @@ -1585,11 +1610,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - e = nft_set_catchall_gc(set); - if (e) - nft_set_elem_destroy(set, e, true); - - priv->last_gc = jiffies; + gc = nft_trans_gc_catchall(gc, 0); + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1714,7 +1739,6 @@ static void nft_pipapo_activate(const struct net *net, return; nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); } /** diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 39956e5341c9..f9d4c8fcbbf8 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); +} + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext, unsigned int seq) @@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe, @@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_set_gc_batch *gcb; + struct nft_trans_gc *gc; - gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); - if (!gcb) + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) return -ENOMEM; /* search for end interval coming before this element. @@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); - rb_erase(&rbe_prev->node, &priv->root); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_prev); + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. + */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe_prev); } - rb_erase(&rbe->node, &priv->root); - atomic_dec(&set->nelems); + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; - nft_set_gc_batch_add(gcb, rbe); - nft_set_gc_batch_complete(gcb); + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); return 0; } @@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -570,26 +596,40 @@ cont: static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct nftables_pernet *nft_net; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; struct net *net; - u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - genmask = nft_genmask_cur(net); + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. */ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (!nft_set_elem_active(&rbe->ext, genmask)) - continue; + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is @@ -602,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work) if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) { - rbe_end = NULL; + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - } - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); + + gc = nft_trans_gc_catchall(gc, gc_seq); + +try_later: write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - rbe = nft_set_catchall_gc(set); - if (rbe) { - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb) - nft_set_gc_batch_add(gcb, rbe); - } - nft_set_gc_batch_complete(gcb); - + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } From c92db3030492b8ad1d0faace7a93bbcf53850d0c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Aug 2023 15:00:06 +0200 Subject: [PATCH 221/230] netfilter: nft_set_hash: mark set element as dead when deleting from packet path Set on the NFT_SET_ELEM_DEAD_BIT flag on this element, instead of performing element removal which might race with an ongoing transaction. Enable gc when dynamic flag is set on since dynset deletion requires garbage collection after this patch. Fixes: d0a8d877da97 ("netfilter: nft_dynset: support for element deletion") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 2f067e4596b0..cef5df846000 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -249,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, @@ -412,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; From a2dd0233cbc4d8a0abb5f64487487ffc9265beb5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Aug 2023 15:00:36 +0200 Subject: [PATCH 222/230] netfilter: nf_tables: remove busy mark and gc batch API Ditch it, it has been replace it by the GC transaction API and it has no clients anymore. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 98 +------------------------------ net/netfilter/nf_tables_api.c | 48 +-------------- 2 files changed, 4 insertions(+), 142 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7256e9c80477..35870858ddf2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -599,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); -void *nft_set_catchall_gc(const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -816,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} - struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type @@ -1560,47 +1503,12 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. - */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) -#else -#error -#endif - -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -#define NFT_SET_ELEM_DEAD_MASK (1 << 3) - -#if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT 3 -#elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd4b5da7ac3c..c62227ae7746 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6296,29 +6296,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, } EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); -void *nft_set_catchall_gc(const struct nft_set *set) -{ - struct nft_set_elem_catchall *catchall, *next; - struct nft_set_ext *ext; - void *elem = NULL; - - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - - if (!nft_set_elem_expired(ext) || - nft_set_elem_mark_busy(ext)) - continue; - - elem = catchall->elem; - list_del_rcu(&catchall->list); - kfree_rcu(catchall, rcu); - break; - } - - return elem; -} -EXPORT_SYMBOL_GPL(nft_set_catchall_gc); - static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, @@ -6789,7 +6766,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_elem_free; } - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; + ext->genmask = nft_genmask_cur(ctx->net); err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); if (err) { @@ -7181,29 +7158,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return err; } -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; -} - /* * Stateful objects */ From a7dfeda6fdeccab4c7c3dce9a72c4262b9530c80 Mon Sep 17 00:00:00 2001 From: Souradeep Chakrabarti Date: Wed, 9 Aug 2023 03:22:05 -0700 Subject: [PATCH 223/230] net: mana: Fix MANA VF unload when hardware is unresponsive When unloading the MANA driver, mana_dealloc_queues() waits for the MANA hardware to complete any inflight packets and set the pending send count to zero. But if the hardware has failed, mana_dealloc_queues() could wait forever. Fix this by adding a timeout to the wait. Set the timeout to 120 seconds, which is a somewhat arbitrary value that is more than long enough for functional hardware to complete any sends. Cc: stable@vger.kernel.org Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Souradeep Chakrabarti Link: https://lore.kernel.org/r/1691576525-24271-1-git-send-email-schakrabarti@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index a499e460594b..c2ad0921e893 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -2345,9 +2346,12 @@ int mana_attach(struct net_device *ndev) static int mana_dealloc_queues(struct net_device *ndev) { struct mana_port_context *apc = netdev_priv(ndev); + unsigned long timeout = jiffies + 120 * HZ; struct gdma_dev *gd = apc->ac->gdma_dev; struct mana_txq *txq; + struct sk_buff *skb; int i, err; + u32 tsleep; if (apc->port_is_up) return -EINVAL; @@ -2363,15 +2367,40 @@ static int mana_dealloc_queues(struct net_device *ndev) * to false, but it doesn't matter since mana_start_xmit() drops any * new packets due to apc->port_is_up being false. * - * Drain all the in-flight TX packets + * Drain all the in-flight TX packets. + * A timeout of 120 seconds for all the queues is used. + * This will break the while loop when h/w is not responding. + * This value of 120 has been decided here considering max + * number of queues. */ + for (i = 0; i < apc->num_queues; i++) { txq = &apc->tx_qp[i].txq; - - while (atomic_read(&txq->pending_sends) > 0) - usleep_range(1000, 2000); + tsleep = 1000; + while (atomic_read(&txq->pending_sends) > 0 && + time_before(jiffies, timeout)) { + usleep_range(tsleep, tsleep + 1000); + tsleep <<= 1; + } + if (atomic_read(&txq->pending_sends)) { + err = pcie_flr(to_pci_dev(gd->gdma_context->dev)); + if (err) { + netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n", + err, atomic_read(&txq->pending_sends), + txq->gdma_txq_id); + } + break; + } } + for (i = 0; i < apc->num_queues; i++) { + txq = &apc->tx_qp[i].txq; + while ((skb = skb_dequeue(&txq->pending_skbs))) { + mana_unmap_skb(skb, apc); + dev_kfree_skb_any(skb); + } + atomic_set(&txq->pending_sends, 0); + } /* We're 100% sure the queues can no longer be woken up, because * we're sure now mana_poll_tx_cq() can't be running. */ From db17ba719bceb52f0ae4ebca0e4c17d9a3bebf05 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 9 Aug 2023 17:10:34 -0500 Subject: [PATCH 224/230] ibmvnic: Enforce stronger sanity checks on login response Ensure that all offsets in a login response buffer are within the size of the allocated response buffer. Any offsets or lengths that surpass the allocation are likely the result of an incomplete response buffer. In these cases, a full reset is necessary. When attempting to login, the ibmvnic device will allocate a response buffer and pass a reference to the VIOS. The VIOS will then send the ibmvnic device a LOGIN_RSP CRQ to signal that the buffer has been filled with data. If the ibmvnic device does not get a response in 20 seconds, the old buffer is freed and a new login request is sent. With 2 outstanding requests, any LOGIN_RSP CRQ's could be for the older login request. If this is the case then the login response buffer (which is for the newer login request) could be incomplete and contain invalid data. Therefore, we must enforce strict sanity checks on the response buffer values. Testing has shown that the `off_rxadd_buff_size` value is filled in last by the VIOS and will be the smoking gun for these circumstances. Until VIOS can implement a mechanism for tracking outstanding response buffers and a method for mapping a LOGIN_RSP CRQ to a particular login response buffer, the best ibmvnic can do in this situation is perform a full reset. Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230809221038.51296-1-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 763d613adbcc..f4bb2c9ab9a4 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5396,6 +5396,7 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, int num_tx_pools; int num_rx_pools; u64 *size_array; + u32 rsp_len; int i; /* CHECK: Test/set of login_pending does not need to be atomic @@ -5447,6 +5448,23 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, ibmvnic_reset(adapter, VNIC_RESET_FATAL); return -EIO; } + + rsp_len = be32_to_cpu(login_rsp->len); + if (be32_to_cpu(login->login_rsp_len) < rsp_len || + rsp_len <= be32_to_cpu(login_rsp->off_txsubm_subcrqs) || + rsp_len <= be32_to_cpu(login_rsp->off_rxadd_subcrqs) || + rsp_len <= be32_to_cpu(login_rsp->off_rxadd_buff_size) || + rsp_len <= be32_to_cpu(login_rsp->off_supp_tx_desc)) { + /* This can happen if a login request times out and there are + * 2 outstanding login requests sent, the LOGIN_RSP crq + * could have been for the older login request. So we are + * parsing the newer response buffer which may be incomplete + */ + dev_err(dev, "FATAL: Login rsp offsets/lengths invalid\n"); + ibmvnic_reset(adapter, VNIC_RESET_FATAL); + return -EIO; + } + size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size)); /* variable buffer sizes are not supported, so just read the From 411c565b4bc63e9584a8493882bd566e35a90588 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 9 Aug 2023 17:10:35 -0500 Subject: [PATCH 225/230] ibmvnic: Unmap DMA login rsp buffer on send login fail If the LOGIN CRQ fails to send then we must DMA unmap the response buffer. Previously, if the CRQ failed then the memory was freed without DMA unmapping. Fixes: c98d9cc4170d ("ibmvnic: send_login should check for crq errors") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230809221038.51296-2-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index f4bb2c9ab9a4..668c67556190 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4830,11 +4830,14 @@ static int send_login(struct ibmvnic_adapter *adapter) if (rc) { adapter->login_pending = false; netdev_err(adapter->netdev, "Failed to send login, rc=%d\n", rc); - goto buf_rsp_map_failed; + goto buf_send_failed; } return 0; +buf_send_failed: + dma_unmap_single(dev, rsp_buffer_token, rsp_buffer_size, + DMA_FROM_DEVICE); buf_rsp_map_failed: kfree(login_rsp_buffer); adapter->login_rsp_buf = NULL; From d78a671eb8996af19d6311ecdee9790d2fa479f0 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 9 Aug 2023 17:10:36 -0500 Subject: [PATCH 226/230] ibmvnic: Handle DMA unmapping of login buffs in release functions Rather than leaving the DMA unmapping of the login buffers to the login response handler, move this work into the login release functions. Previously, these functions were only used for freeing the allocated buffers. This could lead to issues if there are more than one outstanding login buffer requests, which is possible if a login request times out. If a login request times out, then there is another call to send login. The send login function makes a call to the login buffer release function. In the past, this freed the buffers but did not DMA unmap. Therefore, the VIOS could still write to the old login (now freed) buffer. It is for this reason that it is a good idea to leave the DMA unmap call to the login buffers release function. Since the login buffer release functions now handle DMA unmapping, remove the duplicate DMA unmapping in handle_login_rsp(). Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230809221038.51296-3-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 668c67556190..77b0a744fa88 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1588,12 +1588,22 @@ static int ibmvnic_login(struct net_device *netdev) static void release_login_buffer(struct ibmvnic_adapter *adapter) { + if (!adapter->login_buf) + return; + + dma_unmap_single(&adapter->vdev->dev, adapter->login_buf_token, + adapter->login_buf_sz, DMA_TO_DEVICE); kfree(adapter->login_buf); adapter->login_buf = NULL; } static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter) { + if (!adapter->login_rsp_buf) + return; + + dma_unmap_single(&adapter->vdev->dev, adapter->login_rsp_buf_token, + adapter->login_rsp_buf_sz, DMA_FROM_DEVICE); kfree(adapter->login_rsp_buf); adapter->login_rsp_buf = NULL; } @@ -5411,11 +5421,6 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, } adapter->login_pending = false; - dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz, - DMA_TO_DEVICE); - dma_unmap_single(dev, adapter->login_rsp_buf_token, - adapter->login_rsp_buf_sz, DMA_FROM_DEVICE); - /* If the number of queues requested can't be allocated by the * server, the login response will return with code 1. We will need * to resend the login buffer with fewer queues requested. From 23cc5f667453ca7645a24c8d21bf84dbf61107b2 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 9 Aug 2023 17:10:37 -0500 Subject: [PATCH 227/230] ibmvnic: Do partial reset on login failure Perform a partial reset before sending a login request if any of the following are true: 1. If a previous request times out. This can be dangerous because the VIOS could still receive the old login request at any point after the timeout. Therefore, it is best to re-register the CRQ's and sub-CRQ's before retrying. 2. If the previous request returns an error that is not described in PAPR. PAPR provides procedures if the login returns with partial success or aborted return codes (section L.5.1) but other values do not have a defined procedure. Previously, these conditions just returned error from the login function rather than trying to resolve the issue. This can cause further issues since most callers of the login function are not prepared to handle an error when logging in. This improper cleanup can lead to the device being permanently DOWN'd. For example, if the VIOS believes that the device is already logged in then it will return INVALID_STATE (-7). If we never re-register CRQ's then it will always think that the device is already logged in. This leaves the device inoperable. The partial reset involves freeing the sub-CRQs, freeing the CRQ then registering and initializing a new CRQ and sub-CRQs. This essentially restarts all communication with VIOS to allow for a fresh login attempt that will be unhindered by any previous failed attempts. Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230809221038.51296-4-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 46 ++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 77b0a744fa88..e9619957d58a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -97,6 +97,8 @@ static int pending_scrq(struct ibmvnic_adapter *, static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *, struct ibmvnic_sub_crq_queue *); static int ibmvnic_poll(struct napi_struct *napi, int data); +static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter); +static inline void reinit_init_done(struct ibmvnic_adapter *adapter); static void send_query_map(struct ibmvnic_adapter *adapter); static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8); static int send_request_unmap(struct ibmvnic_adapter *, u8); @@ -1527,11 +1529,9 @@ static int ibmvnic_login(struct net_device *netdev) if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { - netdev_warn(netdev, "Login timed out, retrying...\n"); - retry = true; - adapter->init_done_rc = 0; - retry_count++; - continue; + netdev_warn(netdev, "Login timed out\n"); + adapter->login_pending = false; + goto partial_reset; } if (adapter->init_done_rc == ABORTED) { @@ -1576,7 +1576,41 @@ static int ibmvnic_login(struct net_device *netdev) } else if (adapter->init_done_rc) { netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n", adapter->init_done_rc); - return -EIO; + +partial_reset: + /* adapter login failed, so free any CRQs or sub-CRQs + * and register again before attempting to login again. + * If we don't do this then the VIOS may think that + * we are already logged in and reject any subsequent + * attempts + */ + netdev_warn(netdev, + "Freeing and re-registering CRQs before attempting to login again\n"); + retry = true; + adapter->init_done_rc = 0; + retry_count++; + release_sub_crqs(adapter, true); + reinit_init_done(adapter); + release_crq_queue(adapter); + /* If we don't sleep here then we risk an unnecessary + * failover event from the VIOS. This is a known VIOS + * issue caused by a vnic device freeing and registering + * a CRQ too quickly. + */ + msleep(1500); + rc = init_crq_queue(adapter); + if (rc) { + netdev_err(netdev, "login recovery: init CRQ failed %d\n", + rc); + return -EIO; + } + + rc = ibmvnic_reset_init(adapter, false); + if (rc) { + netdev_err(netdev, "login recovery: Reset init failed %d\n", + rc); + return -EIO; + } } } while (retry); From 6db541ae279bd4e76dbd939e5fbf298396166242 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 9 Aug 2023 17:10:38 -0500 Subject: [PATCH 228/230] ibmvnic: Ensure login failure recovery is safe from other resets If a login request fails, the recovery process should be protected against parallel resets. It is a known issue that freeing and registering CRQ's in quick succession can result in a failover CRQ from the VIOS. Processing a failover during login recovery is dangerous for two reasons: 1. This will result in two parallel initialization processes, this can cause serious issues during login. 2. It is possible that the failover CRQ is received but never executed. We get notified of a pending failover through a transport event CRQ. The reset is not performed until a INIT CRQ request is received. Previously, if CRQ init fails during login recovery, then the ibmvnic irq is freed and the login process returned error. If failover_pending is true (a transport event was received), then the ibmvnic device would never be able to process the reset since it cannot receive the CRQ_INIT request due to the irq being freed. This leaved the device in a inoperable state. Therefore, the login failure recovery process must be hardened against these possible issues. Possible failovers (due to quick CRQ free and init) must be avoided and any issues during re-initialization should be dealt with instead of being propagated up the stack. This logic is similar to that of ibmvnic_probe(). Fixes: dff515a3e71d ("ibmvnic: Harden device login requests") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230809221038.51296-5-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index e9619957d58a..df76cdaddcfb 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -116,6 +116,7 @@ static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, static void free_long_term_buff(struct ibmvnic_adapter *adapter, struct ibmvnic_long_term_buff *ltb); static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter); +static void flush_reset_queue(struct ibmvnic_adapter *adapter); struct ibmvnic_stat { char name[ETH_GSTRING_LEN]; @@ -1507,8 +1508,8 @@ static const char *adapter_state_to_string(enum vnic_state state) static int ibmvnic_login(struct net_device *netdev) { + unsigned long flags, timeout = msecs_to_jiffies(20000); struct ibmvnic_adapter *adapter = netdev_priv(netdev); - unsigned long timeout = msecs_to_jiffies(20000); int retry_count = 0; int retries = 10; bool retry; @@ -1573,6 +1574,7 @@ static int ibmvnic_login(struct net_device *netdev) "SCRQ irq initialization failed\n"); return rc; } + /* Default/timeout error handling, reset and start fresh */ } else if (adapter->init_done_rc) { netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n", adapter->init_done_rc); @@ -1588,29 +1590,53 @@ partial_reset: "Freeing and re-registering CRQs before attempting to login again\n"); retry = true; adapter->init_done_rc = 0; - retry_count++; release_sub_crqs(adapter, true); - reinit_init_done(adapter); - release_crq_queue(adapter); - /* If we don't sleep here then we risk an unnecessary - * failover event from the VIOS. This is a known VIOS - * issue caused by a vnic device freeing and registering - * a CRQ too quickly. + /* Much of this is similar logic as ibmvnic_probe(), + * we are essentially re-initializing communication + * with the server. We really should not run any + * resets/failovers here because this is already a form + * of reset and we do not want parallel resets occurring */ - msleep(1500); - rc = init_crq_queue(adapter); - if (rc) { - netdev_err(netdev, "login recovery: init CRQ failed %d\n", - rc); - return -EIO; - } + do { + reinit_init_done(adapter); + /* Clear any failovers we got in the previous + * pass since we are re-initializing the CRQ + */ + adapter->failover_pending = false; + release_crq_queue(adapter); + /* If we don't sleep here then we risk an + * unnecessary failover event from the VIOS. + * This is a known VIOS issue caused by a vnic + * device freeing and registering a CRQ too + * quickly. + */ + msleep(1500); + /* Avoid any resets, since we are currently + * resetting. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); + flush_reset_queue(adapter); + spin_unlock_irqrestore(&adapter->rwi_lock, + flags); - rc = ibmvnic_reset_init(adapter, false); - if (rc) { - netdev_err(netdev, "login recovery: Reset init failed %d\n", - rc); - return -EIO; - } + rc = init_crq_queue(adapter); + if (rc) { + netdev_err(netdev, "login recovery: init CRQ failed %d\n", + rc); + return -EIO; + } + + rc = ibmvnic_reset_init(adapter, false); + if (rc) + netdev_err(netdev, "login recovery: Reset init failed %d\n", + rc); + /* IBMVNIC_CRQ_INIT will return EAGAIN if it + * fails, since ibmvnic_reset_init will free + * irq's in failure, we won't be able to receive + * new CRQs so we need to keep trying. probe() + * handles this similarly. + */ + } while (rc == -EAGAIN && retry_count++ < retries); } } while (retry); From 6b486676b41c369fe822fe65771ffda7eeb3ea6f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Aug 2023 11:09:17 -0700 Subject: [PATCH 229/230] net: tls: set MSG_SPLICE_PAGES consistently We used to change the flags for the last segment, because non-last segments had the MSG_SENDPAGE_NOTLAST flag set. That flag is no longer a thing so remove the setting. Since flags most likely don't have MSG_SPLICE_PAGES set this avoids passing parts of the sg as splice and parts as non-splice. Before commit under Fixes we'd have called tcp_sendpage() which would add the MSG_SPLICE_PAGES. Why this leads to trouble remains unclear but Tariq reports hitting the WARN_ON(!sendpage_ok()) due to page refcount of 0. Fixes: e117dcfd646e ("tls: Inline do_tcp_sendpages()") Reported-by: Tariq Toukan Link: https://lore.kernel.org/all/4c49176f-147a-4283-f1b1-32aac7b4b996@gmail.com/ Tested-by: Tariq Toukan Link: https://lore.kernel.org/r/20230808180917.1243540-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b6896126bb92..4a8ee2f6badb 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -139,9 +139,6 @@ int tls_push_sg(struct sock *sk, ctx->splicing_pages = true; while (1) { - if (sg_is_last(sg)) - msg.msg_flags = flags; - /* is sending application-limited? */ tcp_rate_check_app_limited(sk); p = sg_page(sg); From 5e3d20617b055e725e785e0058426368269949f3 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 9 Aug 2023 10:09:02 +0800 Subject: [PATCH 230/230] net: hns3: fix strscpy causing content truncation issue hns3_dbg_fill_content()/hclge_dbg_fill_content() is aim to integrate some items to a string for content, and we add '\n' and '\0' in the last two bytes of content. strscpy() will add '\0' in the last byte of destination buffer(one of items), it result in finishing content print ahead of schedule and some dump content truncation. One Error log shows as below: cat mac_list/uc UC MAC_LIST: Expected: UC MAC_LIST: FUNC_ID MAC_ADDR STATE pf 00:2b:19:05:03:00 ACTIVE The destination buffer is length-bounded and not required to be NUL-terminated, so just change strscpy() to memcpy() to fix it. Fixes: 1cf3d5567f27 ("net: hns3: fix strncpy() not using dest-buf length as length issue") Signed-off-by: Hao Chen Signed-off-by: Jijie Shao Link: https://lore.kernel.org/r/20230809020902.1941471-1-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 52546f625c8b..f276b5ecb431 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -464,9 +464,9 @@ static void hns3_dbg_fill_content(char *content, u16 len, if (result) { if (item_len < strlen(result[i])) break; - strscpy(pos, result[i], strlen(result[i])); + memcpy(pos, result[i], strlen(result[i])); } else { - strscpy(pos, items[i].name, strlen(items[i].name)); + memcpy(pos, items[i].name, strlen(items[i].name)); } pos += item_len; len -= item_len; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 409db2e70965..0fb2eaee3e8a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -111,9 +111,9 @@ static void hclge_dbg_fill_content(char *content, u16 len, if (result) { if (item_len < strlen(result[i])) break; - strscpy(pos, result[i], strlen(result[i])); + memcpy(pos, result[i], strlen(result[i])); } else { - strscpy(pos, items[i].name, strlen(items[i].name)); + memcpy(pos, items[i].name, strlen(items[i].name)); } pos += item_len; len -= item_len;