From fc5a40694ba684fb3b7009819965ec38e829118f Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 23 Dec 2021 18:33:39 +0100 Subject: [PATCH 001/773] Revert "dt-bindings: arm: qcom: Document SDX65 platform and boards" This reverts commit 3b338c9a6a2afd6db46d5d8e39ae4f5eef420bf8. This was a duplicate of 61339f368d59d25e22401731f89de44e3215508b, causing the sdx65 compatible and its board to be documented twice. Signed-off-by: Stanislav Jakubek Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20211223173339.GA3925@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/arm/qcom.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml index 2d613282816a..c8808e0f9e64 100644 --- a/Documentation/devicetree/bindings/arm/qcom.yaml +++ b/Documentation/devicetree/bindings/arm/qcom.yaml @@ -48,7 +48,6 @@ description: | sdx65 sm7225 sm8150 - sdx65 sm8250 sm8350 @@ -225,11 +224,6 @@ properties: - qcom,sdx65-mtp - const: qcom,sdx65 - - items: - - enum: - - qcom,sdx65-mtp - - const: qcom,sdx65 - - items: - enum: - qcom,ipq6018-cp01 From 2ddd96aadbd0412040ef49eda94549c32de6c92c Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 23 Jan 2022 14:36:15 +0100 Subject: [PATCH 002/773] arm64: dts: rockchip: fix dma-controller node names on rk356x DMA-Cotrollers defined in rk356x.dtsi do not match the pattern in bindings. arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dt.yaml: dmac@fe530000: $nodename:0: 'dmac@fe530000' does not match '^dma-controller(@.*)?$' From schema: Documentation/devicetree/bindings/dma/arm,pl330.yaml arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dt.yaml: dmac@fe550000: $nodename:0: 'dmac@fe550000' does not match '^dma-controller(@.*)?$' From schema: Documentation/devicetree/bindings/dma/arm,pl330.yaml This Patch fixes it. Signed-off-by: Frank Wunderlich Link: https://lore.kernel.org/r/20220123133615.135789-1-linux@fw-web.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk356x.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index a68033a23975..8ccce54ee8e7 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -651,7 +651,7 @@ status = "disabled"; }; - dmac0: dmac@fe530000 { + dmac0: dma-controller@fe530000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xfe530000 0x0 0x4000>; interrupts = , @@ -662,7 +662,7 @@ #dma-cells = <1>; }; - dmac1: dmac@fe550000 { + dmac1: dma-controller@fe550000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xfe550000 0x0 0x4000>; interrupts = , From 85a8bccfa945680dc561f06b65ea01341d2033fc Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 23 Jan 2022 14:35:10 +0100 Subject: [PATCH 003/773] arm64: dts: rockchip: drop pclk_xpcs from gmac0 on rk3568 pclk_xpcs is not supported by mainline driver and breaks dtbs_check following warnings occour, and many more rk3568-evb1-v10.dt.yaml: ethernet@fe2a0000: clocks: [[15, 386], [15, 389], [15, 389], [15, 184], [15, 180], [15, 181], [15, 389], [15, 185], [15, 172]] is too long From schema: Documentation/devicetree/bindings/net/snps,dwmac.yaml rk3568-evb1-v10.dt.yaml: ethernet@fe2a0000: clock-names: ['stmmaceth', 'mac_clk_rx', 'mac_clk_tx', 'clk_mac_refout', 'aclk_mac', 'pclk_mac', 'clk_mac_speed', 'ptp_ref', 'pclk_xpcs'] is too long From schema: Documentation/devicetree/bindings/net/snps,dwmac.yaml after removing it, the clock and other warnings are gone. pclk_xpcs on gmac is used to support QSGMII, but this requires a driver supporting it. Once xpcs support is introduced, the clock can be added to the documentation and both controllers. Fixes: b8d41e5053cd ("arm64: dts: rockchip: add gmac0 node to rk3568") Co-developed-by: Peter Geis Signed-off-by: Peter Geis Signed-off-by: Frank Wunderlich Acked-by: Michael Riesch Link: https://lore.kernel.org/r/20220123133510.135651-1-linux@fw-web.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3568.dtsi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3568.dtsi b/arch/arm64/boot/dts/rockchip/rk3568.dtsi index 2fd313a295f8..d91df1cde736 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3568.dtsi @@ -32,13 +32,11 @@ clocks = <&cru SCLK_GMAC0>, <&cru SCLK_GMAC0_RX_TX>, <&cru SCLK_GMAC0_RX_TX>, <&cru CLK_MAC0_REFOUT>, <&cru ACLK_GMAC0>, <&cru PCLK_GMAC0>, - <&cru SCLK_GMAC0_RX_TX>, <&cru CLK_GMAC0_PTP_REF>, - <&cru PCLK_XPCS>; + <&cru SCLK_GMAC0_RX_TX>, <&cru CLK_GMAC0_PTP_REF>; clock-names = "stmmaceth", "mac_clk_rx", "mac_clk_tx", "clk_mac_refout", "aclk_mac", "pclk_mac", - "clk_mac_speed", "ptp_ref", - "pclk_xpcs"; + "clk_mac_speed", "ptp_ref"; resets = <&cru SRST_A_GMAC0>; reset-names = "stmmaceth"; rockchip,grf = <&grf>; From ed2c66a95c0c5669880aa93d0d34c6e9694b4cbd Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Thu, 20 Jan 2022 13:51:56 +0100 Subject: [PATCH 004/773] arm64: dts: rockchip: fix rk3399-puma-haikou USB OTG mode The micro USB3.0 port available on the Haikou evaluation kit for Puma RK3399-Q7 SoM supports dual-role model (aka drd or OTG) but its support was broken until now because of missing logic around the ID pin. This adds proper support for USB OTG on Puma Haikou by "connecting" the GPIO used for USB ID to the USB3 controller device. Cc: Quentin Schulz Signed-off-by: Quentin Schulz Link: https://lore.kernel.org/r/20220120125156.16217-1-quentin.schulz@theobroma-systems.com Signed-off-by: Heiko Stuebner --- .../arm64/boot/dts/rockchip/rk3399-puma-haikou.dts | 1 + arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts index 292bb7e80cf3..3ae5d727e367 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou.dts @@ -232,6 +232,7 @@ &usbdrd_dwc3_0 { dr_mode = "otg"; + extcon = <&extcon_usb3>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index fb67db4619ea..002ece51c3ba 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -25,6 +25,13 @@ }; }; + extcon_usb3: extcon-usb3 { + compatible = "linux,extcon-usb-gpio"; + id-gpio = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>; + pinctrl-names = "default"; + pinctrl-0 = <&usb3_id>; + }; + clkin_gmac: external-gmac-clock { compatible = "fixed-clock"; clock-frequency = <125000000>; @@ -422,6 +429,13 @@ <4 RK_PA3 RK_FUNC_GPIO &pcfg_pull_none>; }; }; + + usb3 { + usb3_id: usb3-id { + rockchip,pins = + <1 RK_PC2 RK_FUNC_GPIO &pcfg_pull_none>; + }; + }; }; &sdhci { From b5fbaf7d779f5f02b7f75b080e7707222573be2a Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 14 Jan 2022 15:02:07 -0800 Subject: [PATCH 005/773] arm64: dts: rockchip: Switch RK3399-Gru DP to SPDIF output Commit b18c6c3c7768 ("ASoC: rockchip: cdn-dp sound output use spdif") switched the platform to SPDIF, but we didn't fix up the device tree. Drop the pinctrl settings, because the 'spdif_bus' pins are either: * unused (on kevin, bob), so the settings is ~harmless * used by a different function (on scarlet), which causes probe failures (!!) Fixes: b18c6c3c7768 ("ASoC: rockchip: cdn-dp sound output use spdif") Signed-off-by: Brian Norris Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20220114150129.v2.1.I46f64b00508d9dff34abe1c3e8d2defdab4ea1e5@changeid Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi index 45a5ae5d2027..162f08bca0d4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi @@ -286,7 +286,7 @@ sound: sound { compatible = "rockchip,rk3399-gru-sound"; - rockchip,cpu = <&i2s0 &i2s2>; + rockchip,cpu = <&i2s0 &spdif>; }; }; @@ -437,10 +437,6 @@ ap_i2c_audio: &i2c8 { status = "okay"; }; -&i2s2 { - status = "okay"; -}; - &io_domains { status = "okay"; @@ -537,6 +533,17 @@ ap_i2c_audio: &i2c8 { vqmmc-supply = <&ppvar_sd_card_io>; }; +&spdif { + status = "okay"; + + /* + * SPDIF is routed internally to DP; we either don't use these pins, or + * mux them to something else. + */ + /delete-property/ pinctrl-0; + /delete-property/ pinctrl-names; +}; + &spi1 { status = "okay"; From b7a78a8adaa8849c02f174d707aead0f85dca0da Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 7 Jan 2022 09:14:01 +0100 Subject: [PATCH 006/773] iio: adc: tsc2046: fix memory corruption by preventing array overflow On one side we have indio_dev->num_channels includes all physical channels + timestamp channel. On other side we have an array allocated only for physical channels. So, fix memory corruption by ARRAY_SIZE() instead of num_channels variable. Note the first case is a cleanup rather than a fix as the software timestamp channel bit in active_scanmask is never set by the IIO core. Fixes: 9374e8f5a38d ("iio: adc: add ADC driver for the TI TSC2046 controller") Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20220107081401.2816357-1-o.rempel@pengutronix.de Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-tsc2046.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ti-tsc2046.c b/drivers/iio/adc/ti-tsc2046.c index d84ae6b008c1..e8fc4d01f30b 100644 --- a/drivers/iio/adc/ti-tsc2046.c +++ b/drivers/iio/adc/ti-tsc2046.c @@ -388,7 +388,7 @@ static int tsc2046_adc_update_scan_mode(struct iio_dev *indio_dev, mutex_lock(&priv->slock); size = 0; - for_each_set_bit(ch_idx, active_scan_mask, indio_dev->num_channels) { + for_each_set_bit(ch_idx, active_scan_mask, ARRAY_SIZE(priv->l)) { size += tsc2046_adc_group_set_layout(priv, group, ch_idx); tsc2046_adc_group_set_cmd(priv, group, ch_idx); group++; @@ -548,7 +548,7 @@ static int tsc2046_adc_setup_spi_msg(struct tsc2046_adc_priv *priv) * enabled. */ size = 0; - for (ch_idx = 0; ch_idx < priv->dcfg->num_channels; ch_idx++) + for (ch_idx = 0; ch_idx < ARRAY_SIZE(priv->l); ch_idx++) size += tsc2046_adc_group_set_layout(priv, ch_idx, ch_idx); priv->tx = devm_kzalloc(&priv->spi->dev, size, GFP_KERNEL); From 0e33d15f1dce9e3a80a970ea7f0b27837168aeca Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 12 Jan 2022 22:00:36 +0200 Subject: [PATCH 007/773] iio: adc: ad7124: fix mask used for setting AIN_BUFP & AIN_BUFM bits According to page 90 of the datasheet [1], AIN_BUFP is bit 6 and AIN_BUFM is bit 5 of the CONFIG_0 -> CONFIG_7 registers. Fix the mask used for setting these bits. [1]: https://www.analog.com/media/en/technical-documentation/data-sheets/ad7124-8.pdf Fixes: 0eaecea6e487 ("iio: adc: ad7124: Add buffered input support") Signed-off-by: Cosmin Tanislav Link: https://lore.kernel.org/r/20220112200036.694490-1-cosmin.tanislav@analog.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7124.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index bc2cfa5f9592..b400bbe291aa 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -76,7 +76,7 @@ #define AD7124_CONFIG_REF_SEL(x) FIELD_PREP(AD7124_CONFIG_REF_SEL_MSK, x) #define AD7124_CONFIG_PGA_MSK GENMASK(2, 0) #define AD7124_CONFIG_PGA(x) FIELD_PREP(AD7124_CONFIG_PGA_MSK, x) -#define AD7124_CONFIG_IN_BUFF_MSK GENMASK(7, 6) +#define AD7124_CONFIG_IN_BUFF_MSK GENMASK(6, 5) #define AD7124_CONFIG_IN_BUFF(x) FIELD_PREP(AD7124_CONFIG_IN_BUFF_MSK, x) /* AD7124_FILTER_X */ From b0e85f95e30d4d2dc22ea123a30dba36406879a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Fri, 14 Jan 2022 14:26:08 +0100 Subject: [PATCH 008/773] iio:imu:adis16480: fix buffering for devices with no burst mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trigger handler defined in the driver assumes that burst mode is being used. Hence, for devices that do not support it, we have to use the adis library default trigger implementation. Tested-by: Julia Pineda Fixes: 941f130881fa9 ("iio: adis16480: support burst read function") Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20220114132608.241-1-nuno.sa@analog.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis16480.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index ed129321a14d..f9b4540db1f4 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -1403,6 +1403,7 @@ static int adis16480_probe(struct spi_device *spi) { const struct spi_device_id *id = spi_get_device_id(spi); const struct adis_data *adis16480_data; + irq_handler_t trigger_handler = NULL; struct iio_dev *indio_dev; struct adis16480 *st; int ret; @@ -1474,8 +1475,12 @@ static int adis16480_probe(struct spi_device *spi) st->clk_freq = st->chip_info->int_clk; } + /* Only use our trigger handler if burst mode is supported */ + if (adis16480_data->burst_len) + trigger_handler = adis16480_trigger_handler; + ret = devm_adis_setup_buffer_and_trigger(&st->adis, indio_dev, - adis16480_trigger_handler); + trigger_handler); if (ret) return ret; From ccbed9d8d2a5351d8238f2d3f0741c9a3176f752 Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Mon, 20 Dec 2021 13:51:43 +0100 Subject: [PATCH 009/773] iio: accel: fxls8962af: add padding to regmap for SPI Add missing don't care padding between address and data for SPI transfers Fixes: a3e0b51884ee ("iio: accel: add support for FXLS8962AF/FXLS8964AF accelerometers") Signed-off-by: Sean Nyekjaer Link: https://lore.kernel.org/r/20211220125144.3630539-1-sean@geanix.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/accel/fxls8962af-core.c | 12 ++++++++++-- drivers/iio/accel/fxls8962af-i2c.c | 2 +- drivers/iio/accel/fxls8962af-spi.c | 2 +- drivers/iio/accel/fxls8962af.h | 3 ++- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/iio/accel/fxls8962af-core.c b/drivers/iio/accel/fxls8962af-core.c index 32989d91b982..f7fd9e046588 100644 --- a/drivers/iio/accel/fxls8962af-core.c +++ b/drivers/iio/accel/fxls8962af-core.c @@ -173,12 +173,20 @@ struct fxls8962af_data { u16 upper_thres; }; -const struct regmap_config fxls8962af_regmap_conf = { +const struct regmap_config fxls8962af_i2c_regmap_conf = { .reg_bits = 8, .val_bits = 8, .max_register = FXLS8962AF_MAX_REG, }; -EXPORT_SYMBOL_GPL(fxls8962af_regmap_conf); +EXPORT_SYMBOL_GPL(fxls8962af_i2c_regmap_conf); + +const struct regmap_config fxls8962af_spi_regmap_conf = { + .reg_bits = 8, + .pad_bits = 8, + .val_bits = 8, + .max_register = FXLS8962AF_MAX_REG, +}; +EXPORT_SYMBOL_GPL(fxls8962af_spi_regmap_conf); enum { fxls8962af_idx_x, diff --git a/drivers/iio/accel/fxls8962af-i2c.c b/drivers/iio/accel/fxls8962af-i2c.c index cfb004b20455..6bde9891effb 100644 --- a/drivers/iio/accel/fxls8962af-i2c.c +++ b/drivers/iio/accel/fxls8962af-i2c.c @@ -18,7 +18,7 @@ static int fxls8962af_probe(struct i2c_client *client) { struct regmap *regmap; - regmap = devm_regmap_init_i2c(client, &fxls8962af_regmap_conf); + regmap = devm_regmap_init_i2c(client, &fxls8962af_i2c_regmap_conf); if (IS_ERR(regmap)) { dev_err(&client->dev, "Failed to initialize i2c regmap\n"); return PTR_ERR(regmap); diff --git a/drivers/iio/accel/fxls8962af-spi.c b/drivers/iio/accel/fxls8962af-spi.c index 57108d3d480b..6f4dff3238d3 100644 --- a/drivers/iio/accel/fxls8962af-spi.c +++ b/drivers/iio/accel/fxls8962af-spi.c @@ -18,7 +18,7 @@ static int fxls8962af_probe(struct spi_device *spi) { struct regmap *regmap; - regmap = devm_regmap_init_spi(spi, &fxls8962af_regmap_conf); + regmap = devm_regmap_init_spi(spi, &fxls8962af_spi_regmap_conf); if (IS_ERR(regmap)) { dev_err(&spi->dev, "Failed to initialize spi regmap\n"); return PTR_ERR(regmap); diff --git a/drivers/iio/accel/fxls8962af.h b/drivers/iio/accel/fxls8962af.h index b67572c3ef06..9cbe98c3ba9a 100644 --- a/drivers/iio/accel/fxls8962af.h +++ b/drivers/iio/accel/fxls8962af.h @@ -17,6 +17,7 @@ int fxls8962af_core_probe(struct device *dev, struct regmap *regmap, int irq); int fxls8962af_core_remove(struct device *dev); extern const struct dev_pm_ops fxls8962af_pm_ops; -extern const struct regmap_config fxls8962af_regmap_conf; +extern const struct regmap_config fxls8962af_i2c_regmap_conf; +extern const struct regmap_config fxls8962af_spi_regmap_conf; #endif /* _FXLS8962AF_H_ */ From 305325688ff924c52a6e12f92235a89536e022cf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 20 Jan 2022 17:02:47 -0600 Subject: [PATCH 010/773] NTB/msi: Use struct_size() helper in devm_kzalloc() Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. Also, address the following sparse warnings: drivers/ntb/msi.c:46:23: warning: using sizeof on a flexible structure Link: https://github.com/KSPP/linux/issues/174 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- drivers/ntb/msi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c index dd683cb58d09..6295e55ef85e 100644 --- a/drivers/ntb/msi.c +++ b/drivers/ntb/msi.c @@ -33,7 +33,6 @@ int ntb_msi_init(struct ntb_dev *ntb, { phys_addr_t mw_phys_addr; resource_size_t mw_size; - size_t struct_size; int peer_widx; int peers; int ret; @@ -43,9 +42,8 @@ int ntb_msi_init(struct ntb_dev *ntb, if (peers <= 0) return -EINVAL; - struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers; - - ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL); + ntb->msi = devm_kzalloc(&ntb->dev, struct_size(ntb->msi, peer_mws, peers), + GFP_KERNEL); if (!ntb->msi) return -ENOMEM; From 6596a0229541270fb8d38d989f91b78838e5e9da Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 19 Jan 2022 10:22:53 +0100 Subject: [PATCH 011/773] xfrm: fix MTU regression Commit 749439bfac6e1a2932c582e2699f91d329658196 ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") breaks PMTU for xfrm. A Packet Too Big ICMPv6 message received in response to an ESP packet will prevent all further communication through the tunnel if the reported MTU minus the ESP overhead is smaller than 1280. E.g. in a case of a tunnel-mode ESP with sha256/aes the overhead is 92 bytes. Receiving a PTB with MTU of 1371 or less will result in all further packets in the tunnel dropped. A ping through the tunnel fails with "ping: sendmsg: Invalid argument". Apparently the MTU on the xfrm route is smaller than 1280 and fails the check inside ip6_setup_cork() added by 749439bf. We found this by debugging USGv6/ipv6ready failures. Failing tests are: "Phase-2 Interoperability Test Scenario IPsec" / 5.3.11 and 5.4.11 (Tunnel Mode: Fragmentation). Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") attempted to fix this but caused another regression in TCP MSS calculations and had to be reverted. The patch below fixes the situation by dropping the MTU check and instead checking for the underflows described in the 749439bf commit message. Signed-off-by: Jiri Bohac Fixes: 749439bfac6e ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..4ff110214dea 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1408,8 +1408,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1471,8 +1469,6 @@ static int __ip6_append_data(struct sock *sk, fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + @@ -1480,6 +1476,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu < fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ From ba1b71b008e97fd747845ff3a818420b11bbe830 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 30 Dec 2021 07:27:51 +0000 Subject: [PATCH 012/773] mtd: rawnand: ingenic: Fix missing put_device in ingenic_ecc_get If of_find_device_by_node() succeeds, ingenic_ecc_get() doesn't have a corresponding put_device(). Thus add put_device() to fix the exception handling. Fixes: 15de8c6efd0e ("mtd: rawnand: ingenic: Separate top-level and SoC specific code") Signed-off-by: Miaoqian Lin Reviewed-by: Paul Cercueil Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20211230072751.21622-1-linmq006@gmail.com --- drivers/mtd/nand/raw/ingenic/ingenic_ecc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c b/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c index efe0ffe4f1ab..9054559e52dd 100644 --- a/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c +++ b/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c @@ -68,9 +68,14 @@ static struct ingenic_ecc *ingenic_ecc_get(struct device_node *np) struct ingenic_ecc *ecc; pdev = of_find_device_by_node(np); - if (!pdev || !platform_get_drvdata(pdev)) + if (!pdev) return ERR_PTR(-EPROBE_DEFER); + if (!platform_get_drvdata(pdev)) { + put_device(&pdev->dev); + return ERR_PTR(-EPROBE_DEFER); + } + ecc = platform_get_drvdata(pdev); clk_prepare_enable(ecc->clk); From 5c23b3f965bc9ee696bf2ed4bdc54d339dd9a455 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Mon, 3 Jan 2022 03:03:15 +0000 Subject: [PATCH 013/773] mtd: rawnand: qcom: Fix clock sequencing in qcom_nandc_probe() Interacting with a NAND chip on an IPQ6018 I found that the qcomsmem NAND partition parser was returning -EPROBE_DEFER waiting for the main smem driver to load. This caused the board to reset. Playing about with the probe() function shows that the problem lies in the core clock being switched off before the nandc_unalloc() routine has completed. If we look at how qcom_nandc_remove() tears down allocated resources we see the expected order is qcom_nandc_unalloc(nandc); clk_disable_unprepare(nandc->aon_clk); clk_disable_unprepare(nandc->core_clk); dma_unmap_resource(&pdev->dev, nandc->base_dma, resource_size(res), DMA_BIDIRECTIONAL, 0); Tweaking probe() to both bring up and tear-down in that order removes the reset if we end up deferring elsewhere. Fixes: c76b78d8ec05 ("mtd: nand: Qualcomm NAND controller driver") Signed-off-by: Bryan O'Donoghue Reviewed-by: Manivannan Sadhasivam Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220103030316.58301-2-bryan.odonoghue@linaro.org --- drivers/mtd/nand/raw/qcom_nandc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index 7c6efa3b6255..1a77542c6d67 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -2,7 +2,6 @@ /* * Copyright (c) 2016, The Linux Foundation. All rights reserved. */ - #include #include #include @@ -3073,10 +3072,6 @@ static int qcom_nandc_probe(struct platform_device *pdev) if (dma_mapping_error(dev, nandc->base_dma)) return -ENXIO; - ret = qcom_nandc_alloc(nandc); - if (ret) - goto err_nandc_alloc; - ret = clk_prepare_enable(nandc->core_clk); if (ret) goto err_core_clk; @@ -3085,6 +3080,10 @@ static int qcom_nandc_probe(struct platform_device *pdev) if (ret) goto err_aon_clk; + ret = qcom_nandc_alloc(nandc); + if (ret) + goto err_nandc_alloc; + ret = qcom_nandc_setup(nandc); if (ret) goto err_setup; @@ -3096,15 +3095,14 @@ static int qcom_nandc_probe(struct platform_device *pdev) return 0; err_setup: + qcom_nandc_unalloc(nandc); +err_nandc_alloc: clk_disable_unprepare(nandc->aon_clk); err_aon_clk: clk_disable_unprepare(nandc->core_clk); err_core_clk: - qcom_nandc_unalloc(nandc); -err_nandc_alloc: dma_unmap_resource(dev, res->start, resource_size(res), DMA_BIDIRECTIONAL, 0); - return ret; } From 079e6bdb2b1cc1da8b5c602229db782732668ae7 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Mon, 3 Jan 2022 03:03:16 +0000 Subject: [PATCH 014/773] mtd: parsers: qcom: Don't print error message on -EPROBE_DEFER Its possible for the main smem driver to not be loaded by the time we come along to parse the smem partition description but, this is a perfectly normal thing. No need to print out an error message in this case. Signed-off-by: Bryan O'Donoghue Reviewed-by: Manivannan Sadhasivam Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220103030316.58301-3-bryan.odonoghue@linaro.org --- drivers/mtd/parsers/qcomsmempart.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/parsers/qcomsmempart.c b/drivers/mtd/parsers/qcomsmempart.c index 06a818cd2433..b2a57fe8479f 100644 --- a/drivers/mtd/parsers/qcomsmempart.c +++ b/drivers/mtd/parsers/qcomsmempart.c @@ -75,7 +75,8 @@ static int parse_qcomsmem_part(struct mtd_info *mtd, pr_debug("Parsing partition table info from SMEM\n"); ptable = qcom_smem_get(SMEM_APPS, SMEM_AARM_PARTITION_TABLE, &len); if (IS_ERR(ptable)) { - pr_err("Error reading partition table header\n"); + if (PTR_ERR(ptable) != -EPROBE_DEFER) + pr_err("Error reading partition table header\n"); return PTR_ERR(ptable); } From 65d003cca335cabc0160d3cd7daa689eaa9dd3cd Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Sun, 16 Jan 2022 04:22:10 +0100 Subject: [PATCH 015/773] mtd: parsers: qcom: Fix kernel panic on skipped partition In the event of a skipped partition (case when the entry name is empty) the kernel panics in the cleanup function as the name entry is NULL. Rework the parser logic by first checking the real partition number and then allocate the space and set the data for the valid partitions. The logic was also fundamentally wrong as with a skipped partition, the parts number returned was incorrect by not decreasing it for the skipped partitions. Fixes: 803eb124e1a6 ("mtd: parsers: Add Qcom SMEM parser") Signed-off-by: Ansuel Smith Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220116032211.9728-1-ansuelsmth@gmail.com --- drivers/mtd/parsers/qcomsmempart.c | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/mtd/parsers/qcomsmempart.c b/drivers/mtd/parsers/qcomsmempart.c index b2a57fe8479f..469d466f568f 100644 --- a/drivers/mtd/parsers/qcomsmempart.c +++ b/drivers/mtd/parsers/qcomsmempart.c @@ -58,11 +58,11 @@ static int parse_qcomsmem_part(struct mtd_info *mtd, const struct mtd_partition **pparts, struct mtd_part_parser_data *data) { + size_t len = SMEM_FLASH_PTABLE_HDR_LEN; + int ret, i, j, tmpparts, numparts = 0; struct smem_flash_pentry *pentry; struct smem_flash_ptable *ptable; - size_t len = SMEM_FLASH_PTABLE_HDR_LEN; struct mtd_partition *parts; - int ret, i, numparts; char *name, *c; if (IS_ENABLED(CONFIG_MTD_SPI_NOR_USE_4K_SECTORS) @@ -88,8 +88,8 @@ static int parse_qcomsmem_part(struct mtd_info *mtd, } /* Ensure that # of partitions is less than the max we have allocated */ - numparts = le32_to_cpu(ptable->numparts); - if (numparts > SMEM_FLASH_PTABLE_MAX_PARTS_V4) { + tmpparts = le32_to_cpu(ptable->numparts); + if (tmpparts > SMEM_FLASH_PTABLE_MAX_PARTS_V4) { pr_err("Partition numbers exceed the max limit\n"); return -EINVAL; } @@ -117,11 +117,17 @@ static int parse_qcomsmem_part(struct mtd_info *mtd, return PTR_ERR(ptable); } + for (i = 0; i < tmpparts; i++) { + pentry = &ptable->pentry[i]; + if (pentry->name[0] != '\0') + numparts++; + } + parts = kcalloc(numparts, sizeof(*parts), GFP_KERNEL); if (!parts) return -ENOMEM; - for (i = 0; i < numparts; i++) { + for (i = 0, j = 0; i < tmpparts; i++) { pentry = &ptable->pentry[i]; if (pentry->name[0] == '\0') continue; @@ -136,24 +142,25 @@ static int parse_qcomsmem_part(struct mtd_info *mtd, for (c = name; *c != '\0'; c++) *c = tolower(*c); - parts[i].name = name; - parts[i].offset = le32_to_cpu(pentry->offset) * mtd->erasesize; - parts[i].mask_flags = pentry->attr; - parts[i].size = le32_to_cpu(pentry->length) * mtd->erasesize; + parts[j].name = name; + parts[j].offset = le32_to_cpu(pentry->offset) * mtd->erasesize; + parts[j].mask_flags = pentry->attr; + parts[j].size = le32_to_cpu(pentry->length) * mtd->erasesize; pr_debug("%d: %s offs=0x%08x size=0x%08x attr:0x%08x\n", i, pentry->name, le32_to_cpu(pentry->offset), le32_to_cpu(pentry->length), pentry->attr); + j++; } pr_debug("SMEM partition table found: ver: %d len: %d\n", - le32_to_cpu(ptable->version), numparts); + le32_to_cpu(ptable->version), tmpparts); *pparts = parts; return numparts; out_free_parts: - while (--i >= 0) - kfree(parts[i].name); + while (--j >= 0) + kfree(parts[j].name); kfree(parts); *pparts = NULL; From 3dd8ba961b9356c4113b96541c752c73d98fef70 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Sun, 16 Jan 2022 04:22:11 +0100 Subject: [PATCH 016/773] mtd: parsers: qcom: Fix missing free for pparts in cleanup Mtdpart doesn't free pparts when a cleanup function is declared. Add missing free for pparts in cleanup function for smem to fix the leak. Fixes: 10f3b4d79958 ("mtd: parsers: qcom: Fix leaking of partition name") Signed-off-by: Ansuel Smith Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220116032211.9728-2-ansuelsmth@gmail.com --- drivers/mtd/parsers/qcomsmempart.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mtd/parsers/qcomsmempart.c b/drivers/mtd/parsers/qcomsmempart.c index 469d466f568f..4311b89d8df0 100644 --- a/drivers/mtd/parsers/qcomsmempart.c +++ b/drivers/mtd/parsers/qcomsmempart.c @@ -174,6 +174,8 @@ static void parse_qcomsmem_cleanup(const struct mtd_partition *pparts, for (i = 0; i < nr_parts; i++) kfree(pparts[i].name); + + kfree(pparts); } static const struct of_device_id qcomsmem_of_match_table[] = { From 4cd335dae3cf25412427938d8abbaf04d46e63b5 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 18 Jan 2022 14:35:25 +0200 Subject: [PATCH 017/773] mtd: rawnand: omap2: Prevent invalid configuration and build error We need to select MEMORY as well otherwise OMAP_GPMC will not be built. For simplicity let's select MEMORY and OMAP_GPMC unconditionally as this driver depends on OMAP_GPMC driver and uses symbols from there. Fixes: dbcb124acebd ("mtd: rawnand: omap2: Select GPMC device driver for ARCH_K3") Reported-by: kernel test robot Signed-off-by: Roger Quadros Reviewed-by: Krzysztof Kozlowski Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220118123525.8020-1-rogerq@kernel.org --- drivers/mtd/nand/raw/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index 20408b7db540..d986ab4e4c35 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -42,7 +42,8 @@ config MTD_NAND_OMAP2 tristate "OMAP2, OMAP3, OMAP4 and Keystone NAND controller" depends on ARCH_OMAP2PLUS || ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST depends on HAS_IOMEM - select OMAP_GPMC if ARCH_K3 + select MEMORY + select OMAP_GPMC help Support for NAND flash on Texas Instruments OMAP2, OMAP3, OMAP4 and Keystone platforms. From 3e3765875b1b8864898603768fd5c93eeb552211 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Jan 2022 14:55:05 +0300 Subject: [PATCH 018/773] mtd: phram: Prevent divide by zero bug in phram_setup() The problem is that "erasesize" is a uint64_t type so it might be non-zero but the lower 32 bits are zero so when it's truncated, "(uint32_t)erasesize", then that value is zero. This leads to a divide by zero bug. Avoid the bug by delaying the divide until after we have validated that "erasesize" is non-zero and within the uint32_t range. Fixes: dc2b3e5cbc80 ("mtd: phram: use div_u64_rem to stop overwrite len in phram_setup") Signed-off-by: Dan Carpenter Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220121115505.GI1978@kadam --- drivers/mtd/devices/phram.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c index 6ed6c51fac69..d503821a3e60 100644 --- a/drivers/mtd/devices/phram.c +++ b/drivers/mtd/devices/phram.c @@ -264,16 +264,20 @@ static int phram_setup(const char *val) } } - if (erasesize) - div_u64_rem(len, (uint32_t)erasesize, &rem); - if (len == 0 || erasesize == 0 || erasesize > len - || erasesize > UINT_MAX || rem) { + || erasesize > UINT_MAX) { parse_err("illegal erasesize or len\n"); ret = -EINVAL; goto error; } + div_u64_rem(len, (uint32_t)erasesize, &rem); + if (rem) { + parse_err("len is not multiple of erasesize\n"); + ret = -EINVAL; + goto error; + } + ret = register_device(name, start, len, (uint32_t)erasesize); if (ret) goto error; From c1aca3080e382886e2e58e809787441984a2f89b Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:13 -0800 Subject: [PATCH 019/773] xfrm: Check if_id in xfrm_migrate This patch enables distinguishing SAs and SPs based on if_id during the xfrm_migrate flow. This ensures support for xfrm interfaces throughout the SA/SP lifecycle. When there are multiple existing SPs with the same direction, the same xfrm_selector and different endpoint addresses, xfrm_migrate might fail with ENODATA. Specifically, the code path for performing xfrm_migrate is: Stage 1: find policy to migrate with xfrm_migrate_policy_find(sel, dir, type, net) Stage 2: find and update state(s) with xfrm_migrate_state_find(mp, net) Stage 3: update endpoint address(es) of template(s) with xfrm_policy_migrate(pol, m, num_migrate) Currently "Stage 1" always returns the first xfrm_policy that matches, and "Stage 3" looks for the xfrm_tmpl that matches the old endpoint address. Thus if there are multiple xfrm_policy with same selector, direction, type and net, "Stage 1" might rertun a wrong xfrm_policy and "Stage 3" will fail with ENODATA because it cannot find a xfrm_tmpl with the matching endpoint address. The fix is to allow userspace to pass an if_id and add if_id to the matching rule in Stage 1 and Stage 2 since if_id is a unique ID for xfrm_policy and xfrm_state. For compatibility, if_id will only be checked if the attribute is set. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1668886 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 14 ++++++++------ net/xfrm/xfrm_state.c | 7 ++++++- net/xfrm/xfrm_user.c | 6 +++++- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..743dd1da506e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,14 +1681,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index de24a7d474df..9bf52a09b5ff 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2623,7 +2623,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL); + kma ? &k : NULL, net, NULL, 0); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 04d1ce9b510f..882526159d3a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4256,7 +4256,7 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, - u8 dir, u8 type, struct net *net) + u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; @@ -4265,7 +4265,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; @@ -4277,7 +4278,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * if ((pol->priority >= priority) && ret) break; - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; @@ -4393,7 +4395,7 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, u32 if_id) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4412,14 +4414,14 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { + if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { - if ((x = xfrm_migrate_state_find(mp, net))) { + if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ca6bee18346d..b0eeb0aef493 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1606,7 +1606,8 @@ out: return NULL; } -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net) +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; @@ -1622,6 +1623,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n continue; if (m->reqid && x->props.reqid != m->reqid) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, @@ -1637,6 +1640,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n if (x->props.mode != m->mode || x->id.proto != m->proto) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8cd6c8129004..a4fb596e87af 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2608,6 +2608,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + u32 if_id = 0; if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2632,7 +2633,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOMEM; } - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap); + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); kfree(encap); From e03c3bba351f99ad932e8f06baa9da1afc418e02 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:14 -0800 Subject: [PATCH 020/773] xfrm: Fix xfrm migrate issues when address family changes xfrm_migrate cannot handle address family change of an xfrm_state. The symptons are the xfrm_state will be migrated to a wrong address, and sending as well as receiving packets wil be broken. This commit fixes it by breaking the original xfrm_state_clone method into two steps so as to update the props.family before running xfrm_init_state. As the result, xfrm_state's inner mode, outer mode, type and IP header length in xfrm_state_migrate can be updated with the new address family. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1885354 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b0eeb0aef493..1ba6fbfe8cdb 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1579,9 +1579,6 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); - if (xfrm_init_state(x) < 0) - goto error; - x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; @@ -1668,6 +1665,11 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, if (!xc) return NULL; + xc->props.family = m->new_family; + + if (xfrm_init_state(xc) < 0) + goto error; + memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); From 9161f365c91614e5a3f5c6dcc44c3b1b33bc59c0 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Tue, 25 Jan 2022 09:16:19 +0100 Subject: [PATCH 021/773] mtd: rawnand: gpmi: don't leak PM reference in error path If gpmi_nfc_apply_timings() fails, the PM runtime usage counter must be dropped. Reported-by: Pavel Machek Fixes: f53d4c109a66 ("mtd: rawnand: gpmi: Add ERR007117 protection for nfc_apply_timings") Signed-off-by: Christian Eggers Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220125081619.6286-1-ceggers@arri.de --- drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c index 1b64c5a5140d..ded4df473928 100644 --- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c @@ -2285,7 +2285,7 @@ static int gpmi_nfc_exec_op(struct nand_chip *chip, this->hw.must_apply_timings = false; ret = gpmi_nfc_apply_timings(this); if (ret) - return ret; + goto out_pm; } dev_dbg(this->dev, "%s: %d instructions\n", __func__, op->ninstrs); @@ -2414,6 +2414,7 @@ unmap: this->bch = false; +out_pm: pm_runtime_mark_last_busy(this->dev); pm_runtime_put_autosuspend(this->dev); From 31eeb6b09f4053f32a30ce9fbcdfca31f713028d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Jan 2022 17:57:01 +0000 Subject: [PATCH 022/773] arm64: dts: juno: Remove GICv2m dma-range Although it is painstakingly honest to describe all 3 PCI windows in "dma-ranges", it misses the the subtle distinction that the window for the GICv2m range is normally programmed for Device memory attributes rather than Normal Cacheable like the DRAM windows. Since MMU-401 only offers stage 2 translation, this means that when the PCI SMMU is enabled, accesses through that IPA range unexpectedly lose coherency if mapped as cacheable at the SMMU, due to the attribute combining rules. Since an extra 256KB is neither here nor there when we still have 10GB worth of usable address space, rather than attempting to describe and cope with this detail let's just remove the offending range. If the SMMU is not used then it makes no difference anyway. Link: https://lore.kernel.org/r/856c3f7192c6c3ce545ba67462f2ce9c86ed6b0c.1643046936.git.robin.murphy@arm.com Fixes: 4ac4d146cb63 ("arm64: dts: juno: Describe PCI dma-ranges") Reported-by: Anders Roxell Acked-by: Liviu Dudau Signed-off-by: Robin Murphy Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/juno-base.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi index 6288e104a089..a2635b14da30 100644 --- a/arch/arm64/boot/dts/arm/juno-base.dtsi +++ b/arch/arm64/boot/dts/arm/juno-base.dtsi @@ -543,8 +543,7 @@ <0x02000000 0x00 0x50000000 0x00 0x50000000 0x0 0x08000000>, <0x42000000 0x40 0x00000000 0x40 0x00000000 0x1 0x00000000>; /* Standard AXI Translation entries as programmed by EDK2 */ - dma-ranges = <0x02000000 0x0 0x2c1c0000 0x0 0x2c1c0000 0x0 0x00040000>, - <0x02000000 0x0 0x80000000 0x0 0x80000000 0x0 0x80000000>, + dma-ranges = <0x02000000 0x0 0x80000000 0x0 0x80000000 0x0 0x80000000>, <0x43000000 0x8 0x00000000 0x8 0x00000000 0x2 0x00000000>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; From a6d95c5a628a09be129f25d5663a7e9db8261f51 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: [PATCH 023/773] Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slighly larger. The desired formula for the MSS is: MSS = (MTU - ESP_overhead) - IP header - TCP header However, the above commit clamps the (MTU - ESP_overhead) to a minimum of 1280, turning the formula into MSS = max(MTU - ESP overhead, 1280) - IP header - TCP header With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++------------ 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 743dd1da506e..76aa6f11a540 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 851f542928a3..e1b1d080e908 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -671,7 +671,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bb2c407b46b..7591160edce1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -707,7 +707,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1ba6fbfe8cdb..b749935152ba 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2579,7 +2579,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2610,17 +2610,7 @@ u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(__xfrm_state_mtu); - -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) -{ - mtu = __xfrm_state_mtu(x, mtu); - - if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) - return IPV6_MIN_MTU; - - return mtu; -} +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { From 22d7ee32f1fb6d51ef8cf657c6685ca04745755a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 23 Dec 2021 17:46:50 +0300 Subject: [PATCH 024/773] gpu: host1x: Fix hang on Tegra186+ Tegra186+ hangs if host1x hardware is disabled at a kernel boot time because we touch hardware before runtime PM is resumed. Move sync point assignment initialization to the RPM-resume callback. Older SoCs were unaffected because they skip that sync point initialization. Tested-by: Jon Hunter # T186 Reported-by: Jon Hunter # T186 Fixes: 6b6776e2ab8a ("gpu: host1x: Add initial runtime PM and OPP support") Signed-off-by: Dmitry Osipenko Signed-off-by: Thierry Reding --- drivers/gpu/host1x/syncpt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c index e08e331e46ae..83a16e46146b 100644 --- a/drivers/gpu/host1x/syncpt.c +++ b/drivers/gpu/host1x/syncpt.c @@ -137,8 +137,15 @@ void host1x_syncpt_restore(struct host1x *host) struct host1x_syncpt *sp_base = host->syncpt; unsigned int i; - for (i = 0; i < host1x_syncpt_nb_pts(host); i++) + for (i = 0; i < host1x_syncpt_nb_pts(host); i++) { + /* + * Unassign syncpt from channels for purposes of Tegra186 + * syncpoint protection. This prevents any channel from + * accessing it until it is reassigned. + */ + host1x_hw_syncpt_assign_to_channel(host, sp_base + i, NULL); host1x_hw_syncpt_restore(host, sp_base + i); + } for (i = 0; i < host1x_syncpt_nb_bases(host); i++) host1x_hw_syncpt_restore_wait_base(host, sp_base + i); @@ -352,13 +359,6 @@ int host1x_syncpt_init(struct host1x *host) for (i = 0; i < host->info->nb_pts; i++) { syncpt[i].id = i; syncpt[i].host = host; - - /* - * Unassign syncpt from channels for purposes of Tegra186 - * syncpoint protection. This prevents any channel from - * accessing it until it is reassigned. - */ - host1x_hw_syncpt_assign_to_channel(host, &syncpt[i], NULL); } for (i = 0; i < host->info->nb_bases; i++) From ebea268ea583ba4970df425dfef8c8e21d0a4e12 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Thu, 13 Jan 2022 14:31:52 +0000 Subject: [PATCH 025/773] arm64: tegra: Disable ISO SMMU for Tegra194 Commit e762232f9466 ("arm64: tegra: Add ISO SMMU controller for Tegra194") added the ISO SMMU for display devices on Tegra194. The SMMU is enabled by default but not hooked up to the display controllers yet because we do not have a way to pass frame-buffer memory from the bootloader to the kernel. However, even though the SMMU is not hooked up to the display controllers' SMMU faults are being seen if a display is connected. Therefore, keep the ISO SMMU disabled by default for now. Fixes: e762232f9466 ("arm64: tegra: Add ISO SMMU controller for Tegra194") Signed-off-by: Jon Hunter Signed-off-by: Thierry Reding --- arch/arm64/boot/dts/nvidia/tegra194.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi index 3c4acfca459d..6005e422c968 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi @@ -1585,7 +1585,7 @@ #iommu-cells = <1>; nvidia,memory-controller = <&mc>; - status = "okay"; + status = "disabled"; }; smmu: iommu@12000000 { From d5081bf5dcfb1cb83fb538708b0ac07a10a79cc4 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 27 Jan 2022 13:31:12 -0700 Subject: [PATCH 026/773] ntb: intel: fix port config status offset for SPR The field offset for port configuration status on SPR has been changed to bit 14 from ICX where it resides at bit 12. By chance link status detection continued to work on SPR. This is due to bit 12 being a configuration bit which is in sync with the status bit. Fix this by checking for a SPR device and checking correct status bit. Fixes: 26bfe3d0b227 ("ntb: intel: Add Icelake (gen4) support for Intel NTB") Tested-by: Jerry Dai Signed-off-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/hw/intel/ntb_hw_gen4.c | 17 ++++++++++++++++- drivers/ntb/hw/intel/ntb_hw_gen4.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c index fede05151f69..4081fc538ff4 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -168,6 +168,18 @@ static enum ntb_topo gen4_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd) return NTB_TOPO_NONE; } +static enum ntb_topo spr_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd) +{ + switch (ppd & SPR_PPD_TOPO_MASK) { + case SPR_PPD_TOPO_B2B_USD: + return NTB_TOPO_B2B_USD; + case SPR_PPD_TOPO_B2B_DSD: + return NTB_TOPO_B2B_DSD; + } + + return NTB_TOPO_NONE; +} + int gen4_init_dev(struct intel_ntb_dev *ndev) { struct pci_dev *pdev = ndev->ntb.pdev; @@ -183,7 +195,10 @@ int gen4_init_dev(struct intel_ntb_dev *ndev) } ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); - ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); + if (pdev_is_ICX(pdev)) + ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); + else if (pdev_is_SPR(pdev)) + ndev->ntb.topo = spr_ppd_topo(ndev, ppd1); dev_dbg(&pdev->dev, "ppd %#x topo %s\n", ppd1, ntb_topo_string(ndev->ntb.topo)); if (ndev->ntb.topo == NTB_TOPO_NONE) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.h b/drivers/ntb/hw/intel/ntb_hw_gen4.h index 3fcd3fdce9ed..f91323eaf5ce 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.h +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.h @@ -49,10 +49,14 @@ #define GEN4_PPD_CLEAR_TRN 0x0001 #define GEN4_PPD_LINKTRN 0x0008 #define GEN4_PPD_CONN_MASK 0x0300 +#define SPR_PPD_CONN_MASK 0x0700 #define GEN4_PPD_CONN_B2B 0x0200 #define GEN4_PPD_DEV_MASK 0x1000 #define GEN4_PPD_DEV_DSD 0x1000 #define GEN4_PPD_DEV_USD 0x0000 +#define SPR_PPD_DEV_MASK 0x4000 +#define SPR_PPD_DEV_DSD 0x4000 +#define SPR_PPD_DEV_USD 0x0000 #define GEN4_LINK_CTRL_LINK_DISABLE 0x0010 #define GEN4_SLOTSTS 0xb05a @@ -62,6 +66,10 @@ #define GEN4_PPD_TOPO_B2B_USD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_USD) #define GEN4_PPD_TOPO_B2B_DSD (GEN4_PPD_CONN_B2B | GEN4_PPD_DEV_DSD) +#define SPR_PPD_TOPO_MASK (SPR_PPD_CONN_MASK | SPR_PPD_DEV_MASK) +#define SPR_PPD_TOPO_B2B_USD (GEN4_PPD_CONN_B2B | SPR_PPD_DEV_USD) +#define SPR_PPD_TOPO_B2B_DSD (GEN4_PPD_CONN_B2B | SPR_PPD_DEV_DSD) + #define GEN4_DB_COUNT 32 #define GEN4_DB_LINK 32 #define GEN4_DB_LINK_BIT BIT_ULL(GEN4_DB_LINK) @@ -112,4 +120,12 @@ static inline int pdev_is_ICX(struct pci_dev *pdev) return 0; } +static inline int pdev_is_SPR(struct pci_dev *pdev) +{ + if (pdev_is_gen4(pdev) && + pdev->revision > PCI_DEVICE_REVISION_ICX_MAX) + return 1; + return 0; +} + #endif From ad02776cf8d083e28b1ca4d93d8b1949668c27cc Mon Sep 17 00:00:00 2001 From: Peter Geis Date: Thu, 27 Jan 2022 19:38:05 -0500 Subject: [PATCH 027/773] arm64: dts: rockchip: fix Quartz64-A ddr regulator voltage The Quartz64 Model A uses a voltage divider to ensure ddr voltage is within specification from the default regulator configuration. Adjusting this voltage is detrimental, and currently causes the ddr voltage to be about 0.8v. Remove the min and max voltage setpoints for the ddr regulator. Fixes: b33a22a1e7c4 ("arm64: dts: rockchip: add basic dts for Pine64 Quartz64-A") Signed-off-by: Peter Geis Link: https://lore.kernel.org/r/20220128003809.3291407-2-pgwipeout@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts index 166399b7f13f..d9eb92d59099 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts @@ -285,8 +285,6 @@ vcc_ddr: DCDC_REG3 { regulator-always-on; regulator-boot-on; - regulator-min-microvolt = <1100000>; - regulator-max-microvolt = <1100000>; regulator-initial-mode = <0x2>; regulator-name = "vcc_ddr"; regulator-state-mem { From 62966cbdda8a92f82d966a45aa671e788b2006f7 Mon Sep 17 00:00:00 2001 From: Jakob Unterwurzacher Date: Wed, 19 Jan 2022 14:49:48 +0100 Subject: [PATCH 028/773] arm64: dts: rockchip: fix rk3399-puma eMMC HS400 signal integrity There are signal integrity issues running the eMMC at 200MHz on Puma RK3399-Q7. Similar to the work-around found for RK3399 Gru boards, lowering the frequency to 100MHz made the eMMC much more stable, so let's lower the frequency to 100MHz. It might be possible to run at 150MHz as on RK3399 Gru boards but only 100MHz was extensively tested. Cc: Quentin Schulz Signed-off-by: Jakob Unterwurzacher Signed-off-by: Quentin Schulz Link: https://lore.kernel.org/r/20220119134948.1444965-1-quentin.schulz@theobroma-systems.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index 002ece51c3ba..08fa00364b42 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -439,6 +439,12 @@ }; &sdhci { + /* + * Signal integrity isn't great at 200MHz but 100MHz has proven stable + * enough. + */ + max-frequency = <100000000>; + bus-width = <8>; mmc-hs400-1_8v; mmc-hs400-enhanced-strobe; From 8fd9415042826c7609c588e5ef45f3e84237785f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 29 Jan 2022 18:54:29 +0100 Subject: [PATCH 029/773] arm64: dts: rockchip: align pl330 node name with dtschema Fixes dtbs_check warnings like: dmac@ff240000: $nodename:0: 'dmac@ff240000' does not match '^dma-controller(@.*)?$' Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220129175429.298836-1-krzysztof.kozlowski@canonical.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30.dtsi | 2 +- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/px30.dtsi b/arch/arm64/boot/dts/rockchip/px30.dtsi index f972704dfe7a..56dfbb2e2fa6 100644 --- a/arch/arm64/boot/dts/rockchip/px30.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30.dtsi @@ -711,7 +711,7 @@ clock-names = "pclk", "timer"; }; - dmac: dmac@ff240000 { + dmac: dma-controller@ff240000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xff240000 0x0 0x4000>; interrupts = , diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 39db0b85b4da..b822533dc7f1 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -489,7 +489,7 @@ status = "disabled"; }; - dmac: dmac@ff1f0000 { + dmac: dma-controller@ff1f0000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x0 0xff1f0000 0x0 0x4000>; interrupts = , From a1cba0e2deeb0fe5200602658ed1078a714f8667 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 5 Jan 2022 02:09:20 +0500 Subject: [PATCH 030/773] iio: frequency: admv1013: remove the always true condition unsigned int variable is always greater than or equal to zero. Make the if condition simple. Signed-off-by: Muhammad Usama Anjum Fixes: da35a7b526d9 ("iio: frequency: admv1013: add support for ADMV1013") Link: https://lore.kernel.org/r/YdS3gJYtECMaDDjA@debian-BULLSEYE-live-builder-AMD64 Signed-off-by: Jonathan Cameron --- drivers/iio/frequency/admv1013.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/frequency/admv1013.c b/drivers/iio/frequency/admv1013.c index 6cdeb50143af..3f3c478e9baa 100644 --- a/drivers/iio/frequency/admv1013.c +++ b/drivers/iio/frequency/admv1013.c @@ -348,7 +348,7 @@ static int admv1013_update_mixer_vgate(struct admv1013_state *st) vcm = regulator_get_voltage(st->reg); - if (vcm >= 0 && vcm < 1800000) + if (vcm < 1800000) mixer_vgate = (2389 * vcm / 1000000 + 8100) / 100; else if (vcm > 1800000 && vcm < 2600000) mixer_vgate = (2375 * vcm / 1000000 + 125) / 100; From e0a2e37f303828d030a83f33ffe14b36cb88d563 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 29 Jan 2022 09:32:47 +0100 Subject: [PATCH 031/773] iio: adc: men_z188_adc: Fix a resource leak in an error handling path If iio_device_register() fails, a previous ioremap() is left unbalanced. Update the error handling path and add the missing iounmap() call, as already done in the remove function. Fixes: 74aeac4da66f ("iio: adc: Add MEN 16z188 ADC driver") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/320fc777863880247c2aff4a9d1a54ba69abf080.1643445149.git.christophe.jaillet@wanadoo.fr Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/men_z188_adc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/men_z188_adc.c b/drivers/iio/adc/men_z188_adc.c index 42ea8bc7e780..adc5ceaef8c9 100644 --- a/drivers/iio/adc/men_z188_adc.c +++ b/drivers/iio/adc/men_z188_adc.c @@ -103,6 +103,7 @@ static int men_z188_probe(struct mcb_device *dev, struct z188_adc *adc; struct iio_dev *indio_dev; struct resource *mem; + int ret; indio_dev = devm_iio_device_alloc(&dev->dev, sizeof(struct z188_adc)); if (!indio_dev) @@ -128,8 +129,14 @@ static int men_z188_probe(struct mcb_device *dev, adc->mem = mem; mcb_set_drvdata(dev, indio_dev); - return iio_device_register(indio_dev); + ret = iio_device_register(indio_dev); + if (ret) + goto err_unmap; + return 0; + +err_unmap: + iounmap(adc->base); err: mcb_release_mem(mem); return -ENXIO; From e7a3290d330e3f8cf06a3cad455c49e89183137b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jan 2022 12:34:56 -0800 Subject: [PATCH 032/773] iio: addac: ad74413r: Do not reference negative array offsets Instead of aiming rx_buf at an invalid array-boundary-crossing location, just skip the first increment. Seen when building with -Warray-bounds: drivers/iio/addac/ad74413r.c: In function 'ad74413r_update_scan_mode': drivers/iio/addac/ad74413r.c:843:22: warning: array subscript -4 is below array bounds of 'u8[16]' { aka 'unsigned char[16]'} [-Warray-bounds] 843 | u8 *rx_buf = &st->adc_samples_buf.rx_buf[-1 * AD74413R_FRAME_SIZE]; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/iio/addac/ad74413r.c:84:20: note: while referencing 'rx_buf' 84 | u8 rx_buf[AD74413R_FRAME_SIZE * AD74413R_CHANNEL_MAX]; | ^~~~~~ Cc: Lars-Peter Clausen Cc: Michael Hennerich Cc: Jonathan Cameron Cc: linux-iio@vger.kernel.org Fixes: fea251b6a5db ("iio: addac: add AD74413R driver") Reviewed-by: Cosmin Tanislav Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220112203456.3950884-1-keescook@chromium.org Signed-off-by: Jonathan Cameron --- drivers/iio/addac/ad74413r.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iio/addac/ad74413r.c b/drivers/iio/addac/ad74413r.c index 5271073bb74e..aba9a643a4ca 100644 --- a/drivers/iio/addac/ad74413r.c +++ b/drivers/iio/addac/ad74413r.c @@ -840,7 +840,7 @@ static int ad74413r_update_scan_mode(struct iio_dev *indio_dev, { struct ad74413r_state *st = iio_priv(indio_dev); struct spi_transfer *xfer = st->adc_samples_xfer; - u8 *rx_buf = &st->adc_samples_buf.rx_buf[-1 * AD74413R_FRAME_SIZE]; + u8 *rx_buf = st->adc_samples_buf.rx_buf; u8 *tx_buf = st->adc_samples_tx_buf; unsigned int channel; int ret = -EINVAL; @@ -894,9 +894,10 @@ static int ad74413r_update_scan_mode(struct iio_dev *indio_dev, spi_message_add_tail(xfer, &st->adc_samples_msg); - xfer++; tx_buf += AD74413R_FRAME_SIZE; - rx_buf += AD74413R_FRAME_SIZE; + if (xfer != st->adc_samples_xfer) + rx_buf += AD74413R_FRAME_SIZE; + xfer++; } xfer->rx_buf = rx_buf; From 8a3e4a5614adab30b6e0eb7dbd8ef737aabbb8eb Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Tue, 11 Jan 2022 09:47:01 +0200 Subject: [PATCH 033/773] iio: addac: ad74413r: use ngpio size when iterating over mask ngpio is the actual number of GPIOs handled by the GPIO chip, as opposed to the max number of GPIOs. Fixes: fea251b6a5db ("iio: addac: add AD74413R driver") Signed-off-by: Cosmin Tanislav Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220111074703.3677392-1-cosmin.tanislav@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/addac/ad74413r.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/addac/ad74413r.c b/drivers/iio/addac/ad74413r.c index aba9a643a4ca..4f4dce42f20e 100644 --- a/drivers/iio/addac/ad74413r.c +++ b/drivers/iio/addac/ad74413r.c @@ -288,7 +288,7 @@ static void ad74413r_gpio_set_multiple(struct gpio_chip *chip, unsigned int offset = 0; int ret; - for_each_set_bit_from(offset, mask, AD74413R_CHANNEL_MAX) { + for_each_set_bit_from(offset, mask, chip->ngpio) { unsigned int real_offset = st->gpo_gpio_offsets[offset]; ret = ad74413r_set_gpo_config(st, real_offset, @@ -334,7 +334,7 @@ static int ad74413r_gpio_get_multiple(struct gpio_chip *chip, if (ret) return ret; - for_each_set_bit_from(offset, mask, AD74413R_CHANNEL_MAX) { + for_each_set_bit_from(offset, mask, chip->ngpio) { unsigned int real_offset = st->comp_gpio_offsets[offset]; if (val & BIT(real_offset)) From 4165456fe6b7719c0c9626022a7d96c381c94e6f Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Tue, 11 Jan 2022 09:47:02 +0200 Subject: [PATCH 034/773] iio: addac: ad74413r: correct comparator gpio getters mask usage The value of the GPIOs is currently altered using offsets rather than masks. Make use of __assign_bit and the BIT macro to turn the offsets into masks. Fixes: fea251b6a5db ("iio: addac: add AD74413R driver") Signed-off-by: Cosmin Tanislav Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220111074703.3677392-2-cosmin.tanislav@analog.com Signed-off-by: Jonathan Cameron --- drivers/iio/addac/ad74413r.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/iio/addac/ad74413r.c b/drivers/iio/addac/ad74413r.c index 4f4dce42f20e..acd230a6af35 100644 --- a/drivers/iio/addac/ad74413r.c +++ b/drivers/iio/addac/ad74413r.c @@ -134,7 +134,6 @@ struct ad74413r_state { #define AD74413R_CH_EN_MASK(x) BIT(x) #define AD74413R_REG_DIN_COMP_OUT 0x25 -#define AD74413R_DIN_COMP_OUT_SHIFT_X(x) x #define AD74413R_REG_ADC_RESULT_X(x) (0x26 + (x)) #define AD74413R_ADC_RESULT_MAX GENMASK(15, 0) @@ -316,7 +315,7 @@ static int ad74413r_gpio_get(struct gpio_chip *chip, unsigned int offset) if (ret) return ret; - status &= AD74413R_DIN_COMP_OUT_SHIFT_X(real_offset); + status &= BIT(real_offset); return status ? 1 : 0; } @@ -337,8 +336,7 @@ static int ad74413r_gpio_get_multiple(struct gpio_chip *chip, for_each_set_bit_from(offset, mask, chip->ngpio) { unsigned int real_offset = st->comp_gpio_offsets[offset]; - if (val & BIT(real_offset)) - *bits |= offset; + __assign_bit(offset, bits, val & BIT(real_offset)); } return ret; From 632fe0bb8c5b9c06ec961f575ee42a6fff5eceeb Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 6 Jan 2022 11:23:09 +0000 Subject: [PATCH 035/773] iio: Fix error handling for PM The pm_runtime_enable will increase power disable depth. If the probe fails, we should use pm_runtime_disable() to balance pm_runtime_enable(). In the PM Runtime docs: Drivers in ->remove() callback should undo the runtime PM changes done in ->probe(). Usually this means calling pm_runtime_disable(), pm_runtime_dont_use_autosuspend() etc. We should do this in error handling. Fix this problem for the following drivers: bmc150, bmg160, kmx61, kxcj-1013, mma9551, mma9553. Fixes: 7d0ead5c3f00 ("iio: Reconcile operation order between iio_register/unregister and pm functions") Signed-off-by: Miaoqian Lin Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220106112309.16879-1-linmq006@gmail.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/accel/bmc150-accel-core.c | 5 ++++- drivers/iio/accel/kxcjk-1013.c | 5 ++++- drivers/iio/accel/mma9551.c | 5 ++++- drivers/iio/accel/mma9553.c | 5 ++++- drivers/iio/gyro/bmg160_core.c | 5 ++++- drivers/iio/imu/kmx61.c | 5 ++++- drivers/iio/magnetometer/bmc150_magn.c | 5 +++-- 7 files changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c index e6081dd0a880..d11f668016a6 100644 --- a/drivers/iio/accel/bmc150-accel-core.c +++ b/drivers/iio/accel/bmc150-accel-core.c @@ -1783,11 +1783,14 @@ int bmc150_accel_core_probe(struct device *dev, struct regmap *regmap, int irq, ret = iio_device_register(indio_dev); if (ret < 0) { dev_err(dev, "Unable to register iio device\n"); - goto err_trigger_unregister; + goto err_pm_cleanup; } return 0; +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(dev); + pm_runtime_disable(dev); err_trigger_unregister: bmc150_accel_unregister_triggers(data, BMC150_ACCEL_TRIGGERS - 1); err_buffer_cleanup: diff --git a/drivers/iio/accel/kxcjk-1013.c b/drivers/iio/accel/kxcjk-1013.c index 0fe570316848..ac74cdcd2bc8 100644 --- a/drivers/iio/accel/kxcjk-1013.c +++ b/drivers/iio/accel/kxcjk-1013.c @@ -1590,11 +1590,14 @@ static int kxcjk1013_probe(struct i2c_client *client, ret = iio_device_register(indio_dev); if (ret < 0) { dev_err(&client->dev, "unable to register iio device\n"); - goto err_buffer_cleanup; + goto err_pm_cleanup; } return 0; +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(&client->dev); + pm_runtime_disable(&client->dev); err_buffer_cleanup: iio_triggered_buffer_cleanup(indio_dev); err_trigger_unregister: diff --git a/drivers/iio/accel/mma9551.c b/drivers/iio/accel/mma9551.c index 4c359fb05480..c53a3398b14c 100644 --- a/drivers/iio/accel/mma9551.c +++ b/drivers/iio/accel/mma9551.c @@ -495,11 +495,14 @@ static int mma9551_probe(struct i2c_client *client, ret = iio_device_register(indio_dev); if (ret < 0) { dev_err(&client->dev, "unable to register iio device\n"); - goto out_poweroff; + goto err_pm_cleanup; } return 0; +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(&client->dev); + pm_runtime_disable(&client->dev); out_poweroff: mma9551_set_device_state(client, false); diff --git a/drivers/iio/accel/mma9553.c b/drivers/iio/accel/mma9553.c index 0570ab1cc064..5ff6bc70708b 100644 --- a/drivers/iio/accel/mma9553.c +++ b/drivers/iio/accel/mma9553.c @@ -1134,12 +1134,15 @@ static int mma9553_probe(struct i2c_client *client, ret = iio_device_register(indio_dev); if (ret < 0) { dev_err(&client->dev, "unable to register iio device\n"); - goto out_poweroff; + goto err_pm_cleanup; } dev_dbg(&indio_dev->dev, "Registered device %s\n", name); return 0; +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(&client->dev); + pm_runtime_disable(&client->dev); out_poweroff: mma9551_set_device_state(client, false); return ret; diff --git a/drivers/iio/gyro/bmg160_core.c b/drivers/iio/gyro/bmg160_core.c index 17b939a367ad..81a6d09788bd 100644 --- a/drivers/iio/gyro/bmg160_core.c +++ b/drivers/iio/gyro/bmg160_core.c @@ -1188,11 +1188,14 @@ int bmg160_core_probe(struct device *dev, struct regmap *regmap, int irq, ret = iio_device_register(indio_dev); if (ret < 0) { dev_err(dev, "unable to register iio device\n"); - goto err_buffer_cleanup; + goto err_pm_cleanup; } return 0; +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(dev); + pm_runtime_disable(dev); err_buffer_cleanup: iio_triggered_buffer_cleanup(indio_dev); err_trigger_unregister: diff --git a/drivers/iio/imu/kmx61.c b/drivers/iio/imu/kmx61.c index 1dabfd615dab..f89724481df9 100644 --- a/drivers/iio/imu/kmx61.c +++ b/drivers/iio/imu/kmx61.c @@ -1385,7 +1385,7 @@ static int kmx61_probe(struct i2c_client *client, ret = iio_device_register(data->acc_indio_dev); if (ret < 0) { dev_err(&client->dev, "Failed to register acc iio device\n"); - goto err_buffer_cleanup_mag; + goto err_pm_cleanup; } ret = iio_device_register(data->mag_indio_dev); @@ -1398,6 +1398,9 @@ static int kmx61_probe(struct i2c_client *client, err_iio_unregister_acc: iio_device_unregister(data->acc_indio_dev); +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(&client->dev); + pm_runtime_disable(&client->dev); err_buffer_cleanup_mag: if (client->irq > 0) iio_triggered_buffer_cleanup(data->mag_indio_dev); diff --git a/drivers/iio/magnetometer/bmc150_magn.c b/drivers/iio/magnetometer/bmc150_magn.c index f96f53175349..3d4d21f979fa 100644 --- a/drivers/iio/magnetometer/bmc150_magn.c +++ b/drivers/iio/magnetometer/bmc150_magn.c @@ -962,13 +962,14 @@ int bmc150_magn_probe(struct device *dev, struct regmap *regmap, ret = iio_device_register(indio_dev); if (ret < 0) { dev_err(dev, "unable to register iio device\n"); - goto err_disable_runtime_pm; + goto err_pm_cleanup; } dev_dbg(dev, "Registered device %s\n", name); return 0; -err_disable_runtime_pm: +err_pm_cleanup: + pm_runtime_dont_use_autosuspend(dev); pm_runtime_disable(dev); err_buffer_cleanup: iio_triggered_buffer_cleanup(indio_dev); From fa4300f060e5c4ca670b705f1e9b93685ad30c5b Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Thu, 27 Jan 2022 13:26:43 -0600 Subject: [PATCH 036/773] of: unittest: update text of expected warnings The text of various warning messages triggered by unittest has changed. Update the text of expected warnings to match. The expected vs actual warnings are most easily seen by filtering the boot console messages with the of_unittest_expect program at https://github.com/frowand/dt_tools.git. The filter prefixes problem lines with '***', and prefixes lines that match expected errors with 'ok '. All other lines are prefixed with ' '. Unrelated lines have been deleted in the following examples. The mismatch appears as: -> ### dt-test ### start of unittest - you will see error messages OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1 ** of_unittest_expect WARNING - not found ---> OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found -1 OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1 ** of_unittest_expect WARNING - not found ---> OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found -1 OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found 1 ** of_unittest_expect WARNING - not found ---> OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found -1 platform testcase-data:testcase-device2: error -ENXIO: IRQ index 0 not found ** of_unittest_expect WARNING - not found ---> platform testcase-data:testcase-device2: IRQ index 0 not found -> ### dt-test ### end of unittest - 254 passed, 0 failed ** EXPECT statistics: ** ** EXPECT found : 42 ** EXPECT not found : 4 With this commit applied, the mismatch is resolved: -> ### dt-test ### start of unittest - you will see error messages ok OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1 ok OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1 ok OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found 1 ok platform testcase-data:testcase-device2: error -ENXIO: IRQ index 0 not found -> ### dt-test ### end of unittest - 254 passed, 0 failed ** EXPECT statistics: ** ** EXPECT found : 46 ** EXPECT not found : 0 Fixes: 2043727c2882 ("driver core: platform: Make use of the helper function dev_err_probe()") Fixes: 94a4950a4acf ("of: base: Fix phandle argument length mismatch error message") Signed-off-by: Frank Rowand Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220127192643.2534941-1-frowand.list@gmail.com --- drivers/of/unittest.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 70992103c07d..2c2fb161b572 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -513,24 +513,24 @@ static void __init of_unittest_parse_phandle_with_args(void) memset(&args, 0, sizeof(args)); EXPECT_BEGIN(KERN_INFO, - "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found -1"); + "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1"); rc = of_parse_phandle_with_args(np, "phandle-list-bad-args", "#phandle-cells", 1, &args); EXPECT_END(KERN_INFO, - "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found -1"); + "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1"); unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc); EXPECT_BEGIN(KERN_INFO, - "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found -1"); + "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1"); rc = of_count_phandle_with_args(np, "phandle-list-bad-args", "#phandle-cells"); EXPECT_END(KERN_INFO, - "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found -1"); + "OF: /testcase-data/phandle-tests/consumer-a: #phandle-cells = 3 found 1"); unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc); } @@ -670,12 +670,12 @@ static void __init of_unittest_parse_phandle_with_args_map(void) memset(&args, 0, sizeof(args)); EXPECT_BEGIN(KERN_INFO, - "OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found -1"); + "OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found 1"); rc = of_parse_phandle_with_args_map(np, "phandle-list-bad-args", "phandle", 1, &args); EXPECT_END(KERN_INFO, - "OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found -1"); + "OF: /testcase-data/phandle-tests/consumer-b: #phandle-cells = 2 found 1"); unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc); } @@ -1257,12 +1257,12 @@ static void __init of_unittest_platform_populate(void) unittest(pdev, "device 2 creation failed\n"); EXPECT_BEGIN(KERN_INFO, - "platform testcase-data:testcase-device2: IRQ index 0 not found"); + "platform testcase-data:testcase-device2: error -ENXIO: IRQ index 0 not found"); irq = platform_get_irq(pdev, 0); EXPECT_END(KERN_INFO, - "platform testcase-data:testcase-device2: IRQ index 0 not found"); + "platform testcase-data:testcase-device2: error -ENXIO: IRQ index 0 not found"); unittest(irq < 0 && irq != -EPROBE_DEFER, "device parsing error failed - %d\n", irq); From 36415a7964711822e63695ea67fede63979054d9 Mon Sep 17 00:00:00 2001 From: david regan Date: Wed, 26 Jan 2022 23:43:44 +0100 Subject: [PATCH 037/773] mtd: rawnand: brcmnand: Fixed incorrect sub-page ECC status The brcmnand driver contains a bug in which if a page (example 2k byte) is read from the parallel/ONFI NAND and within that page a subpage (512 byte) has correctable errors which is followed by a subpage with uncorrectable errors, the page read will return the wrong status of correctable (as opposed to the actual status of uncorrectable.) The bug is in function brcmnand_read_by_pio where there is a check for uncorrectable bits which will be preempted if a previous status for correctable bits is detected. The fix is to stop checking for bad bits only if we already have a bad bits status. Fixes: 27c5b17cd1b1 ("mtd: nand: add NAND driver "library" for Broadcom STB NAND controller") Signed-off-by: david regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/trinity-478e0c09-9134-40e8-8f8c-31c371225eda-1643237024774@3c-app-mailcom-lxa02 --- drivers/mtd/nand/raw/brcmnand/brcmnand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c index f75929783b94..aee78f5f4f15 100644 --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c @@ -2106,7 +2106,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip, mtd->oobsize / trans, host->hwcfg.sector_size_1k); - if (!ret) { + if (ret != -EBADMSG) { *err_addr = brcmnand_get_uncorrecc_addr(ctrl); if (*err_addr) From 0fd4dcb607ce29110d6c0b481a98c4ff3d300551 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Dec 2021 08:20:58 -0800 Subject: [PATCH 038/773] arm64: dts: qcom: sm8350: Correct UFS symbol clocks The introduction of '9a61f813fcc8 ("clk: qcom: regmap-mux: fix parent clock lookup")' broke UFS support on SM8350. The cause for this is that the symbol clocks have a specified rate in the "freq-table-hz" table in the UFS node, which causes the UFS code to request a rate change, for which the "bi_tcxo" happens to provide the closest rate. Prior to the change in regmap-mux it was determined (incorrectly) that no change was needed and everything worked. The rates of 75 and 300MHz matches the documentation for the symbol clocks, but we don't represent the parent clocks today. So let's mimic the configuration found in other platforms, by omitting the rate for the symbol clocks as well to avoid the rate change. While at it also fill in the dummy symbol clocks that was dropped from the GCC driver as it was upstreamed. Fixes: 59c7cf814783 ("arm64: dts: qcom: sm8350: Add UFS nodes") Signed-off-by: Bjorn Andersson Reviewed-by: Vinod Koul Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20211222162058.3418902-1-bjorn.andersson@linaro.org --- arch/arm64/boot/dts/qcom/sm8350.dtsi | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sm8350.dtsi b/arch/arm64/boot/dts/qcom/sm8350.dtsi index 53b39e718fb6..4b19744bcfb3 100644 --- a/arch/arm64/boot/dts/qcom/sm8350.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8350.dtsi @@ -35,6 +35,24 @@ clock-frequency = <32000>; #clock-cells = <0>; }; + + ufs_phy_rx_symbol_0_clk: ufs-phy-rx-symbol-0 { + compatible = "fixed-clock"; + clock-frequency = <1000>; + #clock-cells = <0>; + }; + + ufs_phy_rx_symbol_1_clk: ufs-phy-rx-symbol-1 { + compatible = "fixed-clock"; + clock-frequency = <1000>; + #clock-cells = <0>; + }; + + ufs_phy_tx_symbol_0_clk: ufs-phy-tx-symbol-0 { + compatible = "fixed-clock"; + clock-frequency = <1000>; + #clock-cells = <0>; + }; }; cpus { @@ -603,9 +621,9 @@ <0>, <0>, <0>, - <0>, - <0>, - <0>, + <&ufs_phy_rx_symbol_0_clk>, + <&ufs_phy_rx_symbol_1_clk>, + <&ufs_phy_tx_symbol_0_clk>, <0>, <0>; }; @@ -1923,8 +1941,8 @@ <75000000 300000000>, <0 0>, <0 0>, - <75000000 300000000>, - <75000000 300000000>; + <0 0>, + <0 0>; status = "disabled"; }; From 197769fede5824d218f1f13f7620243015801d81 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Sat, 22 Jan 2022 11:29:31 -0500 Subject: [PATCH 039/773] arm64: dts: qcom: sm8450: enable GCC_USB3_0_CLKREF_EN for usb USB doesn't work at all without this clock enabled. This fixes USB when not using clk_ignore_unused. Fixes: 19fd04fb9247 ("arm64: dts: qcom: sm8450: Add usb nodes") Signed-off-by: Jonathan Marek Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220122162932.7686-1-jonathan@marek.ca --- arch/arm64/boot/dts/qcom/sm8450.dtsi | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi index 10c25ad2d0c7..f68475fb7a32 100644 --- a/arch/arm64/boot/dts/qcom/sm8450.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi @@ -1072,9 +1072,10 @@ <&gcc GCC_USB30_PRIM_MASTER_CLK>, <&gcc GCC_AGGRE_USB3_PRIM_AXI_CLK>, <&gcc GCC_USB30_PRIM_MOCK_UTMI_CLK>, - <&gcc GCC_USB30_PRIM_SLEEP_CLK>; + <&gcc GCC_USB30_PRIM_SLEEP_CLK>, + <&gcc GCC_USB3_0_CLKREF_EN>; clock-names = "cfg_noc", "core", "iface", "mock_utmi", - "sleep"; + "sleep", "xo"; assigned-clocks = <&gcc GCC_USB30_PRIM_MOCK_UTMI_CLK>, <&gcc GCC_USB30_PRIM_MASTER_CLK>; From 7baa00bef336254e2cea5d4b064afe6430a05309 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Sat, 22 Jan 2022 11:29:32 -0500 Subject: [PATCH 040/773] arm64: dts: qcom: sm8450: fix apps_smmu interrupts Update interrupts in apps_smmu to match downstream. This is fixes apps_smmu failing to probe when running at EL2 (expects 96 context interrupts) Fixes: 892d5395396d ("arm64: dts: qcom: sm8450: add smmu nodes") Signed-off-by: Jonathan Marek Reviewed-by: Vinod Koul Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220122162932.7686-2-jonathan@marek.ca --- arch/arm64/boot/dts/qcom/sm8450.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/sm8450.dtsi b/arch/arm64/boot/dts/qcom/sm8450.dtsi index f68475fb7a32..02b97e838c47 100644 --- a/arch/arm64/boot/dts/qcom/sm8450.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8450.dtsi @@ -726,7 +726,7 @@ compatible = "qcom,sm8450-smmu-500", "arm,mmu-500"; reg = <0 0x15000000 0 0x100000>; #iommu-cells = <2>; - #global-interrupts = <2>; + #global-interrupts = <1>; interrupts = , , , @@ -813,6 +813,7 @@ , , , + , , , , From da5462a4dc446e7028e6d150a4b02684d1d3ea81 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 17 Jan 2022 18:15:22 -0800 Subject: [PATCH 041/773] power: supply: fix table problem in sysfs-class-power MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a bottom table border to complete the table format and prevent a documentation build warning. Documentation/ABI/testing/sysfs-class-power:459: WARNING: Malformed table. No bottom table border found. Fixes: 1b0b6cc8030d0 ("power: supply: add charge_behaviour attributes") Signed-off-by: Randy Dunlap Cc: Thomas Weißschuh Cc: Hans de Goede Cc: Sebastian Reichel Cc: linux-pm@vger.kernel.org Cc: Stephen Rothwell Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index fde21d900420..859501366777 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -468,6 +468,7 @@ Description: auto: Charge normally, respect thresholds inhibit-charge: Do not charge while AC is attached force-discharge: Force discharge while AC is attached + ================ ==================================== What: /sys/class/power_supply//technology Date: May 2007 From 2b56a9a28a6bf09890a43161402948ed62963f36 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 5 Jan 2022 10:37:00 +0800 Subject: [PATCH 042/773] power: supply: core: fix application of sizeof to pointer The coccinelle check report: ./drivers/power/supply/cw2015_battery.c:692:12-18: ERROR: application of sizeof to pointer Reported-by: Abaci Robot Fixes: 25fd330370ac ("power: supply_core: Pass pointer to battery info") Signed-off-by: Yang Li Reviewed-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/cw2015_battery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c index 0c87ad0dbf71..728e2a6cc9c3 100644 --- a/drivers/power/supply/cw2015_battery.c +++ b/drivers/power/supply/cw2015_battery.c @@ -689,7 +689,7 @@ static int cw_bat_probe(struct i2c_client *client) if (ret) { /* Allocate an empty battery */ cw_bat->battery = devm_kzalloc(&client->dev, - sizeof(cw_bat->battery), + sizeof(*cw_bat->battery), GFP_KERNEL); if (!cw_bat->battery) return -ENOMEM; From 564778d7b1ea465f9487eedeece7527a033549c5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Feb 2022 15:56:26 +0000 Subject: [PATCH 043/773] ASoC: ops: Fix stereo change notifications in snd_soc_put_volsw() When writing out a stereo control we discard the change notification from the first channel, meaning that events are only generated based on changes to the second channel. Ensure that we report a change if either channel has changed. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220201155629.120510-2-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 9833611b83d1..2ce73e3391f4 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -308,7 +308,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, unsigned int sign_bit = mc->sign_bit; unsigned int mask = (1 << fls(max)) - 1; unsigned int invert = mc->invert; - int err; + int err, ret; bool type_2r = false; unsigned int val2 = 0; unsigned int val, val_mask; @@ -350,12 +350,18 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, err = snd_soc_component_update_bits(component, reg, val_mask, val); if (err < 0) return err; + ret = err; - if (type_2r) + if (type_2r) { err = snd_soc_component_update_bits(component, reg2, val_mask, - val2); + val2); + /* Don't discard any error code or drop change flag */ + if (ret == 0 || err < 0) { + ret = err; + } + } - return err; + return ret; } EXPORT_SYMBOL_GPL(snd_soc_put_volsw); From 7f3d90a3519680dfa23e750f80bfdefc0f5eda4a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Feb 2022 15:56:27 +0000 Subject: [PATCH 044/773] ASoC: ops: Fix stereo change notifications in snd_soc_put_volsw_sx() When writing out a stereo control we discard the change notification from the first channel, meaning that events are only generated based on changes to the second channel. Ensure that we report a change if either channel has changed. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220201155629.120510-3-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 2ce73e3391f4..4987c58cd59a 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -427,6 +427,7 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, int min = mc->min; unsigned int mask = (1U << (fls(min + max) - 1)) - 1; int err = 0; + int ret; unsigned int val, val_mask; if (ucontrol->value.integer.value[0] < 0) @@ -443,6 +444,7 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, err = snd_soc_component_update_bits(component, reg, val_mask, val); if (err < 0) return err; + ret = err; if (snd_soc_volsw_is_stereo(mc)) { unsigned int val2; @@ -453,6 +455,11 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, err = snd_soc_component_update_bits(component, reg2, val_mask, val2); + + /* Don't discard any error code or drop change flag */ + if (ret == 0 || err < 0) { + ret = err; + } } return err; } From 650204ded3703b5817bd4b6a77fa47d333c4f902 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Feb 2022 15:56:28 +0000 Subject: [PATCH 045/773] ASoC: ops: Fix stereo change notifications in snd_soc_put_volsw_range() When writing out a stereo control we discard the change notification from the first channel, meaning that events are only generated based on changes to the second channel. Ensure that we report a change if either channel has changed. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220201155629.120510-4-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 4987c58cd59a..2e57f814bcc2 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -519,7 +519,7 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, unsigned int mask = (1 << fls(max)) - 1; unsigned int invert = mc->invert; unsigned int val, val_mask; - int ret; + int err, ret; if (invert) val = (max - ucontrol->value.integer.value[0]) & mask; @@ -528,9 +528,10 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, val_mask = mask << shift; val = val << shift; - ret = snd_soc_component_update_bits(component, reg, val_mask, val); - if (ret < 0) - return ret; + err = snd_soc_component_update_bits(component, reg, val_mask, val); + if (err < 0) + return err; + ret = err; if (snd_soc_volsw_is_stereo(mc)) { if (invert) @@ -540,8 +541,12 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, val_mask = mask << shift; val = val << shift; - ret = snd_soc_component_update_bits(component, rreg, val_mask, + err = snd_soc_component_update_bits(component, rreg, val_mask, val); + /* Don't discard any error code or drop change flag */ + if (ret == 0 || err < 0) { + ret = err; + } } return ret; From 2b7c46369f09c358164d31d17e5695185403185e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Feb 2022 15:56:29 +0000 Subject: [PATCH 046/773] ASoC: ops: Fix stereo change notifications in snd_soc_put_xr_sx() When writing out a stereo control we discard the change notification from the first channel, meaning that events are only generated based on changes to the second channel. Ensure that we report a change if either channel has changed. Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220201155629.120510-5-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 2e57f814bcc2..03ea9591fb16 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -895,6 +895,7 @@ int snd_soc_put_xr_sx(struct snd_kcontrol *kcontrol, unsigned long mask = (1UL<nbits)-1; long max = mc->max; long val = ucontrol->value.integer.value[0]; + int ret = 0; unsigned int i; if (val < mc->min || val > mc->max) @@ -909,9 +910,11 @@ int snd_soc_put_xr_sx(struct snd_kcontrol *kcontrol, regmask, regval); if (err < 0) return err; + if (err > 0) + ret = err; } - return 0; + return ret; } EXPORT_SYMBOL_GPL(snd_soc_put_xr_sx); From 7fa5c33d043160eba3be9fb8e21588dff2a467c7 Mon Sep 17 00:00:00 2001 From: V sujith kumar Reddy Date: Tue, 1 Feb 2022 02:02:15 +0530 Subject: [PATCH 047/773] ASoC: amd: acp: Set gpio_spkr_en to None for max speaker amplifer in machine driver Maxim codec driver already enabling/disabling spk_en_gpio in form of sd_mode gpio hence remove such gpio access control from machine driver to avoid conflict Signed-off-by: V sujith kumar Reddy Link: https://lore.kernel.org/r/20220131203225.1418648-1-vsujithkumar.reddy@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-mach.h | 1 - sound/soc/amd/acp/acp-sof-mach.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sound/soc/amd/acp/acp-mach.h b/sound/soc/amd/acp/acp-mach.h index fd6299844ebe..c855f50d6b34 100644 --- a/sound/soc/amd/acp/acp-mach.h +++ b/sound/soc/amd/acp/acp-mach.h @@ -21,7 +21,6 @@ #include #define EN_SPKR_GPIO_GB 0x11F -#define EN_SPKR_GPIO_NK 0x146 #define EN_SPKR_GPIO_NONE -EINVAL enum be_id { diff --git a/sound/soc/amd/acp/acp-sof-mach.c b/sound/soc/amd/acp/acp-sof-mach.c index 07de46142655..4cc431e54fe1 100644 --- a/sound/soc/amd/acp/acp-sof-mach.c +++ b/sound/soc/amd/acp/acp-sof-mach.c @@ -37,7 +37,7 @@ static struct acp_card_drvdata sof_rt5682_max_data = { .hs_codec_id = RT5682, .amp_codec_id = MAX98360A, .dmic_codec_id = DMIC, - .gpio_spkr_en = EN_SPKR_GPIO_NK, + .gpio_spkr_en = EN_SPKR_GPIO_NONE, }; static struct acp_card_drvdata sof_rt5682s_max_data = { @@ -47,7 +47,7 @@ static struct acp_card_drvdata sof_rt5682s_max_data = { .hs_codec_id = RT5682S, .amp_codec_id = MAX98360A, .dmic_codec_id = DMIC, - .gpio_spkr_en = EN_SPKR_GPIO_NK, + .gpio_spkr_en = EN_SPKR_GPIO_NONE, }; static const struct snd_kcontrol_new acp_controls[] = { From 9b818634f8e7e0bca3386a50b1fada7a49036408 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 2 Feb 2022 14:19:25 -0700 Subject: [PATCH 048/773] MAINTAINERS: update mailing list address for NTB subsystem NTB mailing list is moving from linux-ntb@googlegroups.com to ntb@lists.linux.dev in order to get better archive and lore support. Update all entries in MAINTAINERS. Signed-off-by: Dave Jiang Signed-off-by: Jon Mason --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..0a7edab46ed1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13674,7 +13674,7 @@ F: scripts/nsdeps NTB AMD DRIVER M: Sanjay R Mehta M: Shyam Sundar S K -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported F: drivers/ntb/hw/amd/ @@ -13682,7 +13682,7 @@ NTB DRIVER CORE M: Jon Mason M: Dave Jiang M: Allen Hubbe -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported W: https://github.com/jonmason/ntb/wiki T: git git://github.com/jonmason/ntb.git @@ -13694,13 +13694,13 @@ F: tools/testing/selftests/ntb/ NTB IDT DRIVER M: Serge Semin -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported F: drivers/ntb/hw/idt/ NTB INTEL DRIVER M: Dave Jiang -L: linux-ntb@googlegroups.com +L: ntb@lists.linux.dev S: Supported W: https://github.com/davejiang/linux/wiki T: git https://github.com/davejiang/linux.git From 6d0d95a1c2b07270870e7be16575c513c29af3f1 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Tue, 1 Feb 2022 07:51:57 +0100 Subject: [PATCH 049/773] xfrm: fix the if_id check in changelink if_id will be always 0, because it was not yet initialized. Fixes: 8dce43919566 ("xfrm: interface with if_id 0 should return error") Reported-by: Pavel Machek Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 57448fc519fc..4e3c62d1ad9e 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -673,12 +673,12 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = xi->net; struct xfrm_if_parms p = {}; + xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } - xfrmi_netlink_parms(data, &p); xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); From 868d7618d75f2cac23c2be6ca8d55ae1380c36d1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 3 Feb 2022 11:33:02 +0100 Subject: [PATCH 050/773] platform/x86: thinkpad_acpi: Add dual-fan quirk for T15g (2nd gen) The ThinkPad T15g Gen 2 has 2 fan, add a TPACPI_FAN_2CTL quirk entry for it to the fan_quirk_table[] so that both fans can be controllerd. Reported-and-tested-by: David Dreschner Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220203103302.49401-1-hdegoede@redhat.com --- drivers/platform/x86/thinkpad_acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index bd045486b933..3424b080db77 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -8703,6 +8703,7 @@ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_Q_LNV3('N', '4', '0', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (4nd gen) */ TPACPI_Q_LNV3('N', '3', '0', TPACPI_FAN_2CTL), /* P15 (1st gen) / P15v (1st gen) */ TPACPI_Q_LNV3('N', '3', '2', TPACPI_FAN_2CTL), /* X1 Carbon (9th gen) */ + TPACPI_Q_LNV3('N', '3', '7', TPACPI_FAN_2CTL), /* T15g (2nd gen) */ TPACPI_Q_LNV3('N', '1', 'O', TPACPI_FAN_NOFAN), /* X1 Tablet (2nd gen) */ }; From e3d13da7f77d73c64981b62591c21614a6cf688f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 5 Feb 2022 12:28:40 +0100 Subject: [PATCH 051/773] platform/x86: asus-wmi: Fix regression when probing for fan curve control The fan curve control patches introduced a regression for at least the TUF FX506 and possibly other TUF series laptops that do not have support for fan curve control. As part of the probing process, asus_wmi_evaluate_method_buf is called to get the factory default fan curve . The WMI management function returns 0 on certain laptops to indicate lack of fan curve control instead of ASUS_WMI_UNSUPPORTED_METHOD. This 0 is transformed to -ENODATA which results in failure when probing. Fixes: 0f0ac158d28f ("platform/x86: asus-wmi: Add support for custom fan curves") Reported-and-tested-by: Abhijeet V Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220205112840.33095-1-hdegoede@redhat.com --- drivers/platform/x86/asus-wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index a3b83b22a3b1..2104a2621e50 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -2223,7 +2223,7 @@ static int fan_curve_check_present(struct asus_wmi *asus, bool *available, err = fan_curve_get_factory_default(asus, fan_dev); if (err) { - if (err == -ENODEV) + if (err == -ENODEV || err == -ENODATA) return 0; return err; } From 9bb162fa26ed76031ed0e7dbc77ccea0bf977758 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 7 Dec 2021 06:10:05 +0000 Subject: [PATCH 052/773] powerpc/603: Fix boot failure with DEBUG_PAGEALLOC and KFENCE Allthough kernel text is always mapped with BATs, we still have inittext mapped with pages, so TLB miss handling is required when CONFIG_DEBUG_PAGEALLOC or CONFIG_KFENCE is set. The final solution should be to set a BAT that also maps inittext but that BAT then needs to be cleared at end of init, and it will require more changes to be able to do it properly. As DEBUG_PAGEALLOC or KFENCE are debugging, performance is not a big deal so let's fix it simply for now to enable easy stable application. Fixes: 035b19a15a98 ("powerpc/32s: Always map kernel text and rodata with BATs") Cc: stable@vger.kernel.org # v5.11+ Reported-by: Maxime Bizon Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/aea33b4813a26bdb9378b5f273f00bd5d4abe240.1638857364.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_book3s_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index fa84744d6b24..b876ef8c70a7 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -421,14 +421,14 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS -#ifdef CONFIG_MODULES +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC | _PAGE_USER rlwinm r2, r2, 28, 0xfffff000 -#ifdef CONFIG_MODULES +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC From 9495b9b31abe525ebd93da58de2c88b9f66d3a0e Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 23 Feb 2018 22:42:31 +0100 Subject: [PATCH 053/773] i2c: bcm2835: Avoid clock stretching timeouts The CLKT register contains at poweron 0x40, which at our typical 100kHz bus rate means .64ms. But there is no specified limit to how long devices should be able to stretch the clocks, so just disable the timeout. We still have a timeout wrapping the entire transfer. Signed-off-by: Eric Anholt Signed-off-by: Stefan Wahren BugLink: https://github.com/raspberrypi/linux/issues/3064 Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm2835.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index dfc534065595..5149454eef4a 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -23,6 +23,11 @@ #define BCM2835_I2C_FIFO 0x10 #define BCM2835_I2C_DIV 0x14 #define BCM2835_I2C_DEL 0x18 +/* + * 16-bit field for the number of SCL cycles to wait after rising SCL + * before deciding the slave is not responding. 0 disables the + * timeout detection. + */ #define BCM2835_I2C_CLKT 0x1c #define BCM2835_I2C_C_READ BIT(0) @@ -474,6 +479,12 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) adap->dev.of_node = pdev->dev.of_node; adap->quirks = of_device_get_match_data(&pdev->dev); + /* + * Disable the hardware clock stretching timeout. SMBUS + * specifies a limit for how long the device can stretch the + * clock, but core I2C doesn't. + */ + bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_CLKT, 0); bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, 0); ret = i2c_add_adapter(adap); From ea85bf906466191b58532bb19f4fbb4591f0a77e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Feb 2022 22:57:42 +0100 Subject: [PATCH 054/773] iio: imu: st_lsm6dsx: wait for settling time in st_lsm6dsx_read_oneshot We need to wait for sensor settling time (~ 3/ODR) before reading data in st_lsm6dsx_read_oneshot routine in order to avoid corrupted samples. Fixes: 290a6ce11d93 ("iio: imu: add support to lsm6dsx driver") Reported-by: Mario Tesi Tested-by: Mario Tesi Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/b41ebda5535895298716c76d939f9f165fcd2d13.1644098120.git.lorenzo@kernel.org Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c index 727b4b6ac696..93f0c6bce502 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c @@ -1374,8 +1374,12 @@ static int st_lsm6dsx_read_oneshot(struct st_lsm6dsx_sensor *sensor, if (err < 0) return err; + /* + * we need to wait for sensor settling time before + * reading data in order to avoid corrupted samples + */ delay = 1000000000 / sensor->odr; - usleep_range(delay, 2 * delay); + usleep_range(3 * delay, 4 * delay); err = st_lsm6dsx_read_locked(hw, addr, &data, sizeof(data)); if (err < 0) From 2e8a8b5955a000cc655f7e368670518cbb77fe58 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 26 Jan 2022 15:55:40 +0100 Subject: [PATCH 055/773] arm64: dts: rockchip: reorder rk3399 hdmi clocks The binding specifies the clock order to "cec", "grf", "vpll". Reorder the clocks accordingly. Signed-off-by: Sascha Hauer Link: https://lore.kernel.org/r/20220126145549.617165-19-s.hauer@pengutronix.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index d3cdf6f42a30..080457a68e3c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1881,10 +1881,10 @@ interrupts = ; clocks = <&cru PCLK_HDMI_CTRL>, <&cru SCLK_HDMI_SFR>, - <&cru PLL_VPLL>, + <&cru SCLK_HDMI_CEC>, <&cru PCLK_VIO_GRF>, - <&cru SCLK_HDMI_CEC>; - clock-names = "iahb", "isfr", "vpll", "grf", "cec"; + <&cru PLL_VPLL>; + clock-names = "iahb", "isfr", "cec", "grf", "vpll"; power-domains = <&power RK3399_PD_HDCP>; reg-io-width = <4>; rockchip,grf = <&grf>; From 946eb87114af37c9c13c618a7c1cdaca936905fa Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 7 Feb 2022 08:09:23 -0800 Subject: [PATCH 056/773] ASoC: Revert "ASoC: mediatek: Check for error clk pointer" This reverts commit 9de2b9286a6d ("ASoC: mediatek: Check for error clk pointer"). With this patch in the tree, Chromebooks running the affected hardware no longer boot. Bisect points to this patch, and reverting it fixes the problem. An analysis of the code with this patch applied shows: ret = init_clks(pdev, clk); if (ret) return ERR_PTR(ret); ... for (j = 0; j < MAX_CLKS && data->clk_id[j]; j++) { struct clk *c = clk[data->clk_id[j]]; if (IS_ERR(c)) { dev_err(&pdev->dev, "%s: clk unavailable\n", data->name); return ERR_CAST(c); } scpd->clk[j] = c; } Not all clocks in the clk_names array have to be present. Only the clocks in the data->clk_id array are actually needed. The code already checks if the required clocks are available and bails out if not. The assumption that all clocks have to be present is wrong, and commit 9de2b9286a6d ("ASoC: mediatek: Check for error clk pointer") needs to be reverted. Cc: Jiasheng Jiang Cc: Mark Brown Cc: James Liao Cc: Kevin Hilman Cc: Matthias Brugger Reported-by: Frank Wunderlich Reported-by: Daniel Golle Fixes: 9de2b9286a6d ("ASoC: mediatek: Check for error clk pointer") Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20220207160923.3911501-1-linux@roeck-us.net Signed-off-by: Mark Brown --- drivers/soc/mediatek/mtk-scpsys.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/soc/mediatek/mtk-scpsys.c b/drivers/soc/mediatek/mtk-scpsys.c index 670cc82d17dc..ca75b14931ec 100644 --- a/drivers/soc/mediatek/mtk-scpsys.c +++ b/drivers/soc/mediatek/mtk-scpsys.c @@ -411,17 +411,12 @@ out: return ret; } -static int init_clks(struct platform_device *pdev, struct clk **clk) +static void init_clks(struct platform_device *pdev, struct clk **clk) { int i; - for (i = CLK_NONE + 1; i < CLK_MAX; i++) { + for (i = CLK_NONE + 1; i < CLK_MAX; i++) clk[i] = devm_clk_get(&pdev->dev, clk_names[i]); - if (IS_ERR(clk[i])) - return PTR_ERR(clk[i]); - } - - return 0; } static struct scp *init_scp(struct platform_device *pdev, @@ -431,7 +426,7 @@ static struct scp *init_scp(struct platform_device *pdev, { struct genpd_onecell_data *pd_data; struct resource *res; - int i, j, ret; + int i, j; struct scp *scp; struct clk *clk[CLK_MAX]; @@ -486,9 +481,7 @@ static struct scp *init_scp(struct platform_device *pdev, pd_data->num_domains = num; - ret = init_clks(pdev, clk); - if (ret) - return ERR_PTR(ret); + init_clks(pdev, clk); for (i = 0; i < num; i++) { struct scp_domain *scpd = &scp->domains[i]; From 307f31452078792aab94a729fce33200c6e42dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Povi=C5=A1er?= Date: Fri, 4 Feb 2022 10:53:01 +0100 Subject: [PATCH 057/773] ASoC: tas2770: Insert post reset delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per TAS2770 datasheet there must be a 1 ms delay from reset to first command. So insert delays into the driver where appropriate. Fixes: 1a476abc723e ("tas2770: add tas2770 smart PA kernel driver") Signed-off-by: Martin Povišer Link: https://lore.kernel.org/r/20220204095301.5554-1-povik+lin@cutebit.org Signed-off-by: Mark Brown --- sound/soc/codecs/tas2770.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/tas2770.c b/sound/soc/codecs/tas2770.c index 6549e7fef3e3..c5ea3b115966 100644 --- a/sound/soc/codecs/tas2770.c +++ b/sound/soc/codecs/tas2770.c @@ -38,10 +38,12 @@ static void tas2770_reset(struct tas2770_priv *tas2770) gpiod_set_value_cansleep(tas2770->reset_gpio, 0); msleep(20); gpiod_set_value_cansleep(tas2770->reset_gpio, 1); + usleep_range(1000, 2000); } snd_soc_component_write(tas2770->component, TAS2770_SW_RST, TAS2770_RST); + usleep_range(1000, 2000); } static int tas2770_set_bias_level(struct snd_soc_component *component, @@ -110,6 +112,7 @@ static int tas2770_codec_resume(struct snd_soc_component *component) if (tas2770->sdz_gpio) { gpiod_set_value_cansleep(tas2770->sdz_gpio, 1); + usleep_range(1000, 2000); } else { ret = snd_soc_component_update_bits(component, TAS2770_PWR_CTRL, TAS2770_PWR_CTRL_MASK, @@ -510,8 +513,10 @@ static int tas2770_codec_probe(struct snd_soc_component *component) tas2770->component = component; - if (tas2770->sdz_gpio) + if (tas2770->sdz_gpio) { gpiod_set_value_cansleep(tas2770->sdz_gpio, 1); + usleep_range(1000, 2000); + } tas2770_reset(tas2770); From d7b530fdc45e75a54914a194c4becd9672a4e24f Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Mon, 7 Feb 2022 17:29:58 +0200 Subject: [PATCH 058/773] ASoC: rt5682s: do not block workqueue if card is unbound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current rt5682s_jack_detect_handler() assumes the component and card will always show up and implements an infinite usleep loop waiting for them to show up. This does not hold true if a codec interrupt (or other event) occurs when the card is unbound. The codec driver's remove or shutdown functions cannot cancel the workqueue due to the wait loop. As a result, code can either end up blocking the workqueue, or hit a kernel oops when the card is freed. Fix the issue by rescheduling the jack detect handler in case the card is not ready. In case card never shows up, the shutdown/remove/suspend calls can now cancel the detect task. Signed-off-by: Kai Vehmanen Reviewed-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Shuming Fan Link: https://lore.kernel.org/r/20220207153000.3452802-1-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682s.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/rt5682s.c b/sound/soc/codecs/rt5682s.c index efa1016831dd..1e662d1be2b3 100644 --- a/sound/soc/codecs/rt5682s.c +++ b/sound/soc/codecs/rt5682s.c @@ -824,11 +824,13 @@ static void rt5682s_jack_detect_handler(struct work_struct *work) container_of(work, struct rt5682s_priv, jack_detect_work.work); int val, btn_type; - while (!rt5682s->component) - usleep_range(10000, 15000); - - while (!rt5682s->component->card->instantiated) - usleep_range(10000, 15000); + if (!rt5682s->component || !rt5682s->component->card || + !rt5682s->component->card->instantiated) { + /* card not yet ready, try later */ + mod_delayed_work(system_power_efficient_wq, + &rt5682s->jack_detect_work, msecs_to_jiffies(15)); + return; + } mutex_lock(&rt5682s->jdet_mutex); mutex_lock(&rt5682s->calibrate_mutex); From a6d78661dc903d90a327892bbc34268f3a5f4b9c Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Mon, 7 Feb 2022 17:29:59 +0200 Subject: [PATCH 059/773] ASoC: rt5668: do not block workqueue if card is unbound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current rt5668_jack_detect_handler() assumes the component and card will always show up and implements an infinite usleep loop waiting for them to show up. This does not hold true if a codec interrupt (or other event) occurs when the card is unbound. The codec driver's remove or shutdown functions cannot cancel the workqueue due to the wait loop. As a result, code can either end up blocking the workqueue, or hit a kernel oops when the card is freed. Fix the issue by rescheduling the jack detect handler in case the card is not ready. In case card never shows up, the shutdown/remove/suspend calls can now cancel the detect task. Signed-off-by: Kai Vehmanen Reviewed-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Shuming Fan Link: https://lore.kernel.org/r/20220207153000.3452802-2-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5668.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/rt5668.c b/sound/soc/codecs/rt5668.c index fb09715bf932..5b12cbf2ba21 100644 --- a/sound/soc/codecs/rt5668.c +++ b/sound/soc/codecs/rt5668.c @@ -1022,11 +1022,13 @@ static void rt5668_jack_detect_handler(struct work_struct *work) container_of(work, struct rt5668_priv, jack_detect_work.work); int val, btn_type; - while (!rt5668->component) - usleep_range(10000, 15000); - - while (!rt5668->component->card->instantiated) - usleep_range(10000, 15000); + if (!rt5668->component || !rt5668->component->card || + !rt5668->component->card->instantiated) { + /* card not yet ready, try later */ + mod_delayed_work(system_power_efficient_wq, + &rt5668->jack_detect_work, msecs_to_jiffies(15)); + return; + } mutex_lock(&rt5668->calibrate_mutex); From 4c33de0673ced9c7c37b3bbd9bfe0fda72340b2a Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Mon, 7 Feb 2022 17:30:00 +0200 Subject: [PATCH 060/773] ASoC: rt5682: do not block workqueue if card is unbound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current rt5682_jack_detect_handler() assumes the component and card will always show up and implements an infinite usleep loop waiting for them to show up. This does not hold true if a codec interrupt (or other event) occurs when the card is unbound. The codec driver's remove or shutdown functions cannot cancel the workqueue due to the wait loop. As a result, code can either end up blocking the workqueue, or hit a kernel oops when the card is freed. Fix the issue by rescheduling the jack detect handler in case the card is not ready. In case card never shows up, the shutdown/remove/suspend calls can now cancel the detect task. Signed-off-by: Kai Vehmanen Reviewed-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Shuming Fan Link: https://lore.kernel.org/r/20220207153000.3452802-3-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index 0a0ec4a021e1..be68d573a490 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -1092,11 +1092,13 @@ void rt5682_jack_detect_handler(struct work_struct *work) struct snd_soc_dapm_context *dapm; int val, btn_type; - while (!rt5682->component) - usleep_range(10000, 15000); - - while (!rt5682->component->card->instantiated) - usleep_range(10000, 15000); + if (!rt5682->component || !rt5682->component->card || + !rt5682->component->card->instantiated) { + /* card not yet ready, try later */ + mod_delayed_work(system_power_efficient_wq, + &rt5682->jack_detect_work, msecs_to_jiffies(15)); + return; + } dapm = snd_soc_component_get_dapm(rt5682->component); From ab3824427b848da10e9fe2727f035bbeecae6ff4 Mon Sep 17 00:00:00 2001 From: Zhou Qingyang Date: Wed, 1 Dec 2021 01:22:53 +0800 Subject: [PATCH 061/773] spi: spi-zynq-qspi: Fix a NULL pointer dereference in zynq_qspi_exec_mem_op() In zynq_qspi_exec_mem_op(), kzalloc() is directly used in memset(), which could lead to a NULL pointer dereference on failure of kzalloc(). Fix this bug by adding a check of tmpbuf. This bug was found by a static analyzer. The analysis employs differential checking to identify inconsistent security operations (e.g., checks or kfrees) between two code paths and confirms that the inconsistent operations are not recovered in the current function or the callers, so they constitute bugs. Note that, as a bug found by static analysis, it can be a false positive or hard to trigger. Multiple researchers have cross-reviewed the bug. Builds with CONFIG_SPI_ZYNQ_QSPI=m show no new warnings, and our static analyzer no longer warns about this code. Fixes: 67dca5e580f1 ("spi: spi-mem: Add support for Zynq QSPI controller") Signed-off-by: Zhou Qingyang Link: https://lore.kernel.org/r/20211130172253.203700-1-zhou1615@umn.edu Signed-off-by: Mark Brown --- drivers/spi/spi-zynq-qspi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-zynq-qspi.c b/drivers/spi/spi-zynq-qspi.c index cfa222c9bd5e..78f31b61a2aa 100644 --- a/drivers/spi/spi-zynq-qspi.c +++ b/drivers/spi/spi-zynq-qspi.c @@ -570,6 +570,9 @@ static int zynq_qspi_exec_mem_op(struct spi_mem *mem, if (op->dummy.nbytes) { tmpbuf = kzalloc(op->dummy.nbytes, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + memset(tmpbuf, 0xff, op->dummy.nbytes); reinit_completion(&xqspi->data_completion); xqspi->txbuf = tmpbuf; From 37ef4c19b4c659926ce65a7ac709ceaefb211c40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Exp=C3=B3sito?= Date: Tue, 8 Feb 2022 09:59:16 -0800 Subject: [PATCH 062/773] Input: clear BTN_RIGHT/MIDDLE on buttonpads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buttonpads are expected to map the INPUT_PROP_BUTTONPAD property bit and the BTN_LEFT key bit. As explained in the specification, where a device has a button type value of 0 (click-pad) or 1 (pressure-pad) there should not be discrete buttons: https://docs.microsoft.com/en-us/windows-hardware/design/component-guidelines/touchpad-windows-precision-touchpad-collection#device-capabilities-feature-report However, some drivers map the BTN_RIGHT and/or BTN_MIDDLE key bits even though the device is a buttonpad and therefore does not have those buttons. This behavior has forced userspace applications like libinput to implement different workarounds and quirks to detect buttonpads and offer to the user the right set of features and configuration options. For more information: https://gitlab.freedesktop.org/libinput/libinput/-/merge_requests/726 In order to avoid this issue clear the BTN_RIGHT and BTN_MIDDLE key bits when the input device is register if the INPUT_PROP_BUTTONPAD property bit is set. Notice that this change will not affect udev because it does not check for buttons. See systemd/src/udev/udev-builtin-input_id.c. List of known affected hardware: - Chuwi AeroBook Plus - Chuwi Gemibook - Framework Laptop - GPD Win Max - Huawei MateBook 2020 - Prestigio Smartbook 141 C2 - Purism Librem 14v1 - StarLite Mk II - AMI firmware - StarLite Mk II - Coreboot firmware - StarLite Mk III - AMI firmware - StarLite Mk III - Coreboot firmware - StarLabTop Mk IV - AMI firmware - StarLabTop Mk IV - Coreboot firmware - StarBook Mk V Acked-by: Peter Hutterer Acked-by: Benjamin Tissoires Acked-by: Jiri Kosina Signed-off-by: José Expósito Link: https://lore.kernel.org/r/20220208174806.17183-1-jose.exposito89@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/input.c b/drivers/input/input.c index ccaeb2426385..c3139bc2aa0d 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -2285,6 +2285,12 @@ int input_register_device(struct input_dev *dev) /* KEY_RESERVED is not supposed to be transmitted to userspace. */ __clear_bit(KEY_RESERVED, dev->keybit); + /* Buttonpads should not map BTN_RIGHT and/or BTN_MIDDLE. */ + if (test_bit(INPUT_PROP_BUTTONPAD, dev->propbit)) { + __clear_bit(BTN_RIGHT, dev->keybit); + __clear_bit(BTN_MIDDLE, dev->keybit); + } + /* Make sure that bitmasks not mentioned in dev->evbit are clean. */ input_cleanse_bitmasks(dev); From 4f774c4a65bf3987d1a95c966e884f38c8a942af Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 27 Jan 2022 19:25:53 -0800 Subject: [PATCH 063/773] cpufreq: Reintroduce ready() callback This effectively revert '4bf8e582119e ("cpufreq: Remove ready() callback")', in order to reintroduce the ready callback. This is needed in order to be able to leave the thermal pressure interrupts in the Qualcomm CPUfreq driver disabled during initialization, so that it doesn't fire while related_cpus are still 0. Signed-off-by: Bjorn Andersson [ Viresh: Added the Chinese translation as well and updated commit msg ] Signed-off-by: Viresh Kumar --- Documentation/cpu-freq/cpu-drivers.rst | 3 +++ Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 2 ++ drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/cpufreq.h | 3 +++ 4 files changed, 12 insertions(+) diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index 3b32336a7803..d84ededb66f9 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -75,6 +75,9 @@ And optionally .resume - A pointer to a per-policy resume function which is called with interrupts disabled and _before_ the governor is started again. + .ready - A pointer to a per-policy ready function which is called after + the policy is fully initialized. + .attr - A pointer to a NULL-terminated list of "struct freq_attr" which allow to export values to sysfs. diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 87a36044f828..2ca92042767b 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -84,6 +84,8 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 .resume - 一个指向per-policy恢复函数的指针,该函数在关中断且在调节器再一次启动前被 调用。 + .ready - 一个指向per-policy准备函数的指针,该函数在策略完全初始化之后被调用。 + .attr - 一个指向NULL结尾的"struct freq_attr"列表的指针,该列表允许导出值到 sysfs。 diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b8d95536ee22..80f535cc8a75 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1518,6 +1518,10 @@ static int cpufreq_online(unsigned int cpu) kobject_uevent(&policy->kobj, KOBJ_ADD); + /* Callback for handling stuff after policy is ready */ + if (cpufreq_driver->ready) + cpufreq_driver->ready(policy); + if (cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1ab29e61b078..3522a272b74d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -382,6 +382,9 @@ struct cpufreq_driver { int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); + /* Will be called after the driver is fully initialized */ + void (*ready)(struct cpufreq_policy *policy); + struct freq_attr **attr; /* platform specific boost support code */ From ef8ee1cb8fc8976a68f5e89cd5f7b6f7de80c66f Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 27 Jan 2022 19:25:54 -0800 Subject: [PATCH 064/773] cpufreq: qcom-hw: Delay enabling throttle_irq In the event that the SoC is under thermal pressure while booting it's possible for the dcvs notification to happen inbetween the cpufreq framework calling init and it actually updating the policy's related_cpus cpumask. Prior to the introduction of the thermal pressure update helper an empty cpumask would simply result in the thermal pressure of no cpus being updated, but the new code will attempt to dereference an invalid per_cpu variable. Avoid this problem by using the newly reintroduced "ready" callback, to postpone enabling the IRQ until the related_cpus cpumask is filled in. Fixes: 0258cb19c77d ("cpufreq: qcom-cpufreq-hw: Use new thermal pressure update function") Signed-off-by: Bjorn Andersson Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 05f3d7876e44..effbb680b453 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -388,7 +388,7 @@ static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index) snprintf(data->irq_name, sizeof(data->irq_name), "dcvsh-irq-%u", policy->cpu); ret = request_threaded_irq(data->throttle_irq, NULL, qcom_lmh_dcvs_handle_irq, - IRQF_ONESHOT, data->irq_name, data); + IRQF_ONESHOT | IRQF_NO_AUTOEN, data->irq_name, data); if (ret) { dev_err(&pdev->dev, "Error registering %s: %d\n", data->irq_name, ret); return 0; @@ -542,6 +542,14 @@ static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy) return 0; } +static void qcom_cpufreq_ready(struct cpufreq_policy *policy) +{ + struct qcom_cpufreq_data *data = policy->driver_data; + + if (data->throttle_irq >= 0) + enable_irq(data->throttle_irq); +} + static struct freq_attr *qcom_cpufreq_hw_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, &cpufreq_freq_attr_scaling_boost_freqs, @@ -561,6 +569,7 @@ static struct cpufreq_driver cpufreq_qcom_hw_driver = { .fast_switch = qcom_cpufreq_hw_fast_switch, .name = "qcom-cpufreq-hw", .attr = qcom_cpufreq_hw_attr, + .ready = qcom_cpufreq_ready, }; static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) From 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Feb 2022 16:14:32 +0200 Subject: [PATCH 065/773] xfrm: enforce validity of offload input flags struct xfrm_user_offload has flags variable that received user input, but kernel didn't check if valid bits were provided. It caused a situation where not sanitized input was forwarded directly to the drivers. For example, XFRM_OFFLOAD_IPV6 define that was exposed, was used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize input flags to forward XFRM_OFFLOAD_INBOUND to the drivers. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4e29d7851890..65e13a099b1a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,6 +511,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. + * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3fa066419d37..39bce5d764de 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -223,6 +223,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -262,7 +265,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { From 6620e311ae76c502b685247b8f7232e81a321a5b Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Sun, 30 Jan 2022 15:39:35 +0100 Subject: [PATCH 066/773] MAINTAINERS: replace a Microchip AT91 maintainer As Ludovic is more focusing on other aspects of the Microchip Linux-based development, replace him with Claudiu. Entry is added to the CREDITS file. Thanks Ludovic for these great contributions in the kernel space! Signed-off-by: Nicolas Ferre Acked-by: Claudiu Beznea Acked-by: Ludovic Desroches Link: https://lore.kernel.org/r/23819d8baa635815d0893955197561fe4f044d5e.1643553501.git.nicolas.ferre@microchip.com --- CREDITS | 6 ++++++ MAINTAINERS | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index b97256d5bc24..7e85a53b6a88 100644 --- a/CREDITS +++ b/CREDITS @@ -895,6 +895,12 @@ S: 3000 FORE Drive S: Warrendale, Pennsylvania 15086 S: USA +N: Ludovic Desroches +E: ludovic.desroches@microchip.com +D: Maintainer for ARM/Microchip (AT91) SoC support +D: Author of ADC, pinctrl, XDMA and SDHCI drivers for this platform +S: France + N: Martin Devera E: devik@cdi.cz W: http://luxik.cdi.cz/~devik/qos/ diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..674d13708324 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2250,7 +2250,7 @@ F: drivers/phy/mediatek/ ARM/Microchip (AT91) SoC support M: Nicolas Ferre M: Alexandre Belloni -M: Ludovic Desroches +M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported W: http://www.linux4sam.org From 26077968f8389a68fdb38af3f2c2289ddc95e8ca Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Sun, 30 Jan 2022 15:39:36 +0100 Subject: [PATCH 067/773] dt-bindings: ARM: at91: update maintainers entry Align the binding documentation with the newly updated MAINTAINERS entry. Signed-off-by: Nicolas Ferre Acked-by: Claudiu Beznea Acked-by: Ludovic Desroches Link: https://lore.kernel.org/r/5bf9873eeee3cd49c52a8952a7cd4cb60b61d50a.1643553501.git.nicolas.ferre@microchip.com --- Documentation/devicetree/bindings/arm/atmel-at91.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml index c612e1f48dba..ff91df04f9f4 100644 --- a/Documentation/devicetree/bindings/arm/atmel-at91.yaml +++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml @@ -8,7 +8,8 @@ title: Atmel AT91 device tree bindings. maintainers: - Alexandre Belloni - - Ludovic Desroches + - Claudiu Beznea + - Nicolas Ferre description: | Boards with a SoC of the Atmel AT91 or SMART family shall have the following From e4e3a93c6e267572ca2345d8d86053e166843a8c Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Tue, 8 Feb 2022 11:12:42 +0800 Subject: [PATCH 068/773] MAINTAINERS: update cros_ec_codec maintainers Updates cros_ec_codec maintainers. Signed-off-by: Tzung-Bi Shih Acked-By: Cheng-Yi Chiang Acked-By: Benson Leung Link: https://lore.kernel.org/r/20220208031242.227563-1-tzungbi@google.com Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/google,cros-ec-codec.yaml | 1 + MAINTAINERS | 1 + 2 files changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml b/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml index 77adbebed824..c3e9f3485449 100644 --- a/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml +++ b/Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml @@ -8,6 +8,7 @@ title: Audio codec controlled by ChromeOS EC maintainers: - Cheng-Yi Chiang + - Tzung-Bi Shih description: | Google's ChromeOS EC codec is a digital mic codec provided by the diff --git a/MAINTAINERS b/MAINTAINERS index 09734251e1de..9256c3540a0f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4476,6 +4476,7 @@ F: drivers/platform/chrome/ CHROMEOS EC CODEC DRIVER M: Cheng-Yi Chiang +M: Tzung-Bi Shih R: Guenter Roeck S: Maintained F: Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml From 728390fce4fc4d033a898fb6f5088697d03254b8 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Thu, 6 Jan 2022 17:37:03 -0600 Subject: [PATCH 069/773] dt-bindings: usb: dwc2: add compatible "intel,socfpga-agilex-hsotg" Add the compatible "intel,socfpga-agilex-hsotg" to the DWC2 implementation, because the Agilex DWC2 implementation does not support clock gating. Acked-by: Rob Herring Signed-off-by: Dinh Nguyen --- Documentation/devicetree/bindings/usb/dwc2.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/usb/dwc2.yaml b/Documentation/devicetree/bindings/usb/dwc2.yaml index f00867ebc147..481aaa09f3f2 100644 --- a/Documentation/devicetree/bindings/usb/dwc2.yaml +++ b/Documentation/devicetree/bindings/usb/dwc2.yaml @@ -53,6 +53,7 @@ properties: - const: st,stm32mp15-hsotg - const: snps,dwc2 - const: samsung,s3c6400-hsotg + - const: intel,socfpga-agilex-hsotg reg: maxItems: 1 From 268a491aebc25e6dc7c618903b09ac3a2e8af530 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Thu, 6 Jan 2022 17:53:31 -0600 Subject: [PATCH 070/773] arm64: dts: agilex: use the compatible "intel,socfpga-agilex-hsotg" The DWC2 USB controller on the Agilex platform does not support clock gating, so use the chip specific "intel,socfpga-agilex-hsotg" compatible. Signed-off-by: Dinh Nguyen --- arch/arm64/boot/dts/intel/socfpga_agilex.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi index 0dd2d2ee765a..f4270cf18996 100644 --- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi +++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi @@ -502,7 +502,7 @@ }; usb0: usb@ffb00000 { - compatible = "snps,dwc2"; + compatible = "intel,socfpga-agilex-hsotg", "snps,dwc2"; reg = <0xffb00000 0x40000>; interrupts = ; phys = <&usbphy0>; @@ -515,7 +515,7 @@ }; usb1: usb@ffb40000 { - compatible = "snps,dwc2"; + compatible = "intel,socfpga-agilex-hsotg", "snps,dwc2"; reg = <0xffb40000 0x40000>; interrupts = ; phys = <&usbphy0>; From b97cca3ba9098522e5a1c3388764ead42640c1a5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 3 Feb 2022 08:29:21 -0800 Subject: [PATCH 071/773] xfs: only bother with sync_filesystem during readonly remount In commit 02b9984d6408, we pushed a sync_filesystem() call from the VFS into xfs_fs_remount. The only time that we ever need to push dirty file data or metadata to disk for a remount is if we're remounting the filesystem read only, so this really could be moved to xfs_remount_ro. Once we've moved the call site, actually check the return value from sync_filesystem. Fixes: 02b9984d6408 ("fs: push sync_filesystem() down to the file system's remount_fs()") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e8f37bdc8354..7c2f1338c6e8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1749,6 +1749,11 @@ xfs_remount_ro( }; int error; + /* Flush all the dirty data to disk. */ + error = sync_filesystem(mp->m_super); + if (error) + return error; + /* * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. @@ -1827,8 +1832,6 @@ xfs_fs_reconfigure( if (error) return error; - sync_filesystem(mp->m_super); - /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; From 8e1741c658996a16bd096e077dae0da2460a997f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 10 Feb 2022 13:33:43 +0100 Subject: [PATCH 072/773] ALSA: memalloc: Fix dma_need_sync() checks dma_need_sync() checks each DMA address. Fix the incorrect usages for non-contiguous and non-coherent page allocations. Fortunately, there are no actual call sites that need manual syncs yet. Fixes: a25684a95646 ("ALSA: memalloc: Support for non-contiguous page allocation") Fixes: 73325f60e2ed ("ALSA: memalloc: Support for non-coherent page allocation") Cc: Reported-by: Ezequiel Garcia Link: https://lore.kernel.org/r/20220210123344.8756-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index d1fcd1d5adae..f98f694e3ce4 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -511,7 +511,8 @@ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) DEFAULT_GFP, 0); if (!sgt) return NULL; - dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, dmab->dev.dir); + dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, + sg_dma_address(sgt->sgl)); p = dma_vmap_noncontiguous(dmab->dev.dev, size, sgt); if (p) dmab->private_data = sgt; @@ -671,9 +672,13 @@ static const struct snd_malloc_ops snd_dma_sg_wc_ops = { */ static void *snd_dma_noncoherent_alloc(struct snd_dma_buffer *dmab, size_t size) { - dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, dmab->dev.dir); - return dma_alloc_noncoherent(dmab->dev.dev, size, &dmab->addr, - dmab->dev.dir, DEFAULT_GFP); + void *p; + + p = dma_alloc_noncoherent(dmab->dev.dev, size, &dmab->addr, + dmab->dev.dir, DEFAULT_GFP); + if (p) + dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, dmab->addr); + return p; } static void snd_dma_noncoherent_free(struct snd_dma_buffer *dmab) From 3e16dc50d77dc3494275a241fac250c94bf45206 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 10 Feb 2022 13:33:44 +0100 Subject: [PATCH 073/773] ALSA: memalloc: invalidate SG pages before sync It seems that calling invalidate_kernel_vmap_range() is more correct to be called before dma_sync_*(), judging from the other thread: https://lore.kernel.org/all/20220111085958.GA22795@lst.de/ Although this won't matter much in practice, let's fix the call order for consistency. Fixes: a25684a95646 ("ALSA: memalloc: Support for non-contiguous page allocation") Reported-by: Ezequiel Garcia Cc: Link: https://lore.kernel.org/r/20220210123344.8756-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index f98f694e3ce4..6fd763d4d15b 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -541,9 +541,9 @@ static void snd_dma_noncontig_sync(struct snd_dma_buffer *dmab, if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir == DMA_TO_DEVICE) return; + invalidate_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_cpu(dmab->dev.dev, dmab->private_data, dmab->dev.dir); - invalidate_kernel_vmap_range(dmab->area, dmab->bytes); } else { if (dmab->dev.dir == DMA_FROM_DEVICE) return; From c8d251f51ee61df06ee0e419348d8c9160bbfb86 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 9 Feb 2022 15:25:20 -0800 Subject: [PATCH 074/773] ASoC: qcom: Actually clear DMA interrupt register for HDMI In commit da0363f7bfd3 ("ASoC: qcom: Fix for DMA interrupt clear reg overwriting") we changed regmap_write() to regmap_update_bits() so that we can avoid overwriting bits that we didn't intend to modify. Unfortunately this change breaks the case where a register is writable but not readable, which is exactly how the HDMI irq clear register is designed (grep around LPASS_HDMITX_APP_IRQCLEAR_REG to see how it's write only). That's because regmap_update_bits() tries to read the register from the hardware and if it isn't readable it looks in the regmap cache to see what was written there last time to compare against what we want to write there. Eventually, we're unable to modify this register at all because the bits that we're trying to set are already set in the cache. This is doubly bad for the irq clear register because you have to write the bit to clear an interrupt. Given the irq is level triggered, we see an interrupt storm upon plugging in an HDMI cable and starting audio playback. The irq storm is so great that performance degrades significantly, leading to CPU soft lockups. Fix it by using regmap_write_bits() so that we really do write the bits in the clear register that we want to. This brings the number of irqs handled by lpass_dma_interrupt_handler() down from ~150k/sec to ~10/sec. Fixes: da0363f7bfd3 ("ASoC: qcom: Fix for DMA interrupt clear reg overwriting") Cc: Srinivasa Rao Mandadapu Cc: Srinivas Kandagatla Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20220209232520.4017634-1-swboyd@chromium.org Signed-off-by: Mark Brown --- sound/soc/qcom/lpass-platform.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/qcom/lpass-platform.c b/sound/soc/qcom/lpass-platform.c index a59e9d20cb46..4b1773c1fb95 100644 --- a/sound/soc/qcom/lpass-platform.c +++ b/sound/soc/qcom/lpass-platform.c @@ -524,7 +524,7 @@ static int lpass_platform_pcmops_trigger(struct snd_soc_component *component, return -EINVAL; } - ret = regmap_update_bits(map, reg_irqclr, val_irqclr, val_irqclr); + ret = regmap_write_bits(map, reg_irqclr, val_irqclr, val_irqclr); if (ret) { dev_err(soc_runtime->dev, "error writing to irqclear reg: %d\n", ret); return ret; @@ -665,7 +665,7 @@ static irqreturn_t lpass_dma_interrupt_handler( return -EINVAL; } if (interrupts & LPAIF_IRQ_PER(chan)) { - rv = regmap_update_bits(map, reg, mask, (LPAIF_IRQ_PER(chan) | val)); + rv = regmap_write_bits(map, reg, mask, (LPAIF_IRQ_PER(chan) | val)); if (rv) { dev_err(soc_runtime->dev, "error writing to irqclear reg: %d\n", rv); @@ -676,7 +676,7 @@ static irqreturn_t lpass_dma_interrupt_handler( } if (interrupts & LPAIF_IRQ_XRUN(chan)) { - rv = regmap_update_bits(map, reg, mask, (LPAIF_IRQ_XRUN(chan) | val)); + rv = regmap_write_bits(map, reg, mask, (LPAIF_IRQ_XRUN(chan) | val)); if (rv) { dev_err(soc_runtime->dev, "error writing to irqclear reg: %d\n", rv); @@ -688,7 +688,7 @@ static irqreturn_t lpass_dma_interrupt_handler( } if (interrupts & LPAIF_IRQ_ERR(chan)) { - rv = regmap_update_bits(map, reg, mask, (LPAIF_IRQ_ERR(chan) | val)); + rv = regmap_write_bits(map, reg, mask, (LPAIF_IRQ_ERR(chan) | val)); if (rv) { dev_err(soc_runtime->dev, "error writing to irqclear reg: %d\n", rv); From a887f9c7a4d37a8e874ba8415a42a92a1b5139fc Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 10 Feb 2022 17:20:51 +0000 Subject: [PATCH 075/773] ASoC: wm_adsp: Correct control read size when parsing compressed buffer When parsing the compressed stream the whole buffer descriptor is now read in a single cs_dsp_coeff_read_ctrl; on older firmwares this descriptor is just 4 bytes but on more modern firmwares it is 24 bytes. The current code reads the full 24 bytes regardless, this was working but reading junk for the last 20 bytes. However commit f444da38ac92 ("firmware: cs_dsp: Add offset to cs_dsp read/write") added a size check into cs_dsp_coeff_read_ctrl, causing the older firmwares to now return an error. Update the code to only read the amount of data appropriate for the firmware loaded. Fixes: 04ae08596737 ("ASoC: wm_adsp: Switch to using wm_coeff_read_ctrl for compressed buffers") Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20220210172053.22782-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm_adsp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index f3672e3d1703..0582585236a2 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -1441,7 +1441,8 @@ static int wm_adsp_buffer_parse_coeff(struct cs_dsp_coeff_ctl *cs_ctl) int ret, i; for (i = 0; i < 5; ++i) { - ret = cs_dsp_coeff_read_ctrl(cs_ctl, 0, &coeff_v1, sizeof(coeff_v1)); + ret = cs_dsp_coeff_read_ctrl(cs_ctl, 0, &coeff_v1, + min(cs_ctl->len, sizeof(coeff_v1))); if (ret < 0) return ret; From ba2ab85951c91a140a8fa51d8347d54e59ec009d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 9 Feb 2022 21:08:06 +0300 Subject: [PATCH 076/773] pinctrl: fix loop in k210_pinconf_get_drive() The loop exited too early so the k210_pinconf_drive_strength[0] array element was never used. Fixes: d4c34d09ab03 ("pinctrl: Add RISC-V Canaan Kendryte K210 FPIOA driver") Signed-off-by: Dan Carpenter Reviewed-by: Damien Le Moal Reviewed-by: Sean Anderson Link: https://lore.kernel.org/r/20220209180804.GA18385@kili Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-k210.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index 49e32684dbb2..e3d03f2de7ef 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -482,7 +482,7 @@ static int k210_pinconf_get_drive(unsigned int max_strength_ua) { int i; - for (i = K210_PC_DRIVE_MAX; i; i--) { + for (i = K210_PC_DRIVE_MAX; i >= 0; i--) { if (k210_pinconf_drive_strength[i] <= max_strength_ua) return i; } From e9f7b9228a94778edb7a63fde3c0a3c5bb793064 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Wed, 9 Feb 2022 13:28:22 -0500 Subject: [PATCH 077/773] pinctrl: k210: Fix bias-pull-up Using bias-pull-up would actually cause the pin to have its pull-down enabled. Fix this. Signed-off-by: Sean Anderson Reviewed-by: Damien Le Moal Fixes: d4c34d09ab03 ("pinctrl: Add RISC-V Canaan Kendryte K210 FPIOA driver") Link: https://lore.kernel.org/r/20220209182822.640905-1-seanga2@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-k210.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index e3d03f2de7ef..ecab6bf63dc6 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -527,7 +527,7 @@ static int k210_pinconf_set_param(struct pinctrl_dev *pctldev, case PIN_CONFIG_BIAS_PULL_UP: if (!arg) return -EINVAL; - val |= K210_PC_PD; + val |= K210_PC_PU; break; case PIN_CONFIG_DRIVE_STRENGTH: arg *= 1000; From d1c56bfdaca465bd1d0e913053a9c5cafe8b6a6c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Feb 2022 14:14:05 +0100 Subject: [PATCH 078/773] tests: fix idmapped mount_setattr test The test treated zero as a successful run when it really should treat non-zero as a successful run. A mount's idmapping can't change once it has been attached to the filesystem. Link: https://lore.kernel.org/r/20220203131411.3093040-2-brauner@kernel.org Fixes: 01eadc8dd96d ("tests: add mount_setattr() selftests") Cc: Seth Forshee Cc: Christoph Hellwig Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- tools/testing/selftests/mount_setattr/mount_setattr_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index f31205f04ee0..8c5fea68ae67 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1236,7 +1236,7 @@ static int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long } /** - * Validate that an attached mount in our mount namespace can be idmapped. + * Validate that an attached mount in our mount namespace cannot be idmapped. * (The kernel enforces that the mount's mount namespace and the caller's mount * namespace match.) */ @@ -1259,7 +1259,7 @@ TEST_F(mount_setattr_idmapped, attached_mount_inside_current_mount_namespace) attr.userns_fd = get_userns_fd(0, 10000, 10000); ASSERT_GE(attr.userns_fd, 0); - ASSERT_EQ(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } From 97acd701185b13825f8ec7882345040415d46762 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Feb 2022 14:14:06 +0100 Subject: [PATCH 079/773] MAINTAINERS: add entry for idmapped mounts I'd like to continue maintaining the work that was done around idmapped, make sure that I'm Cced on new patches and work that impacts the infrastructure. Link: https://lore.kernel.org/r/20220203131411.3093040-3-brauner@kernel.org Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f41088418aae..0496b973bb87 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9253,6 +9253,15 @@ S: Maintained W: https://github.com/o2genum/ideapad-slidebar F: drivers/input/misc/ideapad_slidebar.c +IDMAPPED MOUNTS +M: Christian Brauner +L: linux-fsdevel@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git +F: Documentation/filesystems/idmappings.rst +F: tools/testing/selftests/mount_setattr/ +F: include/linux/mnt_idmapping.h + IDT VersaClock 5 CLOCK DRIVER M: Luca Ceresoli S: Maintained From ddc204b517e60ae64db34f9832dc41dafa77c751 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 8 Feb 2022 11:39:12 -0500 Subject: [PATCH 080/773] copy_process(): Move fd_install() out of sighand->siglock critical section I was made aware of the following lockdep splat: [ 2516.308763] ===================================================== [ 2516.309085] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected [ 2516.309433] 5.14.0-51.el9.aarch64+debug #1 Not tainted [ 2516.309703] ----------------------------------------------------- [ 2516.310149] stress-ng/153663 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 2516.310512] ffff0000e422b198 (&newf->file_lock){+.+.}-{2:2}, at: fd_install+0x368/0x4f0 [ 2516.310944] and this task is already holding: [ 2516.311248] ffff0000c08140d8 (&sighand->siglock){-.-.}-{2:2}, at: copy_process+0x1e2c/0x3e80 [ 2516.311804] which would create a new lock dependency: [ 2516.312066] (&sighand->siglock){-.-.}-{2:2} -> (&newf->file_lock){+.+.}-{2:2} [ 2516.312446] but this new dependency connects a HARDIRQ-irq-safe lock: [ 2516.312983] (&sighand->siglock){-.-.}-{2:2} : [ 2516.330700] Possible interrupt unsafe locking scenario: [ 2516.331075] CPU0 CPU1 [ 2516.331328] ---- ---- [ 2516.331580] lock(&newf->file_lock); [ 2516.331790] local_irq_disable(); [ 2516.332231] lock(&sighand->siglock); [ 2516.332579] lock(&newf->file_lock); [ 2516.332922] [ 2516.333069] lock(&sighand->siglock); [ 2516.333291] *** DEADLOCK *** [ 2516.389845] stack backtrace: [ 2516.390101] CPU: 3 PID: 153663 Comm: stress-ng Kdump: loaded Not tainted 5.14.0-51.el9.aarch64+debug #1 [ 2516.390756] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 2516.391155] Call trace: [ 2516.391302] dump_backtrace+0x0/0x3e0 [ 2516.391518] show_stack+0x24/0x30 [ 2516.391717] dump_stack_lvl+0x9c/0xd8 [ 2516.391938] dump_stack+0x1c/0x38 [ 2516.392247] print_bad_irq_dependency+0x620/0x710 [ 2516.392525] check_irq_usage+0x4fc/0x86c [ 2516.392756] check_prev_add+0x180/0x1d90 [ 2516.392988] validate_chain+0x8e0/0xee0 [ 2516.393215] __lock_acquire+0x97c/0x1e40 [ 2516.393449] lock_acquire.part.0+0x240/0x570 [ 2516.393814] lock_acquire+0x90/0xb4 [ 2516.394021] _raw_spin_lock+0xe8/0x154 [ 2516.394244] fd_install+0x368/0x4f0 [ 2516.394451] copy_process+0x1f5c/0x3e80 [ 2516.394678] kernel_clone+0x134/0x660 [ 2516.394895] __do_sys_clone3+0x130/0x1f4 [ 2516.395128] __arm64_sys_clone3+0x5c/0x7c [ 2516.395478] invoke_syscall.constprop.0+0x78/0x1f0 [ 2516.395762] el0_svc_common.constprop.0+0x22c/0x2c4 [ 2516.396050] do_el0_svc+0xb0/0x10c [ 2516.396252] el0_svc+0x24/0x34 [ 2516.396436] el0t_64_sync_handler+0xa4/0x12c [ 2516.396688] el0t_64_sync+0x198/0x19c [ 2517.491197] NET: Registered PF_ATMPVC protocol family [ 2517.491524] NET: Registered PF_ATMSVC protocol family [ 2591.991877] sched: RT throttling activated One way to solve this problem is to move the fd_install() call out of the sighand->siglock critical section. Before commit 6fd2fe494b17 ("copy_process(): don't use ksys_close() on cleanups"), the pidfd installation was done without holding both the task_list lock and the sighand->siglock. Obviously, holding these two locks are not really needed to protect the fd_install() call. So move the fd_install() call down to after the releases of both locks. Link: https://lore.kernel.org/r/20220208163912.1084752-1-longman@redhat.com Fixes: 6fd2fe494b17 ("copy_process(): don't use ksys_close() on cleanups") Reviewed-by: "Eric W. Biederman" Signed-off-by: Waiman Long Signed-off-by: Christian Brauner --- kernel/fork.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..007af7fb47c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2323,10 +2323,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* past the last point of failure */ - if (pidfile) - fd_install(pidfd, pidfile); - init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2375,6 +2371,9 @@ static __latent_entropy struct task_struct *copy_process( syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + if (pidfile) + fd_install(pidfd, pidfile); + proc_fork_connector(p); sched_post_fork(p, args); cgroup_post_fork(p, args); From 1ba603f56568c3b4c2542dfba07afa25f21dcff3 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 11 Feb 2022 10:27:04 +0000 Subject: [PATCH 081/773] firmware: arm_scmi: Remove space in MODULE_ALIAS name modprobe can't handle spaces in aliases. Get rid of it to fix the issue. Link: https://lore.kernel.org/r/20220211102704.128354-1-sudeep.holla@arm.com Fixes: aa4f886f3893 ("firmware: arm_scmi: add basic driver infrastructure for SCMI") Reviewed-by: Cristian Marussi Signed-off-by: Alyssa Ross Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c index b406b3f78f46..d76bab3aaac4 100644 --- a/drivers/firmware/arm_scmi/driver.c +++ b/drivers/firmware/arm_scmi/driver.c @@ -2112,7 +2112,7 @@ static void __exit scmi_driver_exit(void) } module_exit(scmi_driver_exit); -MODULE_ALIAS("platform: arm-scmi"); +MODULE_ALIAS("platform:arm-scmi"); MODULE_AUTHOR("Sudeep Holla "); MODULE_DESCRIPTION("ARM SCMI protocol driver"); MODULE_LICENSE("GPL v2"); From 4a11678f683814df82fca9018d964771e02d7e6d Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Wed, 9 Feb 2022 16:55:26 +0100 Subject: [PATCH 082/773] bpf: Do not try bpf_msg_push_data with len 0 If bpf_msg_push_data() is called with len 0 (as it happens during selftests/bpf/test_sockmap), we do not need to do anything and can return early. Calling bpf_msg_push_data() with len 0 previously lead to a wrong ENOMEM error: we later called get_order(copy + len); if len was 0, copy + len was also often 0 and get_order() returned some undefined value (at the moment 52). alloc_pages() caught that and failed, but then bpf_msg_push_data() returned ENOMEM. This was wrong because we are most probably not out of memory and actually do not need any additional memory. Fixes: 6fff607e2f14b ("bpf: sk_msg program helper bpf_msg_push_data") Signed-off-by: Felix Maurer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/df69012695c7094ccb1943ca02b4920db3537466.1644421921.git.fmaurer@redhat.com --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 4603b7cd3cd1..9eb785842258 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2710,6 +2710,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { From a12821d5e012a42673f6fe521971f193441d8aa4 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 11 Feb 2022 10:34:25 +0100 Subject: [PATCH 083/773] block: Add handling for zone append command in blk_complete_request Zone append command needs special handling to update the bi_sector field in the bio struct with the actual position of the data in the device. It is stored in __sector field of the request struct. Fixes: 5581a5ddfe8d ("block: add completion handler for fast path") Signed-off-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Tested-by: Adam Manzanares Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220211093425.43262-2-p.raghav@samsung.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1adfe4824ef5..d69ca91fbc8b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -736,6 +736,10 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); + + if (req_op(req) == REQ_OP_ZONE_APPEND) + bio->bi_iter.bi_sector = req->__sector; + if (!is_flush) bio_endio(bio); bio = next; From a7e793a867ae312cecdeb6f06cceff98263e75dd Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 10 Feb 2022 22:13:23 +0500 Subject: [PATCH 084/773] selftests/exec: Add non-regular to TEST_GEN_PROGS non-regular file needs to be compiled and then copied to the output directory. Remove it from TEST_PROGS and add it to TEST_GEN_PROGS. This removes error thrown by rsync when non-regular object isn't found: rsync: [sender] link_stat "/linux/tools/testing/selftests/exec/non-regular" failed: No such file or directory (2) rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1333) [sender=3.2.3] Fixes: 0f71241a8e32 ("selftests/exec: add file type errno tests") Reported-by: "kernelci.org bot" Signed-off-by: Muhammad Usama Anjum Reviewed-by: Shuah Khan Reviewed-by: Kees Cook Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 12c5e27d32c1..2d7fca446c7f 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -3,8 +3,8 @@ CFLAGS = -Wall CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE -TEST_PROGS := binfmt_script non-regular -TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 +TEST_PROGS := binfmt_script +TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 non-regular TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile From a0d48505a1d68e27220369e2dd1e3573a2f362d2 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 3 Feb 2022 18:47:00 +0200 Subject: [PATCH 085/773] i2c: qcom-cci: don't delete an unregistered adapter If i2c_add_adapter() fails to add an I2C adapter found on QCOM CCI controller, on error path i2c_del_adapter() is still called. Fortunately there is a sanity check in the I2C core, so the only visible implication is a printed debug level message: i2c-core: attempting to delete unregistered adapter [Qualcomm-CCI] Nevertheless it would be reasonable to correct the probe error path. Fixes: e517526195de ("i2c: Add Qualcomm CCI I2C driver") Signed-off-by: Vladimir Zapolskiy Reviewed-by: Robert Foss Reviewed-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-cci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-qcom-cci.c b/drivers/i2c/busses/i2c-qcom-cci.c index c1de8eb66169..fd4260d18577 100644 --- a/drivers/i2c/busses/i2c-qcom-cci.c +++ b/drivers/i2c/busses/i2c-qcom-cci.c @@ -655,7 +655,7 @@ static int cci_probe(struct platform_device *pdev) return 0; error_i2c: - for (; i >= 0; i--) { + for (--i ; i >= 0; i--) { if (cci->master[i].cci) i2c_del_adapter(&cci->master[i].adap); } From 02a4a69667a2ad32f3b52ca906f19628fbdd8a01 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 3 Feb 2022 18:47:03 +0200 Subject: [PATCH 086/773] i2c: qcom-cci: don't put a device tree node before i2c_add_adapter() There is a minor chance for a race, if a pointer to an i2c-bus subnode is stored and then reused after releasing its reference, and it would be sufficient to get one more reference under a loop over children subnodes. Fixes: e517526195de ("i2c: Add Qualcomm CCI I2C driver") Signed-off-by: Vladimir Zapolskiy Reviewed-by: Robert Foss Reviewed-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-cci.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-cci.c b/drivers/i2c/busses/i2c-qcom-cci.c index fd4260d18577..cf54f1cb4c57 100644 --- a/drivers/i2c/busses/i2c-qcom-cci.c +++ b/drivers/i2c/busses/i2c-qcom-cci.c @@ -558,7 +558,7 @@ static int cci_probe(struct platform_device *pdev) cci->master[idx].adap.quirks = &cci->data->quirks; cci->master[idx].adap.algo = &cci_algo; cci->master[idx].adap.dev.parent = dev; - cci->master[idx].adap.dev.of_node = child; + cci->master[idx].adap.dev.of_node = of_node_get(child); cci->master[idx].master = idx; cci->master[idx].cci = cci; @@ -643,8 +643,10 @@ static int cci_probe(struct platform_device *pdev) continue; ret = i2c_add_adapter(&cci->master[i].adap); - if (ret < 0) + if (ret < 0) { + of_node_put(cci->master[i].adap.dev.of_node); goto error_i2c; + } } pm_runtime_set_autosuspend_delay(dev, MSEC_PER_SEC); @@ -656,8 +658,10 @@ static int cci_probe(struct platform_device *pdev) error_i2c: for (--i ; i >= 0; i--) { - if (cci->master[i].cci) + if (cci->master[i].cci) { i2c_del_adapter(&cci->master[i].adap); + of_node_put(cci->master[i].adap.dev.of_node); + } } error: disable_irq(cci->irq); @@ -673,8 +677,10 @@ static int cci_remove(struct platform_device *pdev) int i; for (i = 0; i < cci->data->num_masters; i++) { - if (cci->master[i].cci) + if (cci->master[i].cci) { i2c_del_adapter(&cci->master[i].adap); + of_node_put(cci->master[i].adap.dev.of_node); + } cci_halt(cci, i); } From f444578d727a0ca4a72b19cd4a1d7da9f1fb99fe Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 10 Jan 2022 16:50:07 +0100 Subject: [PATCH 087/773] power: supply: bq256xx: Handle OOM correctly Since we now return a pointer to an allocated object we need to account for memory allocation failure in a separate error path. Fixes: 25fd330370ac ("power: supply_core: Pass pointer to battery info") Reported-by: Dan Carpenter Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq256xx_charger.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/power/supply/bq256xx_charger.c b/drivers/power/supply/bq256xx_charger.c index b274942dc46a..01ad84fd147c 100644 --- a/drivers/power/supply/bq256xx_charger.c +++ b/drivers/power/supply/bq256xx_charger.c @@ -1523,6 +1523,9 @@ static int bq256xx_hw_init(struct bq256xx_device *bq) BQ256XX_WDT_BIT_SHIFT); ret = power_supply_get_battery_info(bq->charger, &bat_info); + if (ret == -ENOMEM) + return ret; + if (ret) { dev_warn(bq->dev, "battery info missing, default values will be applied\n"); From a8abb0c3dc1e28454851a00f8b7333d9695d566c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 9 Feb 2022 12:33:23 +0530 Subject: [PATCH 088/773] bpf: Fix crash due to incorrect copy_map_value When both bpf_spin_lock and bpf_timer are present in a BPF map value, copy_map_value needs to skirt both objects when copying a value into and out of the map. However, the current code does not set both s_off and t_off in copy_map_value, which leads to a crash when e.g. bpf_spin_lock is placed in map value with bpf_timer, as bpf_map_update_elem call will be able to overwrite the other timer object. When the issue is not fixed, an overwriting can produce the following splat: [root@(none) bpf]# ./test_progs -t timer_crash [ 15.930339] bpf_testmod: loading out-of-tree module taints kernel. [ 16.037849] ================================================================== [ 16.038458] BUG: KASAN: user-memory-access in __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.038944] Write of size 8 at addr 0000000000043ec0 by task test_progs/325 [ 16.039399] [ 16.039514] CPU: 0 PID: 325 Comm: test_progs Tainted: G OE 5.16.0+ #278 [ 16.039983] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.15.0-1 04/01/2014 [ 16.040485] Call Trace: [ 16.040645] [ 16.040805] dump_stack_lvl+0x59/0x73 [ 16.041069] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.041427] kasan_report.cold+0x116/0x11b [ 16.041673] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042040] __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042328] ? memcpy+0x39/0x60 [ 16.042552] ? pv_hash+0xd0/0xd0 [ 16.042785] ? lockdep_hardirqs_off+0x95/0xd0 [ 16.043079] __bpf_spin_lock_irqsave+0xdf/0xf0 [ 16.043366] ? bpf_get_current_comm+0x50/0x50 [ 16.043608] ? jhash+0x11a/0x270 [ 16.043848] bpf_timer_cancel+0x34/0xe0 [ 16.044119] bpf_prog_c4ea1c0f7449940d_sys_enter+0x7c/0x81 [ 16.044500] bpf_trampoline_6442477838_0+0x36/0x1000 [ 16.044836] __x64_sys_nanosleep+0x5/0x140 [ 16.045119] do_syscall_64+0x59/0x80 [ 16.045377] ? lock_is_held_type+0xe4/0x140 [ 16.045670] ? irqentry_exit_to_user_mode+0xa/0x40 [ 16.046001] ? mark_held_locks+0x24/0x90 [ 16.046287] ? asm_exc_page_fault+0x1e/0x30 [ 16.046569] ? asm_exc_page_fault+0x8/0x30 [ 16.046851] ? lockdep_hardirqs_on+0x7e/0x100 [ 16.047137] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 16.047405] RIP: 0033:0x7f9e4831718d [ 16.047602] Code: b4 0c 00 0f 05 eb a9 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b3 6c 0c 00 f7 d8 64 89 01 48 [ 16.048764] RSP: 002b:00007fff488086b8 EFLAGS: 00000206 ORIG_RAX: 0000000000000023 [ 16.049275] RAX: ffffffffffffffda RBX: 00007f9e48683740 RCX: 00007f9e4831718d [ 16.049747] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007fff488086d0 [ 16.050225] RBP: 00007fff488086f0 R08: 00007fff488085d7 R09: 00007f9e4cb594a0 [ 16.050648] R10: 0000000000000000 R11: 0000000000000206 R12: 00007f9e484cde30 [ 16.051124] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 16.051608] [ 16.051762] ================================================================== Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..31a83449808b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -224,7 +224,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) if (unlikely(map_value_has_spin_lock(map))) { s_off = map->spin_lock_off; s_sz = sizeof(struct bpf_spin_lock); - } else if (unlikely(map_value_has_timer(map))) { + } + if (unlikely(map_value_has_timer(map))) { t_off = map->timer_off; t_sz = sizeof(struct bpf_timer); } From a7e75016a0753c24d6c995bc02501ae35368e333 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 9 Feb 2022 12:33:24 +0530 Subject: [PATCH 089/773] selftests/bpf: Add test for bpf_timer overwriting crash Add a test that validates that timer value is not overwritten when doing a copy_map_value call in the kernel. Without the prior fix, this test triggers a crash. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209070324.1093182-3-memxor@gmail.com --- .../selftests/bpf/prog_tests/timer_crash.c | 32 +++++++++++ .../testing/selftests/bpf/progs/timer_crash.c | 54 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_crash.c create mode 100644 tools/testing/selftests/bpf/progs/timer_crash.c diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c new file mode 100644 index 000000000000..f74b82305da8 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_crash.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "timer_crash.skel.h" + +enum { + MODE_ARRAY, + MODE_HASH, +}; + +static void test_timer_crash_mode(int mode) +{ + struct timer_crash *skel; + + skel = timer_crash__open_and_load(); + if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load")) + return; + skel->bss->pid = getpid(); + skel->bss->crash_map = mode; + if (!ASSERT_OK(timer_crash__attach(skel), "timer_crash__attach")) + goto end; + usleep(1); +end: + timer_crash__destroy(skel); +} + +void test_timer_crash(void) +{ + if (test__start_subtest("array")) + test_timer_crash_mode(MODE_ARRAY); + if (test__start_subtest("hash")) + test_timer_crash_mode(MODE_HASH); +} diff --git a/tools/testing/selftests/bpf/progs/timer_crash.c b/tools/testing/selftests/bpf/progs/timer_crash.c new file mode 100644 index 000000000000..f8f7944e70da --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_crash.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +struct map_elem { + struct bpf_timer timer; + struct bpf_spin_lock lock; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct map_elem); +} amap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct map_elem); +} hmap SEC(".maps"); + +int pid = 0; +int crash_map = 0; /* 0 for amap, 1 for hmap */ + +SEC("fentry/do_nanosleep") +int sys_enter(void *ctx) +{ + struct map_elem *e, value = {}; + void *map = crash_map ? (void *)&hmap : (void *)&amap; + + if (bpf_get_current_task_btf()->tgid != pid) + return 0; + + *(void **)&value = (void *)0xdeadcaf3; + + bpf_map_update_elem(map, &(int){0}, &value, 0); + /* For array map, doing bpf_map_update_elem will do a + * check_and_free_timer_in_array, which will trigger the crash if timer + * pointer was overwritten, for hmap we need to use bpf_timer_cancel. + */ + if (crash_map == 1) { + e = bpf_map_lookup_elem(map, &(int){0}); + if (!e) + return 0; + bpf_timer_cancel(&e->timer); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; From 3bd916ee0ecbbdd902fc24845f2fef332b2a310c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 11 Feb 2022 11:49:48 -0800 Subject: [PATCH 090/773] bpf: Emit bpf_timer in vmlinux BTF Currently the following code in check_and_init_map_value() *(struct bpf_timer *)(dst + map->timer_off) = (struct bpf_timer){}; can help generate bpf_timer definition in vmlinuxBTF. But the code above may not zero the whole structure due to anonymour members and that code will be replaced by memset in the subsequent patch and bpf_timer definition will disappear from vmlinuxBTF. Let us emit the type explicitly so bpf program can continue to use it from vmlinux.h. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211194948.3141529-1-yhs@fb.com --- kernel/bpf/helpers.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..55c084251fab 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include +#include #include #include #include @@ -1075,6 +1076,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *key; u32 idx; + BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; From 5eaed6eedbe9612f642ad2b880f961d1c6c8ec2b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 11 Feb 2022 11:49:53 -0800 Subject: [PATCH 091/773] bpf: Fix a bpf_timer initialization issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch in [1] intends to fix a bpf_timer related issue, but the fix caused existing 'timer' selftest to fail with hang or some random errors. After some debug, I found an issue with check_and_init_map_value() in the hashtab.c. More specifically, in hashtab.c, we have code l_new = bpf_map_kmalloc_node(&htab->map, ...) check_and_init_map_value(&htab->map, l_new...) Note that bpf_map_kmalloc_node() does not do initialization so l_new contains random value. The function check_and_init_map_value() intends to zero the bpf_spin_lock and bpf_timer if they exist in the map. But I found bpf_spin_lock is zero'ed but bpf_timer is not zero'ed. With [1], later copy_map_value() skips copying of bpf_spin_lock and bpf_timer. The non-zero bpf_timer caused random failures for 'timer' selftest. Without [1], for both bpf_spin_lock and bpf_timer case, bpf_timer will be zero'ed, so 'timer' self test is okay. For check_and_init_map_value(), why bpf_spin_lock is zero'ed properly while bpf_timer not. In bpf uapi header, we have struct bpf_spin_lock { __u32 val; }; struct bpf_timer { __u64 :64; __u64 :64; } __attribute__((aligned(8))); The initialization code: *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = (struct bpf_spin_lock){}; *(struct bpf_timer *)(dst + map->timer_off) = (struct bpf_timer){}; It appears the compiler has no obligation to initialize anonymous fields. For example, let us use clang with bpf target as below: $ cat t.c struct bpf_timer { unsigned long long :64; }; struct bpf_timer2 { unsigned long long a; }; void test(struct bpf_timer *t) { *t = (struct bpf_timer){}; } void test2(struct bpf_timer2 *t) { *t = (struct bpf_timer2){}; } $ clang -target bpf -O2 -c -g t.c $ llvm-objdump -d t.o ... 0000000000000000 : 0: 95 00 00 00 00 00 00 00 exit 0000000000000008 : 1: b7 02 00 00 00 00 00 00 r2 = 0 2: 7b 21 00 00 00 00 00 00 *(u64 *)(r1 + 0) = r2 3: 95 00 00 00 00 00 00 00 exit gcc11.2 does not have the above issue. But from INTERNATIONAL STANDARD ©ISO/IEC ISO/IEC 9899:201x Programming languages — C http://www.open-std.org/Jtc1/sc22/wg14/www/docs/n1547.pdf page 157: Except where explicitly stated otherwise, for the purposes of this subclause unnamed members of objects of structure and union type do not participate in initialization. Unnamed members of structure objects have indeterminate value even after initialization. To fix the problem, let use memset for bpf_timer case in check_and_init_map_value(). For consistency, memset is also used for bpf_spin_lock case. [1] https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com/ Fixes: 68134668c17f3 ("bpf: Add map side support for bpf timers.") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211194953.3142152-1-yhs@fb.com --- include/linux/bpf.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 31a83449808b..d0ad379d1e62 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,11 +209,9 @@ static inline bool map_value_has_timer(const struct bpf_map *map) static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) - *(struct bpf_timer *)(dst + map->timer_off) = - (struct bpf_timer){}; + memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ From f10f582d28220f50099d3f561116256267821429 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Tue, 8 Feb 2022 12:54:48 -0600 Subject: [PATCH 092/773] scsi: qedi: Fix ABBA deadlock in qedi_process_tmf_resp() and qedi_process_cmd_cleanup_resp() This fixes a deadlock added with commit b40f3894e39e ("scsi: qedi: Complete TMF works before disconnect") Bug description from Jia-Ju Bai: qedi_process_tmf_resp() spin_lock(&session->back_lock); --> Line 201 (Lock A) spin_lock(&qedi_conn->tmf_work_lock); --> Line 230 (Lock B) qedi_process_cmd_cleanup_resp() spin_lock_bh(&qedi_conn->tmf_work_lock); --> Line 752 (Lock B) spin_lock_bh(&conn->session->back_lock); --> Line 784 (Lock A) When qedi_process_tmf_resp() and qedi_process_cmd_cleanup_resp() are concurrently executed, the deadlock can occur. This patch fixes the deadlock by not holding the tmf_work_lock in qedi_process_cmd_cleanup_resp while holding the back_lock. The tmf_work_lock is only needed while we remove the tmf_work from the work_list. Link: https://lore.kernel.org/r/20220208185448.6206-1-michael.christie@oracle.com Fixes: b40f3894e39e ("scsi: qedi: Complete TMF works before disconnect") Cc: Manish Rangankar Cc: Nilesh Javali Reported-by: TOTE Robot Reported-by: Jia-Ju Bai Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/scsi/qedi/qedi_fw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index 5916ed7662d5..4eb89aa4a39d 100644 --- a/drivers/scsi/qedi/qedi_fw.c +++ b/drivers/scsi/qedi/qedi_fw.c @@ -771,11 +771,10 @@ static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, qedi_cmd->list_tmf_work = NULL; } } + spin_unlock_bh(&qedi_conn->tmf_work_lock); - if (!found) { - spin_unlock_bh(&qedi_conn->tmf_work_lock); + if (!found) goto check_cleanup_reqs; - } QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, "TMF work, cqe->tid=0x%x, tmf flags=0x%x, cid=0x%x\n", @@ -806,7 +805,6 @@ static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, qedi_cmd->state = CLEANUP_RECV; unlock: spin_unlock_bh(&conn->session->back_lock); - spin_unlock_bh(&qedi_conn->tmf_work_lock); wake_up_interruptible(&qedi_conn->wait_queue); return; From 06582bc86d7f48d35cd044098ca1e246e8c7c52e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 26 Jan 2022 11:58:30 +0800 Subject: [PATCH 093/773] block: loop:use kstatfs.f_bsize of backing file to set discard granularity If backing file's filesystem has implemented ->fallocate(), we think the loop device can support discard, then pass sb->s_blocksize as discard_granularity. However, some underlying FS, such as overlayfs, doesn't set sb->s_blocksize, and causes discard_granularity to be set as zero, then the warning in __blkdev_issue_discard() is triggered. Christoph suggested to pass kstatfs.f_bsize as discard granularity, and this way is fine because kstatfs.f_bsize means 'Optimal transfer block size', which still matches with definition of discard granularity. So fix the issue by setting discard_granularity as kstatfs.f_bsize if it is available, otherwise claims discard isn't supported. Cc: Christoph Hellwig Cc: Vivek Goyal Reported-by: Pei Zhang Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220126035830.296465-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 150012ffb387..19fe19eaa50e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -79,6 +79,7 @@ #include #include #include +#include #include "loop.h" @@ -774,8 +775,13 @@ static void loop_config_discard(struct loop_device *lo) granularity = 0; } else { + struct kstatfs sbuf; + max_discard_sectors = UINT_MAX >> 9; - granularity = inode->i_sb->s_blocksize; + if (!vfs_statfs(&file->f_path, &sbuf)) + granularity = sbuf.f_bsize; + else + max_discard_sectors = 0; } if (max_discard_sectors) { From 28df029d53a2fd80c1b8674d47895648ad26dcfb Mon Sep 17 00:00:00 2001 From: Cheng Jui Wang Date: Thu, 10 Feb 2022 18:50:11 +0800 Subject: [PATCH 094/773] lockdep: Correct lock_classes index mapping A kernel exception was hit when trying to dump /proc/lockdep_chains after lockdep report "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!": Unable to handle kernel paging request at virtual address 00054005450e05c3 ... 00054005450e05c3] address between user and kernel address ranges ... pc : [0xffffffece769b3a8] string+0x50/0x10c lr : [0xffffffece769ac88] vsnprintf+0x468/0x69c ... Call trace: string+0x50/0x10c vsnprintf+0x468/0x69c seq_printf+0x8c/0xd8 print_name+0x64/0xf4 lc_show+0xb8/0x128 seq_read_iter+0x3cc/0x5fc proc_reg_read_iter+0xdc/0x1d4 The cause of the problem is the function lock_chain_get_class() will shift lock_classes index by 1, but the index don't need to be shifted anymore since commit 01bb6f0af992 ("locking/lockdep: Change the range of class_idx in held_lock struct") already change the index to start from 0. The lock_classes[-1] located at chain_hlocks array. When printing lock_classes[-1] after the chain_hlocks entries are modified, the exception happened. The output of lockdep_chains are incorrect due to this problem too. Fixes: f611e8cf98ec ("lockdep: Take read/write status in consideration when generate chainkey") Signed-off-by: Cheng Jui Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Boqun Feng Link: https://lore.kernel.org/r/20220210105011.21712-1-cheng-jui.wang@mediatek.com --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4a882f83aeb9..f8a0212189ca 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3462,7 +3462,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) u16 chain_hlock = chain_hlocks[chain->base + i]; unsigned int class_idx = chain_hlock_class_idx(chain_hlock); - return lock_classes + class_idx - 1; + return lock_classes + class_idx; } /* @@ -3530,7 +3530,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } From be4e65bdffab5f588044325117df77dad7e9c45a Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 10 Feb 2022 15:23:53 +0100 Subject: [PATCH 095/773] ARM: dts: rockchip: reorder rk322x hmdi clocks The binding specifies the clock order to "iahb", "isfr", "cec". Reorder the clocks accordingly. Signed-off-by: Sascha Hauer Link: https://lore.kernel.org/r/20220210142353.3420859-1-s.hauer@pengutronix.de Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk322x.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/rk322x.dtsi b/arch/arm/boot/dts/rk322x.dtsi index 8eed9e3a92e9..5868eb512f69 100644 --- a/arch/arm/boot/dts/rk322x.dtsi +++ b/arch/arm/boot/dts/rk322x.dtsi @@ -718,8 +718,8 @@ interrupts = ; assigned-clocks = <&cru SCLK_HDMI_PHY>; assigned-clock-parents = <&hdmi_phy>; - clocks = <&cru SCLK_HDMI_HDCP>, <&cru PCLK_HDMI_CTRL>, <&cru SCLK_HDMI_CEC>; - clock-names = "isfr", "iahb", "cec"; + clocks = <&cru PCLK_HDMI_CTRL>, <&cru SCLK_HDMI_HDCP>, <&cru SCLK_HDMI_CEC>; + clock-names = "iahb", "isfr", "cec"; pinctrl-names = "default"; pinctrl-0 = <&hdmii2c_xfer &hdmi_hpd &hdmi_cec>; resets = <&cru SRST_HDMI_P>; From 3916c3619599a3970d3e6f98fb430b7c46266ada Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 9 Feb 2022 12:03:55 +0000 Subject: [PATCH 096/773] ARM: dts: rockchip: fix a typo on rk3288 crypto-controller crypto-controller had a typo, fix it. In the same time, rename it to just crypto Signed-off-by: Corentin Labbe Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220209120355.1985707-1-clabbe@baylibre.com Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk3288.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index aaaa61875701..45a9d9b908d2 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -971,7 +971,7 @@ status = "disabled"; }; - crypto: cypto-controller@ff8a0000 { + crypto: crypto@ff8a0000 { compatible = "rockchip,rk3288-crypto"; reg = <0x0 0xff8a0000 0x0 0x4000>; interrupts = ; From 9405b5f8b20c2bfa6523a555279a0379640dc136 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 12 Feb 2022 01:54:14 -0600 Subject: [PATCH 097/773] smb3: fix snapshot mount option The conversion to the new API broke the snapshot mount option due to 32 vs. 64 bit type mismatch Fixes: 24e0a1eff9e2 ("cifs: switch to new mount api") Cc: stable@vger.kernel.org # 5.11+ Reported-by: Acked-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/fs_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 7ec35f3f0a5f..a92e9eec521f 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -149,7 +149,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("handletimeout", Opt_handletimeout), - fsparam_u32("snapshot", Opt_snapshot), + fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), /* Mount options which take string value */ @@ -1078,7 +1078,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->echo_interval = result.uint_32; break; case Opt_snapshot: - ctx->snapshot_time = result.uint_32; + ctx->snapshot_time = result.uint_64; break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { From dd5a927e411836eaef44eb9b00fece615e82e242 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Jan 2022 16:50:25 +0200 Subject: [PATCH 098/773] cifs: fix set of group SID via NTSD xattrs 'setcifsacl -g ' silently fails to set the group SID on server. Actually, the bug existed since commit 438471b67963 ("CIFS: Add support for setting owner info, dos attributes, and create time"), but this fix will not apply cleanly to kernel versions <= v5.10. Fixes: 3970acf7ddb9 ("SMB3: Add support for getting and setting SACLs") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Amir Goldstein Signed-off-by: Steve French --- fs/cifs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 7d8b72d67c80..9d486fbbfbbd 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -175,11 +175,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler, switch (handler->flags) { case XATTR_CIFS_NTSD_FULL: aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP | CIFS_ACL_DACL | CIFS_ACL_SACL); break; case XATTR_CIFS_NTSD: aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP | CIFS_ACL_DACL); break; case XATTR_CIFS_ACL: From 26d3dadebbcbddfaf1d9caad42527a28a0ed28d8 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Sat, 12 Feb 2022 08:16:20 +1000 Subject: [PATCH 099/773] cifs: do not use uninitialized data in the owner/group sid When idsfromsid is used we create a special SID for owner/group. This structure must be initialized or else the first 5 bytes of the Authority field of the SID will contain uninitialized data and thus not be a valid SID. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index ee3aab3dd4ac..5df21d63dd04 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1297,7 +1297,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, if (uid_valid(uid)) { /* chown */ uid_t id; - nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), + nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!nowner_sid_ptr) { rc = -ENOMEM; @@ -1326,7 +1326,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, } if (gid_valid(gid)) { /* chgrp */ gid_t id; - ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), + ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!ngroup_sid_ptr) { rc = -ENOMEM; From 3d6cc9898efdfb062efb74dc18cfc700e082f5d5 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 11 Feb 2022 02:59:15 +1000 Subject: [PATCH 100/773] cifs: fix double free race when mount fails in cifs_get_root() When cifs_get_root() fails during cifs_smb3_do_mount() we call deactivate_locked_super() which eventually will call delayed_free() which will free the context. In this situation we should not proceed to enter the out: section in cifs_smb3_do_mount() and free the same resources a second time. [Thu Feb 10 12:59:06 2022] BUG: KASAN: use-after-free in rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] Read of size 8 at addr ffff888364f4d110 by task swapper/1/0 [Thu Feb 10 12:59:06 2022] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G OE 5.17.0-rc3+ #4 [Thu Feb 10 12:59:06 2022] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 [Thu Feb 10 12:59:06 2022] Call Trace: [Thu Feb 10 12:59:06 2022] [Thu Feb 10 12:59:06 2022] dump_stack_lvl+0x5d/0x78 [Thu Feb 10 12:59:06 2022] print_address_description.constprop.0+0x24/0x150 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] kasan_report.cold+0x7d/0x117 [Thu Feb 10 12:59:06 2022] ? rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] __asan_load8+0x86/0xa0 [Thu Feb 10 12:59:06 2022] rcu_cblist_dequeue+0x32/0x60 [Thu Feb 10 12:59:06 2022] rcu_core+0x547/0xca0 [Thu Feb 10 12:59:06 2022] ? call_rcu+0x3c0/0x3c0 [Thu Feb 10 12:59:06 2022] ? __this_cpu_preempt_check+0x13/0x20 [Thu Feb 10 12:59:06 2022] ? lock_is_held_type+0xea/0x140 [Thu Feb 10 12:59:06 2022] rcu_core_si+0xe/0x10 [Thu Feb 10 12:59:06 2022] __do_softirq+0x1d4/0x67b [Thu Feb 10 12:59:06 2022] __irq_exit_rcu+0x100/0x150 [Thu Feb 10 12:59:06 2022] irq_exit_rcu+0xe/0x30 [Thu Feb 10 12:59:06 2022] sysvec_hyperv_stimer0+0x9d/0xc0 ... [Thu Feb 10 12:59:07 2022] Freed by task 58179: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] kasan_set_track+0x25/0x30 [Thu Feb 10 12:59:07 2022] kasan_set_free_info+0x24/0x40 [Thu Feb 10 12:59:07 2022] ____kasan_slab_free+0x137/0x170 [Thu Feb 10 12:59:07 2022] __kasan_slab_free+0x12/0x20 [Thu Feb 10 12:59:07 2022] slab_free_freelist_hook+0xb3/0x1d0 [Thu Feb 10 12:59:07 2022] kfree+0xcd/0x520 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0x149/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae [Thu Feb 10 12:59:07 2022] Last potentially related work creation: [Thu Feb 10 12:59:07 2022] kasan_save_stack+0x26/0x50 [Thu Feb 10 12:59:07 2022] __kasan_record_aux_stack+0xb6/0xc0 [Thu Feb 10 12:59:07 2022] kasan_record_aux_stack_noalloc+0xb/0x10 [Thu Feb 10 12:59:07 2022] call_rcu+0x76/0x3c0 [Thu Feb 10 12:59:07 2022] cifs_umount+0xce/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] cifs_kill_sb+0xc8/0xe0 [cifs] [Thu Feb 10 12:59:07 2022] deactivate_locked_super+0x5d/0xd0 [Thu Feb 10 12:59:07 2022] cifs_smb3_do_mount+0xab9/0xbe0 [cifs] [Thu Feb 10 12:59:07 2022] smb3_get_tree+0x1a0/0x2e0 [cifs] [Thu Feb 10 12:59:07 2022] vfs_get_tree+0x52/0x140 [Thu Feb 10 12:59:07 2022] path_mount+0x635/0x10c0 [Thu Feb 10 12:59:07 2022] __x64_sys_mount+0x1bf/0x210 [Thu Feb 10 12:59:07 2022] do_syscall_64+0x5c/0xc0 [Thu Feb 10 12:59:07 2022] entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: Shyam Prasad N Reviewed-by: Shyam Prasad N Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 199edac0cb59..082c21478686 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -919,6 +919,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, out_super: deactivate_locked_super(sb); + return root; out: if (cifs_sb) { kfree(cifs_sb->prepath); From 2874b7911132f6975e668f6849c8ac93bc4e1f35 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 12 Feb 2022 00:44:11 +0100 Subject: [PATCH 101/773] netfilter: xt_socket: missing ifdef CONFIG_IP6_NF_IPTABLES dependency nf_defrag_ipv6_disable() requires CONFIG_IP6_NF_IPTABLES. Fixes: 75063c9294fb ("netfilter: xt_socket: fix a typo in socket_mt_destroy()") Reported-by: kernel test robot Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 662e5eb1cc39..7013f55f05d1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -220,8 +220,10 @@ static void socket_mt_destroy(const struct xt_mtdtor_param *par) { if (par->family == NFPROTO_IPV4) nf_defrag_ipv4_disable(par->net); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) else if (par->family == NFPROTO_IPV6) nf_defrag_ipv6_disable(par->net); +#endif } static struct xt_match socket_mt_reg[] __read_mostly = { From ef3075d6638d3d5353a97fcc7bb0338fc85675f5 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Tue, 25 Jan 2022 11:11:25 -0600 Subject: [PATCH 102/773] arm64: dts: imx8mm: Fix VPU Hanging The vpumix power domain has a reset assigned to it, however when used, it causes a system hang. Testing has shown that it does not appear to be needed anywhere. Fixes: d39d4bb15310 ("arm64: dts: imx8mm: add GPC node") Signed-off-by: Adam Ford Reviewed-by: Lucas Stach Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mm.dtsi | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi index f77f90ed416f..0c7a72c51a31 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi @@ -707,7 +707,6 @@ clocks = <&clk IMX8MM_CLK_VPU_DEC_ROOT>; assigned-clocks = <&clk IMX8MM_CLK_VPU_BUS>; assigned-clock-parents = <&clk IMX8MM_SYS_PLL1_800M>; - resets = <&src IMX8MQ_RESET_VPU_RESET>; }; pgc_vpu_g1: power-domain@7 { From 45d941f67b000b6d79159522a0bbfc37cfd584d6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 11 Feb 2022 11:02:04 +0000 Subject: [PATCH 103/773] arm64: dts: imx8ulp: Set #thermal-sensor-cells to 1 as required The SCMI binding clearly states the value of #thermal-sensor-cells must be 1. However arch/arm64/boot/dts/freescale/imx8ulp.dtsi sets it 0 which results in the following warning with dtbs_check: | arch/arm64/boot/dts/freescale/imx8ulp-evk.dt.yaml: scmi: | protocol@15:#thermal-sensor-cells:0:0: 1 was expected | From schema: Documentation/devicetree/bindings/firmware/arm,scmi.yaml Fix it by setting it to 1 as required. Cc:Shawn Guo Cc: Sascha Hauer Signed-off-by: Sudeep Holla Reviewed-by: Fabio Estevam Acked-by: Peng Fan Fixes: a38771d7a49b ("arm64: dts: imx8ulp: add scmi firmware node") Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8ulp.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8ulp.dtsi b/arch/arm64/boot/dts/freescale/imx8ulp.dtsi index a987ff7156bd..09f7364dd1d0 100644 --- a/arch/arm64/boot/dts/freescale/imx8ulp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8ulp.dtsi @@ -132,7 +132,7 @@ scmi_sensor: protocol@15 { reg = <0x15>; - #thermal-sensor-cells = <0>; + #thermal-sensor-cells = <1>; }; }; }; From 538f4f022a4612f969d5324ee227403c9f8b1d72 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Feb 2022 14:14:07 +0100 Subject: [PATCH 104/773] fs: add kernel doc for mnt_{hold,unhold}_writers() When I introduced mnt_{hold,unhold}_writers() in commit fbdc2f6c40f6 ("fs: split out functions to hold writers") I did not add kernel doc for them. Fix this and introduce proper documentation. Link: https://lore.kernel.org/r/20220203131411.3093040-4-brauner@kernel.org Fixes: fbdc2f6c40f6 ("fs: split out functions to hold writers") Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 40b994a29e90..de6fae84f1a1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -469,6 +469,24 @@ void mnt_drop_write_file(struct file *file) } EXPORT_SYMBOL(mnt_drop_write_file); +/** + * mnt_hold_writers - prevent write access to the given mount + * @mnt: mnt to prevent write access to + * + * Prevents write access to @mnt if there are no active writers for @mnt. + * This function needs to be called and return successfully before changing + * properties of @mnt that need to remain stable for callers with write access + * to @mnt. + * + * After this functions has been called successfully callers must pair it with + * a call to mnt_unhold_writers() in order to stop preventing write access to + * @mnt. + * + * Context: This function expects lock_mount_hash() to be held serializing + * setting MNT_WRITE_HOLD. + * Return: On success 0 is returned. + * On error, -EBUSY is returned. + */ static inline int mnt_hold_writers(struct mount *mnt) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; @@ -500,6 +518,18 @@ static inline int mnt_hold_writers(struct mount *mnt) return 0; } +/** + * mnt_unhold_writers - stop preventing write access to the given mount + * @mnt: mnt to stop preventing write access to + * + * Stop preventing write access to @mnt allowing callers to gain write access + * to @mnt again. + * + * This function can only be called after a successful call to + * mnt_hold_writers(). + * + * Context: This function expects lock_mount_hash() to be held. + */ static inline void mnt_unhold_writers(struct mount *mnt) { /* From 19d20c7a29bf2e46ff1ab8e8c4fcd2da8a4f38e2 Mon Sep 17 00:00:00 2001 From: Matteo Martelli Date: Fri, 11 Feb 2022 23:49:13 +0100 Subject: [PATCH 105/773] ALSA: usb-audio: revert to IMPLICIT_FB_FIXED_DEV for M-Audio FastTrack Ultra Commit 83b7dcbc51c930fc2079ab6c6fc9d719768321f1 introduced a generic implicit feedback parser, which fails to execute for M-Audio FastTrack Ultra sound cards. The issue is with the ENDPOINT_SYNCTYPE check in add_generic_implicit_fb() where the SYNCTYPE is ADAPTIVE instead of ASYNC. The reason is that the sync type of the FastTrack output endpoints are set to adaptive in the quirks table since commit 65f04443c96dbda11b8fff21d6390e082846aa3c. Fixes: 83b7dcbc51c9 ("ALSA: usb-audio: Add generic implicit fb parsing") Signed-off-by: Matteo Martelli Cc: Link: https://lore.kernel.org/r/20220211224913.20683-2-matteomartelli3@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/implicit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/implicit.c b/sound/usb/implicit.c index 70319c822c10..2d444ec74202 100644 --- a/sound/usb/implicit.c +++ b/sound/usb/implicit.c @@ -47,13 +47,13 @@ struct snd_usb_implicit_fb_match { static const struct snd_usb_implicit_fb_match playback_implicit_fb_quirks[] = { /* Generic matching */ IMPLICIT_FB_GENERIC_DEV(0x0499, 0x1509), /* Steinberg UR22 */ - IMPLICIT_FB_GENERIC_DEV(0x0763, 0x2080), /* M-Audio FastTrack Ultra */ - IMPLICIT_FB_GENERIC_DEV(0x0763, 0x2081), /* M-Audio FastTrack Ultra */ IMPLICIT_FB_GENERIC_DEV(0x0763, 0x2030), /* M-Audio Fast Track C400 */ IMPLICIT_FB_GENERIC_DEV(0x0763, 0x2031), /* M-Audio Fast Track C600 */ /* Fixed EP */ /* FIXME: check the availability of generic matching */ + IMPLICIT_FB_FIXED_DEV(0x0763, 0x2080, 0x81, 2), /* M-Audio FastTrack Ultra */ + IMPLICIT_FB_FIXED_DEV(0x0763, 0x2081, 0x81, 2), /* M-Audio FastTrack Ultra */ IMPLICIT_FB_FIXED_DEV(0x2466, 0x8010, 0x81, 2), /* Fractal Audio Axe-Fx III */ IMPLICIT_FB_FIXED_DEV(0x31e9, 0x0001, 0x81, 2), /* Solid State Logic SSL2 */ IMPLICIT_FB_FIXED_DEV(0x31e9, 0x0002, 0x81, 2), /* Solid State Logic SSL2+ */ From c07f2c7b45413a9e50ba78630fda04ecfa17b4f2 Mon Sep 17 00:00:00 2001 From: Yu Huang Date: Sun, 13 Feb 2022 00:08:33 +0800 Subject: [PATCH 106/773] ALSA: hda/realtek: Add quirk for Legion Y9000X 2019 Legion Y9000X 2019 has the same speaker with Y9000X 2020, but with a different quirk address. Add one quirk entry to make the speaker work on Y9000X 2019 too. Signed-off-by: Yu Huang Cc: Link: https://lore.kernel.org/r/20220212160835.165065-1-diwang90@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8315bf7d4c38..9473fb76ff19 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9170,6 +9170,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3824, "Legion Y9000X 2020", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), SND_PCI_QUIRK(0x17aa, 0x3834, "Lenovo IdeaPad Slim 9i 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x383d, "Legion Y9000X 2019", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3843, "Yoga 9i", ALC287_FIXUP_IDEAPAD_BASS_SPK_AMP), SND_PCI_QUIRK(0x17aa, 0x3847, "Legion 7 16ACHG6", ALC287_FIXUP_LEGION_16ACHG6), SND_PCI_QUIRK(0x17aa, 0x384a, "Lenovo Yoga 7 15ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), From ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: [PATCH 107/773] swiotlb: fix info leak with DMA_FROM_DEVICE The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if the we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig --- Documentation/core-api/dma-attributes.rst | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..17706dc91ec9 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1e7ea160b43..bfc56cb21705 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -628,7 +628,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } From 6317f7449348a897483a2b4841f7a9190745c81b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 14 Feb 2022 11:00:19 +0100 Subject: [PATCH 108/773] ALSA: hda: Fix regression on forced probe mask option The forced probe mask via probe_mask 0x100 bit doesn't work any longer as expected since the bus init code was moved and it's clearing the codec_mask value that was set beforehand. This patch fixes the long-time regression by moving the check_probe_mask() call. Fixes: a41d122449be ("ALSA: hda - Embed bus into controller object") Reported-by: dmummenschanz@web.de Cc: Link: https://lore.kernel.org/r/trinity-f018660b-95c9-442b-a2a8-c92a56eb07ed-1644345967148@3c-app-webde-bap22 Link: https://lore.kernel.org/r/20220214100020.8870-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 4b0338c4c543..18b795220b52 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1798,8 +1798,6 @@ static int azx_create(struct snd_card *card, struct pci_dev *pci, assign_position_fix(chip, check_position_fix(chip, position_fix[dev])); - check_probe_mask(chip, dev); - if (single_cmd < 0) /* allow fallback to single_cmd at errors */ chip->fallback_to_single_cmd = 1; else /* explicitly set to single_cmd or not */ @@ -1825,6 +1823,8 @@ static int azx_create(struct snd_card *card, struct pci_dev *pci, chip->bus.core.needs_damn_long_delay = 1; } + check_probe_mask(chip, dev); + err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); if (err < 0) { dev_err(card->dev, "Error creating device [card]!\n"); From dd8e5b161d7fb9cefa1f1d6e35a39b9e1563c8d3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 14 Feb 2022 11:00:20 +0100 Subject: [PATCH 109/773] ALSA: hda: Fix missing codec probe on Shenker Dock 15 By some unknown reason, BIOS on Shenker Dock 15 doesn't set up the codec mask properly for the onboard audio. Let's set the forced codec mask to enable the codec discovery. Reported-by: dmummenschanz@web.de Cc: Link: https://lore.kernel.org/r/trinity-f018660b-95c9-442b-a2a8-c92a56eb07ed-1644345967148@3c-app-webde-bap22 Link: https://lore.kernel.org/r/20220214100020.8870-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 18b795220b52..917ad9d375b1 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1615,6 +1615,7 @@ static const struct snd_pci_quirk probe_mask_list[] = { /* forced codec slots */ SND_PCI_QUIRK(0x1043, 0x1262, "ASUS W5Fm", 0x103), SND_PCI_QUIRK(0x1046, 0x1262, "ASUS W5F", 0x103), + SND_PCI_QUIRK(0x1558, 0x0351, "Schenker Dock 15", 0x105), /* WinFast VP200 H (Teradici) user reported broken communication */ SND_PCI_QUIRK(0x3a21, 0x040d, "WinFast VP200 H", 0x101), {} From 5ce97f4ec5e0f8726a5dda1710727b1ee9badcac Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Mon, 4 Oct 2021 13:07:24 +0300 Subject: [PATCH 110/773] iommu/amd: Recover from event log overflow The AMD IOMMU logs I/O page faults and such to a ring buffer in system memory, and this ring buffer can overflow. The AMD IOMMU spec has the following to say about the interrupt status bit that signals this overflow condition: EventOverflow: Event log overflow. RW1C. Reset 0b. 1 = IOMMU event log overflow has occurred. This bit is set when a new event is to be written to the event log and there is no usable entry in the event log, causing the new event information to be discarded. An interrupt is generated when EventOverflow = 1b and MMIO Offset 0018h[EventIntEn] = 1b. No new event log entries are written while this bit is set. Software Note: To resume logging, clear EventOverflow (W1C), and write a 1 to MMIO Offset 0018h[EventLogEn]. The AMD IOMMU driver doesn't currently implement this recovery sequence, meaning that if a ring buffer overflow occurs, logging of EVT/PPR/GA events will cease entirely. This patch implements the spec-mandated reset sequence, with the minor tweak that the hardware seems to want to have a 0 written to MMIO Offset 0018h[EventLogEn] first, before writing an 1 into this field, or the IOMMU won't actually resume logging events. Signed-off-by: Lennert Buytenhek Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/YVrSXEdW2rzEfOvk@wantstofly.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 1 + drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/init.c | 10 ++++++++++ drivers/iommu/amd/iommu.c | 10 ++++++++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 416815a525d6..bb95edf74415 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -14,6 +14,7 @@ extern irqreturn_t amd_iommu_int_thread(int irq, void *data); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_restart_event_logging(struct amd_iommu *iommu); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index ffc89c4fb120..47108ed44fbb 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -110,6 +110,7 @@ #define PASID_MASK 0x0000ffff /* MMIO status bits */ +#define MMIO_STATUS_EVT_OVERFLOW_INT_MASK (1 << 0) #define MMIO_STATUS_EVT_INT_MASK (1 << 1) #define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) #define MMIO_STATUS_PPR_INT_MASK (1 << 6) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index b10fb52ea442..7bfe37e52e21 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -657,6 +657,16 @@ static int __init alloc_command_buffer(struct amd_iommu *iommu) return iommu->cmd_buf ? 0 : -ENOMEM; } +/* + * This function restarts event logging in case the IOMMU experienced + * an event log buffer overflow. + */ +void amd_iommu_restart_event_logging(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); +} + /* * This function resets the command buffer if the IOMMU stopped fetching * commands from it. diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 461f1844ed1f..a18b549951bb 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -764,7 +764,8 @@ amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { } #endif /* !CONFIG_IRQ_REMAP */ #define AMD_IOMMU_INT_MASK \ - (MMIO_STATUS_EVT_INT_MASK | \ + (MMIO_STATUS_EVT_OVERFLOW_INT_MASK | \ + MMIO_STATUS_EVT_INT_MASK | \ MMIO_STATUS_PPR_INT_MASK | \ MMIO_STATUS_GALOG_INT_MASK) @@ -774,7 +775,7 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data) u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); while (status & AMD_IOMMU_INT_MASK) { - /* Enable EVT and PPR and GA interrupts again */ + /* Enable interrupt sources again */ writel(AMD_IOMMU_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); @@ -795,6 +796,11 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data) } #endif + if (status & MMIO_STATUS_EVT_OVERFLOW_INT_MASK) { + pr_info_ratelimited("IOMMU event log overflow\n"); + amd_iommu_restart_event_logging(iommu); + } + /* * Hardware bug: ERBT1312 * When re-enabling interrupt (by writing 1 From 40eb0dcf4114cbfff4d207890fa5a19e82da9fdc Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 10 Feb 2022 17:10:53 +0800 Subject: [PATCH 111/773] tee: optee: fix error return code in probe function If teedev_open() fails, probe function need return error code. Fixes: aceeafefff73 ("optee: use driver internal tee_context for some rpc") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Jens Wiklander --- drivers/tee/optee/ffa_abi.c | 4 +++- drivers/tee/optee/smc_abi.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c index 545f61af1248..0c90355896a0 100644 --- a/drivers/tee/optee/ffa_abi.c +++ b/drivers/tee/optee/ffa_abi.c @@ -860,8 +860,10 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev) optee_supp_init(&optee->supp); ffa_dev_set_drvdata(ffa_dev, optee); ctx = teedev_open(optee->teedev); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + rc = PTR_ERR(ctx); goto err_rhashtable_free; + } optee->ctx = ctx; rc = optee_notif_init(optee, OPTEE_DEFAULT_MAX_NOTIF_VALUE); if (rc) diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c index bacd1a1d79ee..4157f4b41bdd 100644 --- a/drivers/tee/optee/smc_abi.c +++ b/drivers/tee/optee/smc_abi.c @@ -1427,8 +1427,10 @@ static int optee_probe(struct platform_device *pdev) platform_set_drvdata(pdev, optee); ctx = teedev_open(optee->teedev); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + rc = PTR_ERR(ctx); goto err_supp_uninit; + } optee->ctx = ctx; rc = optee_notif_init(optee, max_notif_value); if (rc) From 6b0b2d9a6a308bcd9300c2d83000a82812c56cea Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 10 Feb 2022 09:47:45 -0600 Subject: [PATCH 112/773] iommu/amd: Fix I/O page table memory leak The current logic updates the I/O page table mode for the domain before calling the logic to free memory used for the page table. This results in IOMMU page table memory leak, and can be observed when launching VM w/ pass-through devices. Fix by freeing the memory used for page table before updating the mode. Cc: Joerg Roedel Reported-by: Daniel Jordan Tested-by: Daniel Jordan Signed-off-by: Suravee Suthikulpanit Fixes: e42ba0633064 ("iommu/amd: Restructure code for freeing page table") Link: https://lore.kernel.org/all/20220118194720.urjgi73b7c3tq2o6@oracle.com/ Link: https://lore.kernel.org/r/20220210154745.11524-1-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index b1bf4125b0f7..6608d1717574 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -492,18 +492,18 @@ static void v1_free_pgtable(struct io_pgtable *iop) dom = container_of(pgtable, struct protection_domain, iop); - /* Update data structure */ - amd_iommu_domain_clr_pt_root(dom); - - /* Make changes visible to IOMMUs */ - amd_iommu_domain_update(dom); - /* Page-table is not visible to IOMMU anymore, so free it */ BUG_ON(pgtable->mode < PAGE_MODE_NONE || pgtable->mode > PAGE_MODE_6_LEVEL); free_sub_pt(pgtable->root, pgtable->mode, &freelist); + /* Update data structure */ + amd_iommu_domain_clr_pt_root(dom); + + /* Make changes visible to IOMMUs */ + amd_iommu_domain_update(dom); + put_pages_list(&freelist); } From 9a5adeb28b77416446658e75bdef3bbe5fb92a83 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 14 Feb 2022 13:57:11 +0100 Subject: [PATCH 113/773] ALSA: usb-audio: Don't abort resume upon errors The default mixer resume code treats the errors at restoring the modified mixer items as a fatal error, and it returns back to the caller. This ends up in the resume failure, and the device will be come unavailable, although basically those errors are intermittent and can be safely ignored. The problem itself has been present from the beginning, but it didn't hit usually because the code tries to resume only the modified items. But now with the recent commit to forcibly initialize each item at the probe time, the problem surfaced more often, hence it appears as a regression. This patch fixes the regression simply by ignoring the errors at resume. Fixes: b96681bd5827 ("ALSA: usb-audio: Initialize every feature unit once at probe time") Cc: BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215561 Link: https://lore.kernel.org/r/20220214125711.20531-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 630766ba259f..a5641956ef10 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -3678,17 +3678,14 @@ static int restore_mixer_value(struct usb_mixer_elem_list *list) err = snd_usb_set_cur_mix_value(cval, c + 1, idx, cval->cache_val[idx]); if (err < 0) - return err; + break; } idx++; } } else { /* master */ - if (cval->cached) { - err = snd_usb_set_cur_mix_value(cval, 0, 0, *cval->cache_val); - if (err < 0) - return err; - } + if (cval->cached) + snd_usb_set_cur_mix_value(cval, 0, 0, *cval->cache_val); } return 0; From 2a845837e3d0ddaed493b4c5c4643d7f0542804d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 14 Feb 2022 14:04:10 +0100 Subject: [PATCH 114/773] ALSA: hda/realtek: Fix deadlock by COEF mutex The recently introduced coef_mutex for Realtek codec seems causing a deadlock when the relevant code is invoked from the power-off state; then the HD-audio core tries to power-up internally, and this kicks off the codec runtime PM code that tries to take the same coef_mutex. In order to avoid the deadlock, do the temporary power up/down around the coef_mutex acquisition and release. This assures that the power-up sequence runs before the mutex, hence no re-entrance will happen. Fixes: b837a9f5ab3b ("ALSA: hda: realtek: Fix race at concurrent COEF updates") Reported-and-tested-by: Julian Wollrath Cc: Link: https://lore.kernel.org/r/20220214132838.4db10fca@schienar Link: https://lore.kernel.org/r/20220214130410.21230-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 39 +++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9473fb76ff19..3a42457984e9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -138,6 +138,22 @@ struct alc_spec { * COEF access helper functions */ +static void coef_mutex_lock(struct hda_codec *codec) +{ + struct alc_spec *spec = codec->spec; + + snd_hda_power_up_pm(codec); + mutex_lock(&spec->coef_mutex); +} + +static void coef_mutex_unlock(struct hda_codec *codec) +{ + struct alc_spec *spec = codec->spec; + + mutex_unlock(&spec->coef_mutex); + snd_hda_power_down_pm(codec); +} + static int __alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, unsigned int coef_idx) { @@ -151,12 +167,11 @@ static int __alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, static int alc_read_coefex_idx(struct hda_codec *codec, hda_nid_t nid, unsigned int coef_idx) { - struct alc_spec *spec = codec->spec; unsigned int val; - mutex_lock(&spec->coef_mutex); + coef_mutex_lock(codec); val = __alc_read_coefex_idx(codec, nid, coef_idx); - mutex_unlock(&spec->coef_mutex); + coef_mutex_unlock(codec); return val; } @@ -173,11 +188,9 @@ static void __alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, static void alc_write_coefex_idx(struct hda_codec *codec, hda_nid_t nid, unsigned int coef_idx, unsigned int coef_val) { - struct alc_spec *spec = codec->spec; - - mutex_lock(&spec->coef_mutex); + coef_mutex_lock(codec); __alc_write_coefex_idx(codec, nid, coef_idx, coef_val); - mutex_unlock(&spec->coef_mutex); + coef_mutex_unlock(codec); } #define alc_write_coef_idx(codec, coef_idx, coef_val) \ @@ -198,11 +211,9 @@ static void alc_update_coefex_idx(struct hda_codec *codec, hda_nid_t nid, unsigned int coef_idx, unsigned int mask, unsigned int bits_set) { - struct alc_spec *spec = codec->spec; - - mutex_lock(&spec->coef_mutex); + coef_mutex_lock(codec); __alc_update_coefex_idx(codec, nid, coef_idx, mask, bits_set); - mutex_unlock(&spec->coef_mutex); + coef_mutex_unlock(codec); } #define alc_update_coef_idx(codec, coef_idx, mask, bits_set) \ @@ -235,9 +246,7 @@ struct coef_fw { static void alc_process_coef_fw(struct hda_codec *codec, const struct coef_fw *fw) { - struct alc_spec *spec = codec->spec; - - mutex_lock(&spec->coef_mutex); + coef_mutex_lock(codec); for (; fw->nid; fw++) { if (fw->mask == (unsigned short)-1) __alc_write_coefex_idx(codec, fw->nid, fw->idx, fw->val); @@ -245,7 +254,7 @@ static void alc_process_coef_fw(struct hda_codec *codec, __alc_update_coefex_idx(codec, fw->nid, fw->idx, fw->mask, fw->val); } - mutex_unlock(&spec->coef_mutex); + coef_mutex_unlock(codec); } /* From c49ae619905eebd3f54598a84e4cd2bd58ba8fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 14 Feb 2022 12:02:28 +0100 Subject: [PATCH 115/773] PCI: mvebu: Fix device enumeration regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jan reported that on Turris Omnia (Armada 385), no PCIe devices were detected after upgrading from v5.16.1 to v5.16.3 and identified the cause as the backport of 91a8d79fc797 ("PCI: mvebu: Fix configuring secondary bus of PCIe Root Port via emulated bridge"), which appeared in v5.17-rc1. 91a8d79fc797 was incorrectly applied from mailing list patch [1] to the linux git repository [2] probably due to resolving merge conflicts incorrectly. Fix it now. [1] https://lore.kernel.org/r/20211125124605.25915-12-pali@kernel.org [2] https://git.kernel.org/linus/91a8d79fc797 [bhelgaas: commit log] BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215540 Fixes: 91a8d79fc797 ("PCI: mvebu: Fix configuring secondary bus of PCIe Root Port via emulated bridge") Link: https://lore.kernel.org/r/20220214110228.25825-1-pali@kernel.org Link: https://lore.kernel.org/r/20220127234917.GA150851@bhelgaas Reported-by: Jan Palus Signed-off-by: Pali Rohár Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-mvebu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index 71258ea3d35f..f8e82c5e2d87 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -1329,7 +1329,8 @@ static int mvebu_pcie_probe(struct platform_device *pdev) * indirectly via kernel emulated PCI bridge driver. */ mvebu_pcie_setup_hw(port); - mvebu_pcie_set_local_dev_nr(port, 0); + mvebu_pcie_set_local_dev_nr(port, 1); + mvebu_pcie_set_local_bus_nr(port, 0); } pcie->nports = i; From 0c6f4ebf8835d01866eb686d47578cde80097981 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 14 Feb 2022 08:40:52 +1000 Subject: [PATCH 116/773] cifs: modefromsids must add an ACE for authenticated users When we create a file with modefromsids we set an ACL that has one ACE for the magic modefromsid as well as a second ACE that grants full access to all authenticated users. When later we chante the mode on the file we strip away this, and other, ACE for authenticated users in set_chmod_dacl() and then just add back/update the modefromsid ACE. Thus leaving the file with a single ACE that is for the mode and no ACE to grant any user any rights to access the file. Fix this by always adding back also the modefromsid ACE so that we do not drop the rights to access the file. Signed-off-by: Ronnie Sahlberg Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 5df21d63dd04..bf861fef2f0c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -949,6 +949,9 @@ static void populate_new_aces(char *nacl_base, pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += setup_authusers_ACE(pnntace); + num_aces++; goto set_size; } @@ -1613,7 +1616,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, nsecdesclen = secdesclen; if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ if (mode_from_sid) - nsecdesclen += sizeof(struct cifs_ace); + nsecdesclen += 2 * sizeof(struct cifs_ace); else /* cifsacl */ nsecdesclen += 5 * sizeof(struct cifs_ace); } else { /* chown */ From 21bffcb76ee2fbafc7d5946cef10abc9df5cfff7 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 10 Feb 2022 12:30:49 -0800 Subject: [PATCH 117/773] selftests/seccomp: Fix seccomp failure by adding missing headers seccomp_bpf failed on tests 47 global.user_notification_filter_empty and 48 global.user_notification_filter_empty_threaded when it's tested on updated kernel but with old kernel headers. Because old kernel headers don't have definition of macro __NR_clone3 which is required for these two tests. Since under selftests/, we can install headers once for all tests (the default INSTALL_HDR_PATH is usr/include), fix it by adding usr/include to the list of directories to be searched. Use "-isystem" to indicate it's a system directory as the real kernel headers directories are. Signed-off-by: Sherry Yang Tested-by: Sherry Yang Reviewed-by: Kees Cook Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile index 0ebfe8b0e147..585f7a0c10cb 100644 --- a/tools/testing/selftests/seccomp/Makefile +++ b/tools/testing/selftests/seccomp/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -Wl,-no-as-needed -Wall +CFLAGS += -Wl,-no-as-needed -Wall -isystem ../../../../usr/include/ LDFLAGS += -lpthread TEST_GEN_PROGS := seccomp_bpf seccomp_benchmark From 4f6de676d94ee8ddfc2e7e7cd935fc7cb2feff3a Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Mon, 14 Feb 2022 18:56:43 +0100 Subject: [PATCH 118/773] arm64: Correct wrong label in macro __init_el2_gicv3 In commit: 114945d84a30a5fe ("arm64: Fix labels in el2_setup macros") We renamed a label from '1' to '.Lskip_gicv3_\@', but failed to update a branch to it, which now targets a later label also called '1'. The branch is taken rarely, when GICv3 is present but SRE is disabled at EL3, causing a boot-time crash. Update the caller to the new label name. Fixes: 114945d84a30 ("arm64: Fix labels in el2_setup macros") Cc: # 5.12.x Signed-off-by: Joakim Tjernlund Link: https://lore.kernel.org/r/20220214175643.21931-1-joakim.tjernlund@infinera.com Reviewed-by: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/el2_setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 3198acb2aad8..7f3c87f7a0ce 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -106,7 +106,7 @@ msr_s SYS_ICC_SRE_EL2, x0 isb // Make sure SRE is now set mrs_s x0, SYS_ICC_SRE_EL2 // Read SRE back, - tbz x0, #0, 1f // and check that it sticks + tbz x0, #0, .Lskip_gicv3_\@ // and check that it sticks msr_s SYS_ICH_HCR_EL2, xzr // Reset ICC_HCR_EL2 to defaults .Lskip_gicv3_\@: .endm From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Fri, 21 Jan 2022 18:12:10 +0800 Subject: [PATCH 119/773] cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As previously discussed(https://lkml.org/lkml/2022/1/20/51), cpuset_attach() is affected with similar cpu hotplug race, as follow scenario: cpuset_attach() cpu hotplug --------------------------- ---------------------- down_write(cpuset_rwsem) guarantee_online_cpus() // (load cpus_attach) sched_cpu_deactivate set_cpu_active() // will change cpu_active_mask set_cpus_allowed_ptr(cpus_attach) __set_cpus_allowed_ptr_locked() // (if the intersection of cpus_attach and cpu_active_mask is empty, will return -EINVAL) up_write(cpuset_rwsem) To avoid races such as described above, protect cpuset_attach() call with cpu_hotplug_lock. Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time") Cc: stable@vger.kernel.org # v2.6.32+ Reported-by: Zhao Gongyi Signed-off-by: Zhang Qiao Acked-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4c7254e8f49a..97c53f3cc917 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); + cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ From 9d047bf68fe8cdb4086deaf4edd119731a9481ed Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Feb 2022 12:14:44 -0500 Subject: [PATCH 120/773] NFS: Remove an incorrect revalidation in nfs4_update_changeattr_locked() In nfs4_update_changeattr_locked(), we don't need to set the NFS_INO_REVAL_PAGECACHE flag, because we already know the value of the change attribute, and we're already flagging the size. In fact, this forces us to revalidate the change attribute a second time for no good reason. This extra flag appears to have been introduced as part of the xattr feature, when update_changeattr_locked() was converted for use by the xattr code. Fixes: 1b523ca972ed ("nfs: modify update_changeattr to deal with regular files") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f5020828ab65..0e0db6c27619 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1229,8 +1229,7 @@ nfs4_update_changeattr_locked(struct inode *inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | - NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR | - NFS_INO_REVAL_PAGECACHE; + NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR; nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = jiffies; From e0caaf75d443e02e55e146fd75fe2efc8aed5540 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Feb 2022 13:38:23 -0500 Subject: [PATCH 121/773] NFS: LOOKUP_DIRECTORY is also ok with symlinks Commit ac795161c936 (NFSv4: Handle case where the lookup of a directory fails) [1], part of Linux since 5.17-rc2, introduced a regression, where a symbolic link on an NFS mount to a directory on another NFS does not resolve(?) the first time it is accessed: Reported-by: Paul Menzel Fixes: ac795161c936 ("NFSv4: Handle case where the lookup of a directory fails") Signed-off-by: Trond Myklebust Tested-by: Donald Buczek Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7bc7cf6b26f0..75cb1cbe4cde 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2010,14 +2010,14 @@ no_open: if (!res) { inode = d_inode(dentry); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) res = ERR_PTR(-ENOTDIR); else if (inode && S_ISREG(inode->i_mode)) res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && - !S_ISDIR(inode->i_mode)) { + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { dput(res); res = ERR_PTR(-ENOTDIR); } else if (inode && S_ISREG(inode->i_mode)) { From 12f4a665cc3568328728e46c3162873b5b69cd27 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 31 Jan 2022 14:26:20 +0100 Subject: [PATCH 122/773] RISC-V: Fix hartid mask handling for hartid 31 and up Jessica reports that using "1 << hartid" causes undefined behavior for hartid 31 and up. Fix this by using the BIT() helper instead of an explicit shift. Reported-by: Jessica Clarke Fixes: 26fb751ca37846c9 ("RISC-V: Do not use cpumask data structure for hartid bitmap") Signed-off-by: Geert Uytterhoeven Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index f72527fcb347..f93bc75f2bc4 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -5,6 +5,7 @@ * Copyright (c) 2020 Western Digital Corporation or its affiliates. */ +#include #include #include #include @@ -85,7 +86,7 @@ static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mas pr_warn("Unable to send any request to hartid > BITS_PER_LONG for SBI v0.1\n"); break; } - hmask |= 1 << hartid; + hmask |= BIT(hartid); } return hmask; @@ -268,7 +269,7 @@ static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) } if (!hmask) hbase = hartid; - hmask |= 1UL << (hartid - hbase); + hmask |= BIT(hartid - hbase); } if (hmask) { @@ -362,7 +363,7 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, } if (!hmask) hbase = hartid; - hmask |= 1UL << (hartid - hbase); + hmask |= BIT(hartid - hbase); } if (hmask) { From 2b35d5b7d13062b805aa82dc53812a5f56249287 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 31 Jan 2022 14:26:21 +0100 Subject: [PATCH 123/773] RISC-V: Fix handling of empty cpu masks The cpumask rework slightly changed the behavior of the code. Fix this by treating an empty cpumask as meaning all online CPUs. Extracted from a patch by Atish Patra . Reported-by: Jessica Clarke Fixes: 26fb751ca37846c9 ("RISC-V: Do not use cpumask data structure for hartid bitmap") Signed-off-by: Geert Uytterhoeven Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index f93bc75f2bc4..22444cfcd56c 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -161,7 +161,7 @@ static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { unsigned long hart_mask; - if (!cpu_mask) + if (!cpu_mask || cpumask_empty(cpu_mask)) cpu_mask = cpu_online_mask; hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); @@ -177,7 +177,7 @@ static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, int result = 0; unsigned long hart_mask; - if (!cpu_mask) + if (!cpu_mask || cpumask_empty(cpu_mask)) cpu_mask = cpu_online_mask; hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); @@ -254,7 +254,7 @@ static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) struct sbiret ret = {0}; int result; - if (!cpu_mask) + if (!cpu_mask || cpumask_empty(cpu_mask)) cpu_mask = cpu_online_mask; for_each_cpu(cpuid, cpu_mask) { @@ -348,7 +348,7 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, unsigned long hartid, cpuid, hmask = 0, hbase = 0; int result; - if (!cpu_mask) + if (!cpu_mask || cpumask_empty(cpu_mask)) cpu_mask = cpu_online_mask; for_each_cpu(cpuid, cpu_mask) { From 5feef64f4c67068c49f5409d43c67cabf2327f66 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 31 Jan 2022 14:26:22 +0100 Subject: [PATCH 124/773] RISC-V: Fix IPI/RFENCE hmask on non-monotonic hartid ordering If the boot CPU does not have the lowest hartid, "hartid - hbase" can become negative, leading to an incorrect hmask, causing userspace to crash with SEGV. This is observed on e.g. Starlight Beta, where cpuid 1 maps to hartid 0, and cpuid 0 maps to hartid 1. Fix this by detecting this case, and shifting the accumulated mask and updating hbase, if possible. Fixes: 26fb751ca37846c9 ("RISC-V: Do not use cpumask data structure for hartid bitmap") Signed-off-by: Geert Uytterhoeven Reviewed-by: Atish Patra Tested-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 57 ++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 22444cfcd56c..775d3322b422 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -250,7 +250,7 @@ static void __sbi_set_timer_v02(uint64_t stime_value) static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) { - unsigned long hartid, cpuid, hmask = 0, hbase = 0; + unsigned long hartid, cpuid, hmask = 0, hbase = 0, htop = 0; struct sbiret ret = {0}; int result; @@ -259,16 +259,27 @@ static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) for_each_cpu(cpuid, cpu_mask) { hartid = cpuid_to_hartid_map(cpuid); - if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { - ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask, hbase, 0, 0, 0, 0); - if (ret.error) - goto ecall_failed; - hmask = 0; - hbase = 0; + if (hmask) { + if (hartid + BITS_PER_LONG <= htop || + hbase + BITS_PER_LONG <= hartid) { + ret = sbi_ecall(SBI_EXT_IPI, + SBI_EXT_IPI_SEND_IPI, hmask, + hbase, 0, 0, 0, 0); + if (ret.error) + goto ecall_failed; + hmask = 0; + } else if (hartid < hbase) { + /* shift the mask to fit lower hartid */ + hmask <<= hbase - hartid; + hbase = hartid; + } } - if (!hmask) + if (!hmask) { hbase = hartid; + htop = hartid; + } else if (hartid > htop) { + htop = hartid; + } hmask |= BIT(hartid - hbase); } @@ -345,7 +356,7 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { - unsigned long hartid, cpuid, hmask = 0, hbase = 0; + unsigned long hartid, cpuid, hmask = 0, hbase = 0, htop = 0; int result; if (!cpu_mask || cpumask_empty(cpu_mask)) @@ -353,16 +364,26 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, for_each_cpu(cpuid, cpu_mask) { hartid = cpuid_to_hartid_map(cpuid); - if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { - result = __sbi_rfence_v02_call(fid, hmask, hbase, - start, size, arg4, arg5); - if (result) - return result; - hmask = 0; - hbase = 0; + if (hmask) { + if (hartid + BITS_PER_LONG <= htop || + hbase + BITS_PER_LONG <= hartid) { + result = __sbi_rfence_v02_call(fid, hmask, + hbase, start, size, arg4, arg5); + if (result) + return result; + hmask = 0; + } else if (hartid < hbase) { + /* shift the mask to fit lower hartid */ + hmask <<= hbase - hartid; + hbase = hartid; + } } - if (!hmask) + if (!hmask) { hbase = hartid; + htop = hartid; + } else if (hartid > htop) { + htop = hartid; + } hmask |= BIT(hartid - hbase); } From 6fec1ab67f8d60704cc7de64abcfd389ab131542 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 14 Feb 2022 09:36:57 +0100 Subject: [PATCH 125/773] selftests/ftrace: Do not trace do_softirq because of PREEMPT_RT The PREEMPT_RT patchset does not use do_softirq() function thus trying to filter for do_softirq fails for such kernel: echo do_softirq ftracetest: 81: echo: echo: I/O error Choose some other visible function for the test. The function does not have to be actually executed during the test, because it is only testing filter API interface. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index e96e279e0533..25432b8cd5bd 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -19,7 +19,7 @@ fail() { # mesg FILTER=set_ftrace_filter FUNC1="schedule" -FUNC2="do_softirq" +FUNC2="scheduler_tick" ALL_FUNCS="#### all functions enabled ####" From 7f4c5a26f735dea4bbc0eb8eb9da99cda95a8563 Mon Sep 17 00:00:00 2001 From: James Smart Date: Sat, 12 Feb 2022 08:31:20 -0800 Subject: [PATCH 126/773] scsi: lpfc: Fix pt2pt NVMe PRLI reject LOGO loop When connected point to point, the driver does not know the FC4's supported by the other end. In Fabrics, it can query the nameserver. Thus the driver must send PRLIs for the FC4s it supports and enable support based on the acc(ept) or rej(ect) of the respective FC4 PRLI. Currently the driver supports SCSI and NVMe PRLIs. Unfortunately, although the behavior is per standard, many devices have come to expect only SCSI PRLIs. In this particular example, the NVMe PRLI is properly RJT'd but the target decided that it must LOGO after seeing the unexpected NVMe PRLI. The LOGO causes the sequence to restart and login is now in an infinite failure loop. Fix the problem by having the driver, on a pt2pt link, remember NVMe PRLI accept or reject status across logout as long as the link stays "up". When retrying login, if the prior NVMe PRLI was rejected, it will not be sent on the next login. Link: https://lore.kernel.org/r/20220212163120.15385-1-jsmart2021@gmail.com Cc: # v5.4+ Reviewed-by: Ewan D. Milne Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc.h | 1 + drivers/scsi/lpfc/lpfc_attr.c | 3 +++ drivers/scsi/lpfc/lpfc_els.c | 20 +++++++++++++++++++- drivers/scsi/lpfc/lpfc_nportdisc.c | 5 +++-- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index a1e0a106c132..98cabe09c040 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -592,6 +592,7 @@ struct lpfc_vport { #define FC_VPORT_LOGO_RCVD 0x200 /* LOGO received on vport */ #define FC_RSCN_DISCOVERY 0x400 /* Auth all devices after RSCN */ #define FC_LOGO_RCVD_DID_CHNG 0x800 /* FDISC on phys port detect DID chng*/ +#define FC_PT2PT_NO_NVME 0x1000 /* Don't send NVME PRLI */ #define FC_SCSI_SCAN_TMO 0x4000 /* scsi scan timer running */ #define FC_ABORT_DISCOVERY 0x8000 /* we want to abort discovery */ #define FC_NDISC_ACTIVE 0x10000 /* NPort discovery active */ diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index bac78fbce8d6..fa8415259cb8 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -1315,6 +1315,9 @@ lpfc_issue_lip(struct Scsi_Host *shost) pmboxq->u.mb.mbxCommand = MBX_DOWN_LINK; pmboxq->u.mb.mbxOwner = OWN_HOST; + if ((vport->fc_flag & FC_PT2PT) && (vport->fc_flag & FC_PT2PT_NO_NVME)) + vport->fc_flag &= ~FC_PT2PT_NO_NVME; + mbxstatus = lpfc_sli_issue_mbox_wait(phba, pmboxq, LPFC_MBOX_TMO * 2); if ((mbxstatus == MBX_SUCCESS) && diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index db5ccae1b63d..f936833c9909 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -1072,7 +1072,8 @@ stop_rr_fcf_flogi: /* FLOGI failed, so there is no fabric */ spin_lock_irq(shost->host_lock); - vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP); + vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP | + FC_PT2PT_NO_NVME); spin_unlock_irq(shost->host_lock); /* If private loop, then allow max outstanding els to be @@ -4607,6 +4608,23 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, /* Added for Vendor specifc support * Just keep retrying for these Rsn / Exp codes */ + if ((vport->fc_flag & FC_PT2PT) && + cmd == ELS_CMD_NVMEPRLI) { + switch (stat.un.b.lsRjtRsnCode) { + case LSRJT_UNABLE_TPC: + case LSRJT_INVALID_CMD: + case LSRJT_LOGICAL_ERR: + case LSRJT_CMD_UNSUPPORTED: + lpfc_printf_vlog(vport, KERN_WARNING, LOG_ELS, + "0168 NVME PRLI LS_RJT " + "reason %x port doesn't " + "support NVME, disabling NVME\n", + stat.un.b.lsRjtRsnCode); + retry = 0; + vport->fc_flag |= FC_PT2PT_NO_NVME; + goto out_retry; + } + } switch (stat.un.b.lsRjtRsnCode) { case LSRJT_UNABLE_TPC: /* The driver has a VALID PLOGI but the rport has diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index 7d717a4ac14d..fdf5e777bf11 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -1961,8 +1961,9 @@ lpfc_cmpl_reglogin_reglogin_issue(struct lpfc_vport *vport, * is configured try it. */ ndlp->nlp_fc4_type |= NLP_FC4_FCP; - if ((vport->cfg_enable_fc4_type == LPFC_ENABLE_BOTH) || - (vport->cfg_enable_fc4_type == LPFC_ENABLE_NVME)) { + if ((!(vport->fc_flag & FC_PT2PT_NO_NVME)) && + (vport->cfg_enable_fc4_type == LPFC_ENABLE_BOTH || + vport->cfg_enable_fc4_type == LPFC_ENABLE_NVME)) { ndlp->nlp_fc4_type |= NLP_FC4_NVME; /* We need to update the localport also */ lpfc_nvme_update_localport(vport); From 10af115646171afc0217177d6eae92917b785897 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Mon, 14 Feb 2022 19:33:52 +0900 Subject: [PATCH 127/773] scsi: ufs: core: Fix divide by zero in ufshcd_map_queues() Before calling blk_mq_map_queues(), the mq_map and nr_queues belonging to struct blk_mq_queue_map must have a valid value. If nr_queues is set to 0, the system may encounter a divide by zero depending on the type of architecture. blk_mq_map_queues() -> queue_index() Link: https://lore.kernel.org/r/1891546521.01644873481638.JavaMail.epsvc@epcpadp4 Reviewed-by: Bart Van Assche Signed-off-by: Jinyoung Choi Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 50b12d60dc1b..9349557b8a01 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -2681,7 +2681,7 @@ static int ufshcd_map_queues(struct Scsi_Host *shost) break; case HCTX_TYPE_READ: map->nr_queues = 0; - break; + continue; default: WARN_ON_ONCE(true); } From 382e3e0eb6a83f1cf73d4dfa3448ade1ed721f22 Mon Sep 17 00:00:00 2001 From: Steev Klimaszewski Date: Thu, 4 Nov 2021 22:52:32 -0500 Subject: [PATCH 128/773] arm64: dts: qcom: c630: disable crypto due to serror Disable the crypto block due to it causing an SError in qce_start() on the C630, which happens upon every boot when cryptomanager tests are enabled. Signed-off-by: Steev Klimaszewski [bjorn: Reworked commit message] Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20211105035235.2392-1-steev@kali.org --- arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 58845a14805f..e2b9ec134cb1 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -807,3 +807,8 @@ qcom,snoc-host-cap-8bit-quirk; }; + +&crypto { + /* FIXME: qce_start triggers an SError */ + status= "disable"; +}; From 3c62fd3406e0b2277c76a6984d3979c7f3f1d129 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 5 Feb 2022 07:58:44 +0100 Subject: [PATCH 129/773] dmaengine: ptdma: Fix the error handling path in pt_core_init() In order to free resources correctly in the error handling path of pt_core_init(), 2 goto's have to be switched. Otherwise, some resources will leak and we will try to release things that have not been allocated yet. Also move a dev_err() to a place where it is more meaningful. Fixes: fa5d823b16a9 ("dmaengine: ptdma: Initial driver for the AMD PTDMA") Signed-off-by: Christophe JAILLET Acked-by: Sanjay R Mehta Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/41a963a35173f89c874f5c44df5530dc09fea8da.1644044244.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/dma/ptdma/ptdma-dev.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/dma/ptdma/ptdma-dev.c b/drivers/dma/ptdma/ptdma-dev.c index 8a6bf291a73f..daafea5bc35d 100644 --- a/drivers/dma/ptdma/ptdma-dev.c +++ b/drivers/dma/ptdma/ptdma-dev.c @@ -207,7 +207,7 @@ int pt_core_init(struct pt_device *pt) if (!cmd_q->qbase) { dev_err(dev, "unable to allocate command queue\n"); ret = -ENOMEM; - goto e_dma_alloc; + goto e_destroy_pool; } cmd_q->qidx = 0; @@ -229,8 +229,10 @@ int pt_core_init(struct pt_device *pt) /* Request an irq */ ret = request_irq(pt->pt_irq, pt_core_irq_handler, 0, dev_name(pt->dev), pt); - if (ret) - goto e_pool; + if (ret) { + dev_err(dev, "unable to allocate an IRQ\n"); + goto e_free_dma; + } /* Update the device registers with queue information. */ cmd_q->qcontrol &= ~CMD_Q_SIZE; @@ -250,21 +252,20 @@ int pt_core_init(struct pt_device *pt) /* Register the DMA engine support */ ret = pt_dmaengine_register(pt); if (ret) - goto e_dmaengine; + goto e_free_irq; /* Set up debugfs entries */ ptdma_debugfs_setup(pt); return 0; -e_dmaengine: +e_free_irq: free_irq(pt->pt_irq, pt); -e_dma_alloc: +e_free_dma: dma_free_coherent(dev, cmd_q->qsize, cmd_q->qbase, cmd_q->qbase_dma); -e_pool: - dev_err(dev, "unable to allocate an IRQ\n"); +e_destroy_pool: dma_pool_destroy(pt->cmd_q.dma_pool); return ret; From aa7accb7f91c4c2c98bfdde62446d96ecc1ef2c6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 7 Jan 2022 10:40:47 +0800 Subject: [PATCH 130/773] dmaengine: at_xdmac: Fix missing unlock in at_xdmac_tasklet() Add the missing unlock before return from at_xdmac_tasklet(). Fixes: e77e561925df ("dmaengine: at_xdmac: Fix race over irq_status") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20220107024047.1051915-1-yangyingliang@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index a1da2b4b6d73..1476156af74b 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1681,8 +1681,10 @@ static void at_xdmac_tasklet(struct tasklet_struct *t) __func__, atchan->irq_status); if (!(atchan->irq_status & AT_XDMAC_CIS_LIS) && - !(atchan->irq_status & error_mask)) + !(atchan->irq_status & error_mask)) { + spin_unlock_irq(&atchan->lock); return; + } if (atchan->irq_status & error_mask) at_xdmac_handle_error(atchan); From 2d21543efe332cd8c8f212fb7d365bc8b0690bfa Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 6 Jan 2022 11:09:39 +0800 Subject: [PATCH 131/773] dmaengine: sh: rcar-dmac: Check for error num after setting mask Because of the possible failure of the dma_supported(), the dma_set_mask_and_coherent() may return error num. Therefore, it should be better to check it and return the error if fails. Fixes: dc312349e875 ("dmaengine: rcar-dmac: Widen DMA mask to 40 bits") Signed-off-by: Jiasheng Jiang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220106030939.2644320-1-jiasheng@iscas.ac.cn Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 481f45c77ce1..4a34ddd9c1c6 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1869,7 +1869,9 @@ static int rcar_dmac_probe(struct platform_device *pdev) dmac->dev = &pdev->dev; platform_set_drvdata(pdev, dmac); dma_set_max_seg_size(dmac->dev, RCAR_DMATCR_MASK); - dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); + ret = dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); + if (ret) + return ret; ret = rcar_dmac_parse_of(&pdev->dev, dmac); if (ret < 0) From e831c7aba950f3ae94002b10321279654525e5ec Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Sat, 8 Jan 2022 08:53:36 +0000 Subject: [PATCH 132/773] dmaengine: stm32-dmamux: Fix PM disable depth imbalance in stm32_dmamux_probe The pm_runtime_enable will increase power disable depth. If the probe fails, we should use pm_runtime_disable() to balance pm_runtime_enable(). Fixes: 4f3ceca254e0 ("dmaengine: stm32-dmamux: Add PM Runtime support") Signed-off-by: Miaoqian Lin Reviewed-by: Amelie Delaunay Link: https://lore.kernel.org/r/20220108085336.11992-1-linmq006@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/stm32-dmamux.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/stm32-dmamux.c b/drivers/dma/stm32-dmamux.c index a42164389ebc..d5d55732adba 100644 --- a/drivers/dma/stm32-dmamux.c +++ b/drivers/dma/stm32-dmamux.c @@ -292,10 +292,12 @@ static int stm32_dmamux_probe(struct platform_device *pdev) ret = of_dma_router_register(node, stm32_dmamux_route_allocate, &stm32_dmamux->dmarouter); if (ret) - goto err_clk; + goto pm_disable; return 0; +pm_disable: + pm_runtime_disable(&pdev->dev); err_clk: clk_disable_unprepare(stm32_dmamux->clk); From da2ad87fba0891576aadda9161b8505fde81a84d Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 11 Jan 2022 09:12:39 +0800 Subject: [PATCH 133/773] dmaengine: sh: rcar-dmac: Check for error num after dma_set_max_seg_size As the possible failure of the dma_set_max_seg_size(), it should be better to check the return value of the dma_set_max_seg_size(). Fixes: 97d49c59e219 ("dmaengine: rcar-dmac: set scatter/gather max segment size") Reported-by: Geert Uytterhoeven Signed-off-by: Jiasheng Jiang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220111011239.452837-1-jiasheng@iscas.ac.cn Signed-off-by: Vinod Koul --- drivers/dma/sh/rcar-dmac.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index 4a34ddd9c1c6..13d12d660cc2 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -1868,7 +1868,10 @@ static int rcar_dmac_probe(struct platform_device *pdev) dmac->dev = &pdev->dev; platform_set_drvdata(pdev, dmac); - dma_set_max_seg_size(dmac->dev, RCAR_DMATCR_MASK); + ret = dma_set_max_seg_size(dmac->dev, RCAR_DMATCR_MASK); + if (ret) + return ret; + ret = dma_set_mask_and_coherent(dmac->dev, DMA_BIT_MASK(40)); if (ret) return ret; From 455896c53d5b803733ddd84e1bf8a430644439b6 Mon Sep 17 00:00:00 2001 From: Yongzhi Liu Date: Sat, 15 Jan 2022 21:34:56 -0800 Subject: [PATCH 134/773] dmaengine: shdma: Fix runtime PM imbalance on error pm_runtime_get_() increments the runtime PM usage counter even when it returns an error code, thus a matching decrement is needed on the error handling path to keep the counter balanced. Signed-off-by: Yongzhi Liu Link: https://lore.kernel.org/r/1642311296-87020-1-git-send-email-lyz_cs@pku.edu.cn Signed-off-by: Vinod Koul --- drivers/dma/sh/shdma-base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index 158e5e7defae..b26ed690f03c 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -115,8 +115,10 @@ static dma_cookie_t shdma_tx_submit(struct dma_async_tx_descriptor *tx) ret = pm_runtime_get(schan->dev); spin_unlock_irq(&schan->chan_lock); - if (ret < 0) + if (ret < 0) { dev_err(schan->dev, "%s(): GET = %d\n", __func__, ret); + pm_runtime_put(schan->dev); + } pm_runtime_barrier(schan->dev); From 0b0dcb3882c8f08bdeafa03adb4487e104d26050 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 12 Feb 2022 20:45:48 +0100 Subject: [PATCH 135/773] i2c: cadence: allow COMPILE_TEST Driver builds fine with COMPILE_TEST. Enable it for wider test coverage and easier maintenance. Signed-off-by: Wolfram Sang Acked-by: Michal Simek --- drivers/i2c/busses/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 42da31c1ab70..bad2fadc94a3 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -488,7 +488,7 @@ config I2C_BRCMSTB config I2C_CADENCE tristate "Cadence I2C Controller" - depends on ARCH_ZYNQ || ARM64 || XTENSA + depends on ARCH_ZYNQ || ARM64 || XTENSA || COMPILE_TEST help Say yes here to select Cadence I2C Host Controller. This controller is e.g. used by Xilinx Zynq. From 2ce4462f2724d1b3cedccea441c6d18bb360629a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 12 Feb 2022 20:46:57 +0100 Subject: [PATCH 136/773] i2c: imx: allow COMPILE_TEST Driver builds fine with COMPILE_TEST. Enable it for wider test coverage and easier maintenance. Signed-off-by: Wolfram Sang Acked-by: Oleksij Rempel --- drivers/i2c/busses/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index bad2fadc94a3..255fd4198433 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -680,7 +680,7 @@ config I2C_IMG config I2C_IMX tristate "IMX I2C interface" - depends on ARCH_MXC || ARCH_LAYERSCAPE || COLDFIRE + depends on ARCH_MXC || ARCH_LAYERSCAPE || COLDFIRE || COMPILE_TEST select I2C_SLAVE help Say Y here if you want to use the IIC bus controller on From 5de717974005fcad2502281e9f82e139ca91f4bb Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 12 Feb 2022 20:47:07 +0100 Subject: [PATCH 137/773] i2c: qup: allow COMPILE_TEST Driver builds fine with COMPILE_TEST. Enable it for wider test coverage and easier maintenance. Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 255fd4198433..8a6c6ee28556 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -935,7 +935,7 @@ config I2C_QCOM_GENI config I2C_QUP tristate "Qualcomm QUP based I2C controller" - depends on ARCH_QCOM + depends on ARCH_QCOM || COMPILE_TEST help If you say yes to this option, support will be included for the built-in I2C interface on the Qualcomm SoCs. From fe663df7825811358531dc2e8a52d9eaa5e3515e Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 11 Feb 2022 01:51:13 +0100 Subject: [PATCH 138/773] powerpc/lib/sstep: fix 'ptesync' build error Building tinyconfig with gcc (Debian 11.2.0-16) and assembler (Debian 2.37.90.20220207) the following build error shows up: {standard input}: Assembler messages: {standard input}:2088: Error: unrecognized opcode: `ptesync' make[3]: *** [/builds/linux/scripts/Makefile.build:287: arch/powerpc/lib/sstep.o] Error 1 Add the 'ifdef CONFIG_PPC64' around the 'ptesync' in function 'emulate_update_regs()' to like it is in 'analyse_instr()'. Since it looks like it got dropped inadvertently by commit 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs"). A key detail is that analyse_instr() will never recognise lwsync or ptesync on 32-bit (because of the existing ifdef), and as a result emulate_update_regs() should never be called with an op specifying either of those on 32-bit. So removing them from emulate_update_regs() should be a nop in terms of runtime behaviour. Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs") Cc: stable@vger.kernel.org # v4.14+ Suggested-by: Arnd Bergmann Signed-off-by: Anders Roxell [mpe: Add last paragraph of change log mentioning analyse_instr() details] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220211005113.1361436-1-anders.roxell@linaro.org --- arch/powerpc/lib/sstep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index a94b0cd0bdc5..bd3734d5be89 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -3264,12 +3264,14 @@ void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op) case BARRIER_EIEIO: eieio(); break; +#ifdef CONFIG_PPC64 case BARRIER_LWSYNC: asm volatile("lwsync" : : : "memory"); break; case BARRIER_PTESYNC: asm volatile("ptesync" : : : "memory"); break; +#endif } break; From 038438a25c45d5ac996e95a22fa9e76ff3d1f8c7 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 14 Feb 2022 17:19:48 +0300 Subject: [PATCH 139/773] usb: dwc3: pci: add support for the Intel Raptor Lake-S This patch adds the necessary PCI ID for Intel Raptor Lake-S devices. Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20220214141948.18637-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-pci.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 7ff8fc8f79a9..4e69a9d829f2 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -43,6 +43,7 @@ #define PCI_DEVICE_ID_INTEL_ADLP 0x51ee #define PCI_DEVICE_ID_INTEL_ADLM 0x54ee #define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1 +#define PCI_DEVICE_ID_INTEL_RPLS 0x7a61 #define PCI_DEVICE_ID_INTEL_TGL 0x9a15 #define PCI_DEVICE_ID_AMD_MR 0x163a @@ -409,6 +410,9 @@ static const struct pci_device_id dwc3_pci_id_table[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ADLS), (kernel_ulong_t) &dwc3_pci_intel_swnode, }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_RPLS), + (kernel_ulong_t) &dwc3_pci_intel_swnode, }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_TGL), (kernel_ulong_t) &dwc3_pci_intel_swnode, }, From d7c93a903f33ff35aa0e6b5a8032eb9755b00826 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 13 Feb 2022 14:05:16 +0100 Subject: [PATCH 140/773] usb: dwc3: pci: Add "snps,dis_u2_susphy_quirk" for Intel Bay Trail Commit e0082698b689 ("usb: dwc3: ulpi: conditionally resume ULPI PHY") fixed an issue where ULPI transfers would timeout if any requests where send to the phy sometime after init, giving it enough time to auto-suspend. Commit e5f4ca3fce90 ("usb: dwc3: ulpi: Fix USB2.0 HS/FS/LS PHY suspend regression") changed the behavior to instead of clearing the DWC3_GUSB2PHYCFG_SUSPHY bit, add an extra sleep when it is set. But on Bay Trail devices, when phy_set_mode() gets called during init, this leads to errors like these: [ 28.451522] tusb1210 dwc3.ulpi: error -110 writing val 0x01 to reg 0x0a [ 28.464089] tusb1210 dwc3.ulpi: error -110 writing val 0x01 to reg 0x0a Add "snps,dis_u2_susphy_quirk" to the settings for Bay Trail devices to fix this. This restores the old behavior for Bay Trail devices, since previously the DWC3_GUSB2PHYCFG_SUSPHY bit would get cleared on the first ulpi_read/_write() and then was never set again. Fixes: e5f4ca3fce90 ("usb: dwc3: ulpi: Fix USB2.0 HS/FS/LS PHY suspend regression") Cc: stable@kernel.org Cc: Serge Semin Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220213130524.18748-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-pci.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 4e69a9d829f2..18ab49b8e66e 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -120,6 +120,13 @@ static const struct property_entry dwc3_pci_intel_properties[] = { {} }; +static const struct property_entry dwc3_pci_intel_byt_properties[] = { + PROPERTY_ENTRY_STRING("dr_mode", "peripheral"), + PROPERTY_ENTRY_BOOL("snps,dis_u2_susphy_quirk"), + PROPERTY_ENTRY_BOOL("linux,sysdev_is_parent"), + {} +}; + static const struct property_entry dwc3_pci_mrfld_properties[] = { PROPERTY_ENTRY_STRING("dr_mode", "otg"), PROPERTY_ENTRY_STRING("linux,extcon-name", "mrfld_bcove_pwrsrc"), @@ -162,6 +169,10 @@ static const struct software_node dwc3_pci_intel_swnode = { .properties = dwc3_pci_intel_properties, }; +static const struct software_node dwc3_pci_intel_byt_swnode = { + .properties = dwc3_pci_intel_byt_properties, +}; + static const struct software_node dwc3_pci_intel_mrfld_swnode = { .properties = dwc3_pci_mrfld_properties, }; @@ -345,7 +356,7 @@ static const struct pci_device_id dwc3_pci_id_table[] = { (kernel_ulong_t) &dwc3_pci_intel_swnode, }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BYT), - (kernel_ulong_t) &dwc3_pci_intel_swnode, }, + (kernel_ulong_t) &dwc3_pci_intel_byt_swnode, }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_MRFLD), (kernel_ulong_t) &dwc3_pci_intel_mrfld_swnode, }, From 8b328f8002bcf29ef517ee4bf234e09aabec4d2e Mon Sep 17 00:00:00 2001 From: Puma Hsu Date: Tue, 15 Feb 2022 14:33:19 +0200 Subject: [PATCH 141/773] xhci: re-initialize the HC during resume if HCE was set When HCE(Host Controller Error) is set, it means an internal error condition has been detected. Software needs to re-initialize the HC, so add this check in xhci resume. Cc: stable@vger.kernel.org Signed-off-by: Puma Hsu Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20220215123320.1253947-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index dc357cabb265..04ec2de158bf 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1091,6 +1091,7 @@ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) int retval = 0; bool comp_timer_running = false; bool pending_portevent = false; + bool reinit_xhc = false; if (!hcd->state) return 0; @@ -1107,10 +1108,11 @@ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) set_bit(HCD_FLAG_HW_ACCESSIBLE, &xhci->shared_hcd->flags); spin_lock_irq(&xhci->lock); - if ((xhci->quirks & XHCI_RESET_ON_RESUME) || xhci->broken_suspend) - hibernated = true; - if (!hibernated) { + if (hibernated || xhci->quirks & XHCI_RESET_ON_RESUME || xhci->broken_suspend) + reinit_xhc = true; + + if (!reinit_xhc) { /* * Some controllers might lose power during suspend, so wait * for controller not ready bit to clear, just as in xHC init. @@ -1143,12 +1145,17 @@ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) spin_unlock_irq(&xhci->lock); return -ETIMEDOUT; } - temp = readl(&xhci->op_regs->status); } - /* If restore operation fails, re-initialize the HC during resume */ - if ((temp & STS_SRE) || hibernated) { + temp = readl(&xhci->op_regs->status); + /* re-initialize the HC on Restore Error, or Host Controller Error */ + if (temp & (STS_SRE | STS_HCE)) { + reinit_xhc = true; + xhci_warn(xhci, "xHC error in resume, USBSTS 0x%x, Reinit\n", temp); + } + + if (reinit_xhc) { if ((xhci->quirks & XHCI_COMP_MODE_QUIRK) && !(xhci_all_ports_seen_u0(xhci))) { del_timer_sync(&xhci->comp_mode_recovery_timer); From 243a1dd7ba48c120986dd9e66fee74bcb7751034 Mon Sep 17 00:00:00 2001 From: Hongyu Xie Date: Tue, 15 Feb 2022 14:33:20 +0200 Subject: [PATCH 142/773] xhci: Prevent futile URB re-submissions due to incorrect return value. The -ENODEV return value from xhci_check_args() is incorrectly changed to -EINVAL in a couple places before propagated further. xhci_check_args() returns 4 types of value, -ENODEV, -EINVAL, 1 and 0. xhci_urb_enqueue and xhci_check_streams_endpoint return -EINVAL if the return value of xhci_check_args <= 0. This causes problems for example r8152_submit_rx, calling usb_submit_urb in drivers/net/usb/r8152.c. r8152_submit_rx will never get -ENODEV after submiting an urb when xHC is halted because xhci_urb_enqueue returns -EINVAL in the very beginning. [commit message and header edit -Mathias] Fixes: 203a86613fb3 ("xhci: Avoid NULL pointer deref when host dies.") Cc: stable@vger.kernel.org Signed-off-by: Hongyu Xie Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20220215123320.1253947-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 04ec2de158bf..2d378543bc3a 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1611,9 +1611,12 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag struct urb_priv *urb_priv; int num_tds; - if (!urb || xhci_check_args(hcd, urb->dev, urb->ep, - true, true, __func__) <= 0) + if (!urb) return -EINVAL; + ret = xhci_check_args(hcd, urb->dev, urb->ep, + true, true, __func__); + if (ret <= 0) + return ret ? ret : -EINVAL; slot_id = urb->dev->slot_id; ep_index = xhci_get_endpoint_index(&urb->ep->desc); @@ -3330,7 +3333,7 @@ static int xhci_check_streams_endpoint(struct xhci_hcd *xhci, return -EINVAL; ret = xhci_check_args(xhci_to_hcd(xhci), udev, ep, 1, true, __func__); if (ret <= 0) - return -EINVAL; + return ret ? ret : -EINVAL; if (usb_ss_max_streams(&ep->ss_ep_comp) == 0) { xhci_warn(xhci, "WARN: SuperSpeed Endpoint Companion" " descriptor for ep 0x%x does not support streams\n", From f240762f88b4b1b58561939ffd44837759756477 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2022 20:10:03 -0800 Subject: [PATCH 143/773] io_uring: add a schedule point in io_add_buffers() Looping ~65535 times doing kmalloc() calls can trigger soft lockups, especially with DEBUG features (like KASAN). [ 253.536212] watchdog: BUG: soft lockup - CPU#64 stuck for 26s! [b219417889:12575] [ 253.544433] Modules linked in: vfat fat i2c_mux_pca954x i2c_mux spidev cdc_acm xhci_pci xhci_hcd sha3_generic gq(O) [ 253.544451] CPU: 64 PID: 12575 Comm: b219417889 Tainted: G S O 5.17.0-smp-DEV #801 [ 253.544457] RIP: 0010:kernel_text_address (./include/asm-generic/sections.h:192 ./include/linux/kallsyms.h:29 kernel/extable.c:67 kernel/extable.c:98) [ 253.544464] Code: 0f 93 c0 48 c7 c1 e0 63 d7 a4 48 39 cb 0f 92 c1 20 c1 0f b6 c1 5b 5d c3 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 53 48 89 fb <48> c7 c0 00 00 80 a0 41 be 01 00 00 00 48 39 c7 72 0c 48 c7 c0 40 [ 253.544468] RSP: 0018:ffff8882d8baf4c0 EFLAGS: 00000246 [ 253.544471] RAX: 1ffff1105b175e00 RBX: ffffffffa13ef09a RCX: 00000000a13ef001 [ 253.544474] RDX: ffffffffa13ef09a RSI: ffff8882d8baf558 RDI: ffffffffa13ef09a [ 253.544476] RBP: ffff8882d8baf4d8 R08: ffff8882d8baf5e0 R09: 0000000000000004 [ 253.544479] R10: ffff8882d8baf5e8 R11: ffffffffa0d59a50 R12: ffff8882eab20380 [ 253.544481] R13: ffffffffa0d59a50 R14: dffffc0000000000 R15: 1ffff1105b175eb0 [ 253.544483] FS: 00000000016d3380(0000) GS:ffff88af48c00000(0000) knlGS:0000000000000000 [ 253.544486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 253.544488] CR2: 00000000004af0f0 CR3: 00000002eabfa004 CR4: 00000000003706e0 [ 253.544491] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 253.544492] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 253.544494] Call Trace: [ 253.544496] [ 253.544498] ? io_queue_sqe (fs/io_uring.c:7143) [ 253.544505] __kernel_text_address (kernel/extable.c:78) [ 253.544508] unwind_get_return_address (arch/x86/kernel/unwind_frame.c:19) [ 253.544514] arch_stack_walk (arch/x86/kernel/stacktrace.c:27) [ 253.544517] ? io_queue_sqe (fs/io_uring.c:7143) [ 253.544521] stack_trace_save (kernel/stacktrace.c:123) [ 253.544527] ____kasan_kmalloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515) [ 253.544531] ? ____kasan_kmalloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:515) [ 253.544533] ? __kasan_kmalloc (mm/kasan/common.c:524) [ 253.544535] ? kmem_cache_alloc_trace (./include/linux/kasan.h:270 mm/slab.c:3567) [ 253.544541] ? io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544544] ? __io_queue_sqe (fs/io_uring.c:?) [ 253.544551] __kasan_kmalloc (mm/kasan/common.c:524) [ 253.544553] kmem_cache_alloc_trace (./include/linux/kasan.h:270 mm/slab.c:3567) [ 253.544556] ? io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544560] io_issue_sqe (fs/io_uring.c:4556 fs/io_uring.c:4589 fs/io_uring.c:6828) [ 253.544564] ? __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) [ 253.544567] ? __kasan_slab_alloc (mm/kasan/common.c:39 mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) [ 253.544569] ? kmem_cache_alloc_bulk (mm/slab.h:732 mm/slab.c:3546) [ 253.544573] ? __io_alloc_req_refill (fs/io_uring.c:2078) [ 253.544578] ? io_submit_sqes (fs/io_uring.c:7441) [ 253.544581] ? __se_sys_io_uring_enter (fs/io_uring.c:10154 fs/io_uring.c:10096) [ 253.544584] ? __x64_sys_io_uring_enter (fs/io_uring.c:10096) [ 253.544587] ? do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 253.544590] ? entry_SYSCALL_64_after_hwframe (??:?) [ 253.544596] __io_queue_sqe (fs/io_uring.c:?) [ 253.544600] io_queue_sqe (fs/io_uring.c:7143) [ 253.544603] io_submit_sqe (fs/io_uring.c:?) [ 253.544608] io_submit_sqes (fs/io_uring.c:?) [ 253.544612] __se_sys_io_uring_enter (fs/io_uring.c:10154 fs/io_uring.c:10096) [ 253.544616] __x64_sys_io_uring_enter (fs/io_uring.c:10096) [ 253.544619] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 253.544623] entry_SYSCALL_64_after_hwframe (??:?) Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Pavel Begunkov Cc: io-uring Reported-by: syzbot Link: https://lore.kernel.org/r/20220215041003.2394784-1-eric.dumazet@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 77b9c7e4793b..b928928617a4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4567,6 +4567,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) } else { list_add_tail(&buf->list, &(*head)->list); } + cond_resched(); } return i ? i : -ENOMEM; From f8efca92ae509c25e0a4bd5d0a86decea4f0c41e Mon Sep 17 00:00:00 2001 From: Eliav Farber Date: Thu, 13 Jan 2022 10:06:19 +0000 Subject: [PATCH 144/773] EDAC: Fix calculation of returned address and next offset in edac_align_ptr() Do alignment logic properly and use the "ptr" local variable for calculating the remainder of the alignment. This became an issue because struct edac_mc_layer has a size that is not zero modulo eight, and the next offset that was prepared for the private data was unaligned, causing an alignment exception. The patch in Fixes: which broke this actually wanted to "what we actually care about is the alignment of the actual pointer that's about to be returned." But it didn't check that alignment. Use the correct variable "ptr" for that. [ bp: Massage commit message. ] Fixes: 8447c4d15e35 ("edac: Do alignment logic properly in edac_align_ptr()") Signed-off-by: Eliav Farber Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20220113100622.12783-2-farbere@amazon.com --- drivers/edac/edac_mc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 9d9aabdec96b..f5677d81bd2d 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -215,7 +215,7 @@ void *edac_align_ptr(void **p, unsigned int size, int n_elems) else return (char *)ptr; - r = (unsigned long)p % align; + r = (unsigned long)ptr % align; if (r == 0) return (char *)ptr; From b6821b0d9b56386d2bf14806f90ec401468c799f Mon Sep 17 00:00:00 2001 From: Oliver Graute Date: Thu, 10 Feb 2022 09:53:22 +0100 Subject: [PATCH 145/773] staging: fbtft: fb_st7789v: reset display before initialization In rare cases the display is flipped or mirrored. This was observed more often in a low temperature environment. A clean reset on init_display() should help to get registers in a sane state. Fixes: ef8f317795da (staging: fbtft: use init function instead of init sequence) Cc: stable@vger.kernel.org Signed-off-by: Oliver Graute Link: https://lore.kernel.org/r/20220210085322.15676-1-oliver.graute@kococonnector.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/fbtft/fb_st7789v.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/fbtft/fb_st7789v.c b/drivers/staging/fbtft/fb_st7789v.c index abe9395a0aef..861a154144e6 100644 --- a/drivers/staging/fbtft/fb_st7789v.c +++ b/drivers/staging/fbtft/fb_st7789v.c @@ -144,6 +144,8 @@ static int init_display(struct fbtft_par *par) { int rc; + par->fbtftops.reset(par); + rc = init_tearing_effect_line(par); if (rc) return rc; From 6f66db29e2415cbe8759c48584f9cae19b3c2651 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 Dec 2021 19:49:13 +0200 Subject: [PATCH 146/773] pinctrl: tigerlake: Revert "Add Alder Lake-M ACPI ID" It appears that last minute change moved ACPI ID of Alder Lake-M to the INTC1055, which is already in the driver. This ID on the other hand will be used elsewhere. This reverts commit 258435a1c8187f559549e515d2f77fa0b57bcd27. Signed-off-by: Andy Shevchenko --- drivers/pinctrl/intel/pinctrl-tigerlake.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pinctrl/intel/pinctrl-tigerlake.c b/drivers/pinctrl/intel/pinctrl-tigerlake.c index 0bcd19597e4a..3ddaeffc0415 100644 --- a/drivers/pinctrl/intel/pinctrl-tigerlake.c +++ b/drivers/pinctrl/intel/pinctrl-tigerlake.c @@ -749,7 +749,6 @@ static const struct acpi_device_id tgl_pinctrl_acpi_match[] = { { "INT34C5", (kernel_ulong_t)&tgllp_soc_data }, { "INT34C6", (kernel_ulong_t)&tglh_soc_data }, { "INTC1055", (kernel_ulong_t)&tgllp_soc_data }, - { "INTC1057", (kernel_ulong_t)&tgllp_soc_data }, { } }; MODULE_DEVICE_TABLE(acpi, tgl_pinctrl_acpi_match); From 4330e2c5c04c27bebf89d34e0bc14e6943413067 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 17 Nov 2021 15:15:26 +0000 Subject: [PATCH 147/773] arm64: entry.S: Add ventry overflow sanity checks Subsequent patches add even more code to the ventry slots. Ensure kernels that overflow a ventry slot don't get built. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 772ec2ecf488..bd940b2254da 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -37,6 +37,7 @@ .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 +.Lventry_start\@: #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 alternative_if ARM64_UNMAP_KERNEL_AT_EL0 @@ -95,6 +96,7 @@ alternative_else_nop_endif mrs x0, tpidrro_el0 #endif b el\el\ht\()_\regsize\()_\label +.org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm .macro tramp_alias, dst, sym @@ -662,6 +664,7 @@ alternative_else_nop_endif add x30, x30, #(1b - tramp_vectors) isb ret +.org 1b + 128 // Did we overflow the ventry slot? .endm .macro tramp_exit, regsize = 64 From 1b33d4860deaecf1d8eec3061b7e7ed7ab0bae8d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 16 Nov 2021 15:00:51 +0000 Subject: [PATCH 148/773] arm64: spectre: Rename spectre_v4_patch_fw_mitigation_conduit The spectre-v4 sequence includes an SMC from the assembly entry code. spectre_v4_patch_fw_mitigation_conduit is the patching callback that generates an HVC or SMC depending on the SMCCC conduit type. As this isn't specific to spectre-v4, rename it smccc_patch_fw_mitigation_conduit so it can be re-used. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/proton-pack.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index bd940b2254da..d97eb024adec 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -118,7 +118,7 @@ alternative_cb_end tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 mov w1, #\state -alternative_cb spectre_v4_patch_fw_mitigation_conduit +alternative_cb smccc_patch_fw_mitigation_conduit nop // Patched to SMC/HVC #0 alternative_cb_end .L__asm_ssbd_skip\@: diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 902e4084c477..9394f21d7566 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -554,9 +554,9 @@ void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt, * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction * to call into firmware to adjust the mitigation state. */ -void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt, - __le32 *origptr, - __le32 *updptr, int nr_inst) +void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) { u32 insn; From 5bdf3437603d4af87f9c7f424b0c8aeed2420745 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 16 Nov 2021 15:06:19 +0000 Subject: [PATCH 149/773] KVM: arm64: Allow indirect vectors to be used without SPECTRE_V3A CPUs vulnerable to Spectre-BHB either need to make an SMC-CC firmware call from the vectors, or run a sequence of branches. This gets added to the hyp vectors. If there is no support for arch-workaround-1 in firmware, the indirect vector will be used. kvm_init_vector_slots() only initialises the two indirect slots if the platform is vulnerable to Spectre-v3a. pKVM's hyp_map_vectors() only initialises __hyp_bp_vect_base if the platform is vulnerable to Spectre-v3a. As there are about to more users of the indirect vectors, ensure their entries in hyp_spectre_vector_selector[] are always initialised, and __hyp_bp_vect_base defaults to the regular VA mapping. The Spectre-v3a check is moved to a helper kvm_system_needs_idmapped_vectors(), and merged with the code that creates the hyp mappings. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/kvm_host.h | 5 +++++ arch/arm64/kvm/arm.c | 5 +---- arch/arm64/kvm/hyp/nvhe/mm.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5bc01e62c08a..031e3a2537fc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -714,6 +714,11 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr(); } +static inline bool kvm_system_needs_idmapped_vectors(void) +{ + return cpus_have_const_cap(ARM64_SPECTRE_V3A); +} + void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); static inline void kvm_arch_hardware_unsetup(void) {} diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ecc5958e27fe..4dca6ffd03d4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1491,10 +1491,7 @@ static int kvm_init_vector_slots(void) base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); - if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) - return 0; - - if (!has_vhe()) { + if (kvm_system_needs_idmapped_vectors() && !has_vhe()) { err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), __BP_HARDEN_HYP_VECS_SZ, &base); if (err) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 526a7d6fa86f..cdbe8e246418 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -148,8 +148,10 @@ int hyp_map_vectors(void) phys_addr_t phys; void *bp_base; - if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) + if (!kvm_system_needs_idmapped_vectors()) { + __hyp_bp_vect_base = __bp_harden_hyp_vecs; return 0; + } phys = __hyp_pa(__bp_harden_hyp_vecs); bp_base = (void *)__pkvm_create_private_mapping(phys, From d739da1694a0eaef0358a42b76904b611539b77b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 15:36:12 +0000 Subject: [PATCH 150/773] arm64: entry: Make the trampoline cleanup optional Subsequent patches will add additional sets of vectors that use the same tricks as the kpti vectors to reach the full-fat vectors. The full-fat vectors contain some cleanup for kpti that is patched in by alternatives when kpti is in use. Once there are additional vectors, the cleanup will be needed in more cases. But on big/little systems, the cleanup would be harmful if no trampoline vector were in use. Instead of forcing CPUs that don't need a trampoline vector to use one, make the trampoline cleanup optional. Entry at the top of the vectors will skip the cleanup. The trampoline vectors can then skip the first instruction, triggering the cleanup to run. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d97eb024adec..bdbdb92b5f77 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -40,14 +40,18 @@ .Lventry_start\@: #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 -alternative_if ARM64_UNMAP_KERNEL_AT_EL0 + /* + * This must be the first instruction of the EL0 vector entries. It is + * skipped by the trampoline vectors, to trigger the cleanup. + */ + b .Lskip_tramp_vectors_cleanup\@ .if \regsize == 64 mrs x30, tpidrro_el0 msr tpidrro_el0, xzr .else mov x30, xzr .endif -alternative_else_nop_endif +.Lskip_tramp_vectors_cleanup\@: .endif #endif @@ -661,7 +665,7 @@ alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM prfm plil1strm, [x30, #(1b - tramp_vectors)] alternative_else_nop_endif msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors) + add x30, x30, #(1b - tramp_vectors + 4) isb ret .org 1b + 128 // Did we overflow the ventry slot? From 03aff3a77a58b5b52a77e00537a42090ad57b80b Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Nov 2021 18:41:43 +0000 Subject: [PATCH 151/773] arm64: entry: Free up another register on kpti's tramp_exit path Kpti stashes x30 in far_el1 while it uses x30 for all its work. Making the vectors a per-cpu data structure will require a second register. Allow tramp_exit two registers before it unmaps the kernel, by leaving x30 on the stack, and stashing x29 in far_el1. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index bdbdb92b5f77..45e89135dc11 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -419,14 +419,16 @@ alternative_else_nop_endif ldp x24, x25, [sp, #16 * 12] ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] - ldr lr, [sp, #S_LR] - add sp, sp, #PT_REGS_SIZE // restore sp .if \el == 0 -alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + eret +alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f - msr far_el1, x30 + msr far_el1, x29 tramp_alias x30, tramp_exit_native br x30 4: @@ -434,6 +436,9 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 br x30 #endif .else + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + /* Ensure any device/NC reads complete */ alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 @@ -674,10 +679,12 @@ alternative_else_nop_endif .macro tramp_exit, regsize = 64 adr x30, tramp_vectors msr vbar_el1, x30 - tramp_unmap_kernel x30 + ldr lr, [sp, #S_LR] + tramp_unmap_kernel x29 .if \regsize == 64 - mrs x30, far_el1 + mrs x29, far_el1 .endif + add sp, sp, #PT_REGS_SIZE // restore sp eret sb .endm From c091fb6ae059cda563b2a4d93fdbc548ef34e1d6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Nov 2021 15:43:31 +0000 Subject: [PATCH 152/773] arm64: entry: Move the trampoline data page before the text page The trampoline code has a data page that holds the address of the vectors, which is unmapped when running in user-space. This ensures that with CONFIG_RANDOMIZE_BASE, the randomised address of the kernel can't be discovered until after the kernel has been mapped. If the trampoline text page is extended to include multiple sets of vectors, it will be larger than a single page, making it tricky to find the data page without knowing the size of the trampoline text pages, which will vary with PAGE_SIZE. Move the data page to appear before the text page. This allows the data page to be found without knowing the size of the trampoline text pages. 'tramp_vectors' is used to refer to the beginning of the .entry.tramp.text section, do that explicitly. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/fixmap.h | 2 +- arch/arm64/kernel/entry.S | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 4335800201c9..0aabc0253b18 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,8 +62,8 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - FIX_ENTRY_TRAMP_DATA, FIX_ENTRY_TRAMP_TEXT, + FIX_ENTRY_TRAMP_DATA, #define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 45e89135dc11..d8a76869e873 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -644,6 +644,11 @@ alternative_else_nop_endif */ .endm + .macro tramp_data_page dst + adr \dst, .entry.tramp.text + sub \dst, \dst, PAGE_SIZE + .endm + .macro tramp_ventry, regsize = 64 .align 7 1: @@ -660,7 +665,7 @@ alternative_else_nop_endif 2: tramp_map_kernel x30 #ifdef CONFIG_RANDOMIZE_BASE - adr x30, tramp_vectors + PAGE_SIZE + tramp_data_page x30 alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 ldr x30, [x30] #else @@ -851,7 +856,7 @@ SYM_CODE_START(__sdei_asm_entry_trampoline) 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] #ifdef CONFIG_RANDOMIZE_BASE - adr x4, tramp_vectors + PAGE_SIZE + tramp_data_page x4 add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler ldr x4, [x4] #else From 6c5bf79b69f911560fbf82214c0971af6e58e682 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 11:40:18 +0000 Subject: [PATCH 153/773] arm64: entry: Allow tramp_alias to access symbols after the 4K boundary Systems using kpti enter and exit the kernel through a trampoline mapping that is always mapped, even when the kernel is not. tramp_valias is a macro to find the address of a symbol in the trampoline mapping. Adding extra sets of vectors will expand the size of the entry.tramp.text section to beyond 4K. tramp_valias will be unable to generate addresses for symbols beyond 4K as it uses the 12 bit immediate of the add instruction. As there are now two registers available when tramp_alias is called, use the extra register to avoid the 4K limit of the 12 bit immediate. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d8a76869e873..192cf77bd374 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -103,9 +103,12 @@ .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? .endm - .macro tramp_alias, dst, sym + .macro tramp_alias, dst, sym, tmp mov_q \dst, TRAMP_VALIAS - add \dst, \dst, #(\sym - .entry.tramp.text) + adr_l \tmp, \sym + add \dst, \dst, \tmp + adr_l \tmp, .entry.tramp.text + sub \dst, \dst, \tmp .endm /* @@ -429,10 +432,10 @@ alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x29 - tramp_alias x30, tramp_exit_native + tramp_alias x30, tramp_exit_native, x29 br x30 4: - tramp_alias x30, tramp_exit_compat + tramp_alias x30, tramp_exit_compat, x29 br x30 #endif .else @@ -1000,7 +1003,7 @@ alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline, tmp=x3 br x5 #endif SYM_CODE_END(__sdei_asm_handler) From ed50da7764535f1e24432ded289974f2bf2b0c5a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 13:40:09 +0000 Subject: [PATCH 154/773] arm64: entry: Don't assume tramp_vectors is the start of the vectors The tramp_ventry macro uses tramp_vectors as the address of the vectors when calculating which ventry in the 'full fat' vectors to branch to. While there is one set of tramp_vectors, this will be true. Adding multiple sets of vectors will break this assumption. Move the generation of the vectors to a macro, and pass the start of the vectors as an argument to tramp_ventry. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 192cf77bd374..4e6b8d97d941 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -652,7 +652,7 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, regsize = 64 + .macro tramp_ventry, vector_start, regsize .align 7 1: .if \regsize == 64 @@ -675,10 +675,10 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 ldr x30, =vectors #endif alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM - prfm plil1strm, [x30, #(1b - tramp_vectors)] + prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif msr vbar_el1, x30 - add x30, x30, #(1b - tramp_vectors + 4) + add x30, x30, #(1b - \vector_start + 4) isb ret .org 1b + 128 // Did we overflow the ventry slot? @@ -697,19 +697,21 @@ alternative_else_nop_endif sb .endm - .align 11 -SYM_CODE_START_NOALIGN(tramp_vectors) + .macro generate_tramp_vector +.Lvector_start\@: .space 0x400 - tramp_ventry - tramp_ventry - tramp_ventry - tramp_ventry + .rept 4 + tramp_ventry .Lvector_start\@, 64 + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32 + .endr + .endm - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 - tramp_ventry 32 + .align 11 +SYM_CODE_START_NOALIGN(tramp_vectors) + generate_tramp_vector SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) From 13d7a08352a83ef2252aeb464a5e08dfc06b5dfd Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 14:02:30 +0000 Subject: [PATCH 155/773] arm64: entry: Move trampoline macros out of ifdef'd section The macros for building the kpti trampoline are all behind CONFIG_UNMAP_KERNEL_AT_EL0, and in a region that outputs to the .entry.tramp.text section. Move the macros out so they can be used to generate other kinds of trampoline. Only the symbols need to be guarded by CONFIG_UNMAP_KERNEL_AT_EL0 and appear in the .entry.tramp.text section. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 4e6b8d97d941..2735c8941c2d 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -608,12 +608,6 @@ SYM_CODE_END(ret_to_user) .popsection // .entry.text -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -/* - * Exception vectors trampoline. - */ - .pushsection ".entry.tramp.text", "ax" - // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 @@ -709,6 +703,11 @@ alternative_else_nop_endif .endr .endm +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) generate_tramp_vector From c47e4d04ba0f1ea17353d85d45f611277507e07a Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 13:16:23 +0000 Subject: [PATCH 156/773] arm64: entry: Make the kpti trampoline's kpti sequence optional Spectre-BHB needs to add sequences to the vectors. Having one global set of vectors is a problem for big/little systems where the sequence is costly on cpus that are not vulnerable. Making the vectors per-cpu in the style of KVM's bh_harden_hyp_vecs requires the vectors to be generated by macros. Make the kpti re-mapping of the kernel optional, so the macros can be used without kpti. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2735c8941c2d..23e85c94ea36 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,9 +646,10 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize + .macro tramp_ventry, vector_start, regsize, kpti .align 7 1: + .if \kpti == 1 .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif @@ -671,9 +672,14 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif + msr vbar_el1, x30 - add x30, x30, #(1b - \vector_start + 4) isb + .else + ldr x30, =vectors + .endif // \kpti == 1 + + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? .endm @@ -691,15 +697,15 @@ alternative_else_nop_endif sb .endm - .macro generate_tramp_vector + .macro generate_tramp_vector, kpti .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64 + tramp_ventry .Lvector_start\@, 64, \kpti .endr .rept 4 - tramp_ventry .Lvector_start\@, 32 + tramp_ventry .Lvector_start\@, 32, \kpti .endr .endm @@ -710,7 +716,7 @@ alternative_else_nop_endif .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) - generate_tramp_vector + generate_tramp_vector kpti=1 SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) From a9c406e6462ff14956d690de7bbe5131a5677dc9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 15:04:32 +0000 Subject: [PATCH 157/773] arm64: entry: Allow the trampoline text to occupy multiple pages Adding a second set of vectors to .entry.tramp.text will make it larger than a single 4K page. Allow the trampoline text to occupy up to three pages by adding two more fixmap slots. Previous changes to tramp_valias allowed it to reach beyond a single page. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/fixmap.h | 6 ++++-- arch/arm64/include/asm/sections.h | 5 +++++ arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 2 +- arch/arm64/mm/mmu.c | 12 +++++++++--- 5 files changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 0aabc0253b18..daff882883f9 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,9 +62,11 @@ enum fixed_addresses { #endif /* CONFIG_ACPI_APEI_GHES */ #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - FIX_ENTRY_TRAMP_TEXT, + FIX_ENTRY_TRAMP_TEXT3, + FIX_ENTRY_TRAMP_TEXT2, + FIX_ENTRY_TRAMP_TEXT1, FIX_ENTRY_TRAMP_DATA, -#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT1)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 152cb35bf9df..40971ac1303f 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -23,4 +23,9 @@ extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; extern char __relocate_new_kernel_start[], __relocate_new_kernel_end[]; +static inline size_t entry_tramp_text_size(void) +{ + return __entry_tramp_text_end - __entry_tramp_text_start; +} + #endif /* __ASM_SECTIONS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 23e85c94ea36..e0a0c1da5db8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -642,7 +642,7 @@ alternative_else_nop_endif .endm .macro tramp_data_page dst - adr \dst, .entry.tramp.text + adr_l \dst, .entry.tramp.text sub \dst, \dst, PAGE_SIZE .endm diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 50bab186c49b..edaf0faf766f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -341,7 +341,7 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE, "Entry trampoline text too big") #endif #ifdef CONFIG_KVM diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acfae9b41cc8..49abbf43bf35 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -617,6 +617,8 @@ early_param("rodata", parse_rodata); #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { + int i; + pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); @@ -625,11 +627,15 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); - __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, - prot, __pgd_pgtable_alloc, 0); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, + entry_tramp_text_size(), prot, + __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ - __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) + __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, + pa_start + i * PAGE_SIZE, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern char __entry_tramp_data_start[]; From aff65393fa1401e034656e349abd655cfe272de0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 24 Nov 2021 15:03:15 +0000 Subject: [PATCH 158/773] arm64: entry: Add non-kpti __bp_harden_el1_vectors for mitigations kpti is an optional feature, for systems not using kpti a set of vectors for the spectre-bhb mitigations is needed. Add another set of vectors, __bp_harden_el1_vectors, that will be used if a mitigation is needed and kpti is not in use. The EL1 ventries are repeated verbatim as there is no additional work needed for entry from EL1. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e0a0c1da5db8..9c4ff75f983e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -649,10 +649,11 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti .align 7 1: - .if \kpti == 1 .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy * entry onto the return stack and using a RET instruction to @@ -739,6 +740,38 @@ SYM_DATA_END(__entry_tramp_data_start) #endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ +/* + * Exception vectors for spectre mitigations on entry from EL1 when + * kpti is not in use. + */ + .macro generate_el1_vector +.Lvector_start\@: + kernel_ventry 1, t, 64, sync // Synchronous EL1t + kernel_ventry 1, t, 64, irq // IRQ EL1t + kernel_ventry 1, t, 64, fiq // FIQ EL1h + kernel_ventry 1, t, 64, error // Error EL1t + + kernel_ventry 1, h, 64, sync // Synchronous EL1h + kernel_ventry 1, h, 64, irq // IRQ EL1h + kernel_ventry 1, h, 64, fiq // FIQ EL1h + kernel_ventry 1, h, 64, error // Error EL1h + + .rept 4 + tramp_ventry .Lvector_start\@, 64, kpti=0 + .endr + .rept 4 + tramp_ventry .Lvector_start\@, 32, kpti=0 + .endr + .endm + + .pushsection ".entry.text", "ax" + .align 11 +SYM_CODE_START(__bp_harden_el1_vectors) + generate_el1_vector +SYM_CODE_END(__bp_harden_el1_vectors) + .popsection + + /* * Register switch for AArch64. The callee-saved registers need to be saved * and restored. On entry: From 61d06f01f9710b327a53492e5add9f972eb909b3 Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Fri, 11 Feb 2022 18:43:36 +0100 Subject: [PATCH 159/773] selftests: bpf: Check bpf_msg_push_data return value bpf_msg_push_data may return a non-zero value to indicate an error. The return value should be checked to prevent undetected errors. To indicate an error, the BPF programs now perform a different action than their intended one to make the userspace test program notice the error, i.e., the programs supposed to pass/redirect drop, the program supposed to drop passes. Fixes: 84fbfe026acaa ("bpf: test_sockmap add options to use msg_push_data") Signed-off-by: Felix Maurer Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/89f767bb44005d6b4dd1f42038c438f76b3ebfad.1644601294.git.fmaurer@redhat.com --- .../selftests/bpf/progs/test_sockmap_kern.h | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 2966564b8497..6c85b00f27b2 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -235,7 +235,7 @@ SEC("sk_msg1") int bpf_prog4(struct sk_msg_md *msg) { int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; + int *start, *end, *start_push, *end_push, *start_pop, *pop, err = 0; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -249,8 +249,11 @@ int bpf_prog4(struct sk_msg_md *msg) bpf_msg_pull_data(msg, *start, *end, 0); start_push = bpf_map_lookup_elem(&sock_bytes, &two); end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) - bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (start_push && end_push) { + err = bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (err) + return SK_DROP; + } start_pop = bpf_map_lookup_elem(&sock_bytes, &four); pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) @@ -263,6 +266,7 @@ int bpf_prog6(struct sk_msg_md *msg) { int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; + int err = 0; __u64 flags = 0; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); @@ -279,8 +283,11 @@ int bpf_prog6(struct sk_msg_md *msg) start_push = bpf_map_lookup_elem(&sock_bytes, &two); end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) - bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (start_push && end_push) { + err = bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (err) + return SK_DROP; + } start_pop = bpf_map_lookup_elem(&sock_bytes, &four); pop = bpf_map_lookup_elem(&sock_bytes, &five); @@ -338,7 +345,7 @@ SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) { int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; + int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, err = 0; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -352,8 +359,11 @@ int bpf_prog10(struct sk_msg_md *msg) bpf_msg_pull_data(msg, *start, *end, 0); start_push = bpf_map_lookup_elem(&sock_bytes, &two); end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) - bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (start_push && end_push) { + err = bpf_msg_push_data(msg, *start_push, *end_push, 0); + if (err) + return SK_PASS; + } start_pop = bpf_map_lookup_elem(&sock_bytes, &four); pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) From 741b23a970a79d5d3a1db2d64fa2c7b375a4febb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C4=81vis=20Mos=C4=81ns?= Date: Wed, 2 Feb 2022 23:44:55 +0200 Subject: [PATCH 160/773] btrfs: prevent copying too big compressed lzo segment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compressed length can be corrupted to be a lot larger than memory we have allocated for buffer. This will cause memcpy in copy_compressed_segment to write outside of allocated memory. This mostly results in stuck read syscall but sometimes when using btrfs send can get #GP kernel: general protection fault, probably for non-canonical address 0x841551d5c1000: 0000 [#1] PREEMPT SMP NOPTI kernel: CPU: 17 PID: 264 Comm: kworker/u256:7 Tainted: P OE 5.17.0-rc2-1 #12 kernel: Workqueue: btrfs-endio btrfs_work_helper [btrfs] kernel: RIP: 0010:lzo_decompress_bio (./include/linux/fortify-string.h:225 fs/btrfs/lzo.c:322 fs/btrfs/lzo.c:394) btrfs Code starting with the faulting instruction =========================================== 0:* 48 8b 06 mov (%rsi),%rax <-- trapping instruction 3: 48 8d 79 08 lea 0x8(%rcx),%rdi 7: 48 83 e7 f8 and $0xfffffffffffffff8,%rdi b: 48 89 01 mov %rax,(%rcx) e: 44 89 f0 mov %r14d,%eax 11: 48 8b 54 06 f8 mov -0x8(%rsi,%rax,1),%rdx kernel: RSP: 0018:ffffb110812efd50 EFLAGS: 00010212 kernel: RAX: 0000000000001000 RBX: 000000009ca264c8 RCX: ffff98996e6d8ff8 kernel: RDX: 0000000000000064 RSI: 000841551d5c1000 RDI: ffffffff9500435d kernel: RBP: ffff989a3be856c0 R08: 0000000000000000 R09: 0000000000000000 kernel: R10: 0000000000000000 R11: 0000000000001000 R12: ffff98996e6d8000 kernel: R13: 0000000000000008 R14: 0000000000001000 R15: 000841551d5c1000 kernel: FS: 0000000000000000(0000) GS:ffff98a09d640000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00001e9f984d9ea8 CR3: 000000014971a000 CR4: 00000000003506e0 kernel: Call Trace: kernel: kernel: end_compressed_bio_read (fs/btrfs/compression.c:104 fs/btrfs/compression.c:1363 fs/btrfs/compression.c:323) btrfs kernel: end_workqueue_fn (fs/btrfs/disk-io.c:1923) btrfs kernel: btrfs_work_helper (fs/btrfs/async-thread.c:326) btrfs kernel: process_one_work (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:212 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2312) kernel: worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2455) kernel: ? process_one_work (kernel/workqueue.c:2397) kernel: kthread (kernel/kthread.c:377) kernel: ? kthread_complete_and_exit (kernel/kthread.c:332) kernel: ret_from_fork (arch/x86/entry/entry_64.S:301) kernel: CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Dāvis Mosāns Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 0fb90cbe7669..e6e28a9c7987 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -380,6 +380,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) kunmap(cur_page); cur_in += LZO_LEN; + if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) { + /* + * seg_len shouldn't be larger than we have allocated + * for workspace->cbuf + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); + ret = -EIO; + goto out; + } + /* Copy the compressed segment payload into workspace */ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); From 966d879bafaaf020c11a7cee9526f6dd823a4126 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 Feb 2022 14:41:39 +0800 Subject: [PATCH 161/773] btrfs: defrag: allow defrag_one_cluster() to skip large extent which is not a target In the rework of btrfs_defrag_file(), we always call defrag_one_cluster() and increase the offset by cluster size, which is only 256K. But there are cases where we have a large extent (e.g. 128M) which doesn't need to be defragged at all. Before the refactor, we can directly skip the range, but now we have to scan that extent map again and again until the cluster moves after the non-target extent. Fix the problem by allow defrag_one_cluster() to increase btrfs_defrag_ctrl::last_scanned to the end of an extent, if and only if the last extent of the cluster is not a target. The test script looks like this: mkfs.btrfs -f $dev > /dev/null mount $dev $mnt # As btrfs ioctl uses 32M as extent_threshold xfs_io -f -c "pwrite 0 64M" $mnt/file1 sync # Some fragemented range to defrag xfs_io -s -c "pwrite 65548k 4k" \ -c "pwrite 65544k 4k" \ -c "pwrite 65540k 4k" \ -c "pwrite 65536k 4k" \ $mnt/file1 sync echo "=== before ===" xfs_io -c "fiemap -v" $mnt/file1 echo "=== after ===" btrfs fi defrag $mnt/file1 sync xfs_io -c "fiemap -v" $mnt/file1 umount $mnt With extra ftrace put into defrag_one_cluster(), before the patch it would result tons of loops: (As defrag_one_cluster() is inlined, the function name is its caller) btrfs-126062 [005] ..... 4682.816026: btrfs_defrag_file: r/i=5/257 start=0 len=262144 btrfs-126062 [005] ..... 4682.816027: btrfs_defrag_file: r/i=5/257 start=262144 len=262144 btrfs-126062 [005] ..... 4682.816028: btrfs_defrag_file: r/i=5/257 start=524288 len=262144 btrfs-126062 [005] ..... 4682.816028: btrfs_defrag_file: r/i=5/257 start=786432 len=262144 btrfs-126062 [005] ..... 4682.816028: btrfs_defrag_file: r/i=5/257 start=1048576 len=262144 ... btrfs-126062 [005] ..... 4682.816043: btrfs_defrag_file: r/i=5/257 start=67108864 len=262144 But with this patch there will be just one loop, then directly to the end of the extent: btrfs-130471 [014] ..... 5434.029558: defrag_one_cluster: r/i=5/257 start=0 len=262144 btrfs-130471 [014] ..... 5434.029559: defrag_one_cluster: r/i=5/257 start=67108864 len=16384 CC: stable@vger.kernel.org # 5.16 Signed-off-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 48 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 90136562d865..218724e4edd6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1186,8 +1186,10 @@ struct defrag_target_range { static int defrag_collect_targets(struct btrfs_inode *inode, u64 start, u64 len, u32 extent_thresh, u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list) + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) { + bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1197,6 +1199,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool next_mergeable = true; u64 range_len; + last_is_target = false; em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); if (!em) break; @@ -1272,6 +1275,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } add: + last_is_target = true; range_len = min(extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into @@ -1315,6 +1319,17 @@ next: kfree(entry); } } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } return ret; } @@ -1373,7 +1388,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, } static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress) + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) { struct extent_state *cached_state = NULL; struct defrag_target_range *entry; @@ -1419,7 +1435,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, */ ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, true, - &target_list); + &target_list, last_scanned_ret); if (ret < 0) goto unlock_extent; @@ -1454,7 +1470,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, unsigned long *sectors_defragged, - unsigned long max_sectors) + unsigned long max_sectors, + u64 *last_scanned_ret) { const u32 sectorsize = inode->root->fs_info->sectorsize; struct defrag_target_range *entry; @@ -1465,7 +1482,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, - &target_list); + &target_list, NULL); if (ret < 0) goto out; @@ -1482,6 +1499,15 @@ static int defrag_one_cluster(struct btrfs_inode *inode, range_len = min_t(u32, range_len, (max_sectors - *sectors_defragged) * sectorsize); + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + if (ra) page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, @@ -1494,7 +1520,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, * accounting. */ ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress); + extent_thresh, newer_than, do_compress, + last_scanned_ret); if (ret < 0) break; *sectors_defragged += range_len >> @@ -1505,6 +1532,8 @@ out: list_del_init(&entry->list); kfree(entry); } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); return ret; } @@ -1590,6 +1619,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; u64 cluster_end; /* The cluster size 256K should always be page aligned */ @@ -1619,8 +1649,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, BTRFS_I(inode)->defrag_compress = compress_type; ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, - §ors_defragged, max_to_defrag); + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1628,7 +1658,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_inode_unlock(inode, 0); if (ret < 0) break; - cur = cluster_end + 1; + cur = max(cluster_end + 1, last_scanned); if (ret > 0) { ret = 0; break; From f98da1d66298882b1d2061051ea14ddc15c58884 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 15 Feb 2022 10:54:23 -0800 Subject: [PATCH 162/773] ACPI: tables: Quiet ACPI table not found warning Paul reports that the ACPI core complains on every boot about a missing CEDT table. Unlike the standard NUMA tables (SRAT, MADT, and SLIT) that are critical to NUMA init, CEDT is only expected on CXL platforms. Given the notice is not actionable lower its severity to debug. Link: https://lore.kernel.org/r/55f5c077-061c-7e53-b02d-53dde1dd654f@molgen.mpg.de Fixes: fd49f99c1809 ("ACPI: NUMA: Add a node and memblk for each CFMWS not in SRAT") Reported-by: Paul Menzel Signed-off-by: Dan Williams Signed-off-by: Rafael J. Wysocki --- drivers/acpi/tables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 0741a4933f62..34600b5b9d8e 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -400,7 +400,7 @@ int __init_or_acpilib acpi_table_parse_entries_array( acpi_get_table(id, instance, &table_header); if (!table_header) { - pr_warn("%4.4s not present\n", id); + pr_debug("%4.4s not present\n", id); return -ENODEV; } From de8aa31ac7c23af98fe24d1c1b43b065027d6af5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 15 Feb 2022 16:29:08 -0800 Subject: [PATCH 163/773] Input: zinitix - add new compatible strings This driver works just fine with the BT404 version of the touchscreen as well. Tested on the Samsung GT-I8160 (Codina) mobile phone. Add all the new variants from the binding document so people can easily test them, we believe most of them work more or less. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220214234033.1052681-1-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/zinitix.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/input/touchscreen/zinitix.c b/drivers/input/touchscreen/zinitix.c index 7c82c4f5fa6b..129ebc810de8 100644 --- a/drivers/input/touchscreen/zinitix.c +++ b/drivers/input/touchscreen/zinitix.c @@ -571,8 +571,20 @@ static SIMPLE_DEV_PM_OPS(zinitix_pm_ops, zinitix_suspend, zinitix_resume); #ifdef CONFIG_OF static const struct of_device_id zinitix_of_match[] = { + { .compatible = "zinitix,bt402" }, + { .compatible = "zinitix,bt403" }, + { .compatible = "zinitix,bt404" }, + { .compatible = "zinitix,bt412" }, + { .compatible = "zinitix,bt413" }, + { .compatible = "zinitix,bt431" }, + { .compatible = "zinitix,bt432" }, + { .compatible = "zinitix,bt531" }, { .compatible = "zinitix,bt532" }, + { .compatible = "zinitix,bt538" }, { .compatible = "zinitix,bt541" }, + { .compatible = "zinitix,bt548" }, + { .compatible = "zinitix,bt554" }, + { .compatible = "zinitix,at100" }, { } }; MODULE_DEVICE_TABLE(of, zinitix_of_match); From ba2689234be92024e5635d30fe744f4853ad97db Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 13:59:46 +0000 Subject: [PATCH 164/773] arm64: entry: Add vectors that have the bhb mitigation sequences Some CPUs affected by Spectre-BHB need a sequence of branches, or a firmware call to be run before any indirect branch. This needs to go in the vectors. No CPU needs both. While this can be patched in, it would run on all CPUs as there is a single set of vectors. If only one part of a big/little combination is affected, the unaffected CPUs have to run the mitigation too. Create extra vectors that include the sequence. Subsequent patches will allow affected CPUs to select this set of vectors. Later patches will modify the loop count to match what the CPU requires. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/assembler.h | 24 ++++++++++++++ arch/arm64/include/asm/vectors.h | 34 +++++++++++++++++++ arch/arm64/kernel/entry.S | 53 +++++++++++++++++++++++++----- arch/arm64/kernel/proton-pack.c | 16 +++++++++ include/linux/arm-smccc.h | 5 +++ 5 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 arch/arm64/include/asm/vectors.h diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index e8bd0af0141c..046c38ee2841 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -850,4 +850,28 @@ alternative_endif #endif /* GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT */ + .macro __mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + mov \tmp, #32 +.Lspectre_bhb_loop\@: + b . + 4 + subs \tmp, \tmp, #1 + b.ne .Lspectre_bhb_loop\@ + sb +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + /* Save/restores x0-x3 to the stack */ + .macro __mitigate_spectre_bhb_fw +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + stp x0, x1, [sp, #-16]! + stp x2, x3, [sp, #-16]! + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 +alternative_cb smccc_patch_fw_mitigation_conduit + nop // Patched to SMC/HVC #0 +alternative_cb_end + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h new file mode 100644 index 000000000000..bac53fad037d --- /dev/null +++ b/arch/arm64/include/asm/vectors.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 ARM Ltd. + */ +#ifndef __ASM_VECTORS_H +#define __ASM_VECTORS_H + +/* + * Note: the order of this enum corresponds to two arrays in entry.S: + * tramp_vecs and __bp_harden_el1_vectors. By default the canonical + * 'full fat' vectors are used directly. + */ +enum arm64_bp_harden_el1_vectors { +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + /* + * Perform the BHB loop mitigation, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_LOOP, + + /* + * Make the SMC call for firmware mitigation, before branching to the + * canonical vectors. + */ + EL1_VECTOR_BHB_FW, +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + + /* + * Remap the kernel before branching to the canonical vectors. + */ + EL1_VECTOR_KPTI, ++}; + +#endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9c4ff75f983e..2ceb0c3647b4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,13 +646,26 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize, kpti + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -680,6 +693,15 @@ alternative_else_nop_endif ldr x30, =vectors .endif // \kpti == 1 + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? @@ -687,6 +709,9 @@ alternative_else_nop_endif .macro tramp_exit, regsize = 64 adr x30, tramp_vectors +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + add x30, x30, SZ_4K +#endif msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -698,26 +723,32 @@ alternative_else_nop_endif sb .endm - .macro generate_tramp_vector, kpti + .macro generate_tramp_vector, kpti, bhb .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64, \kpti + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, \kpti + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb .endr .endm #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 /* * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) - generate_tramp_vector kpti=1 +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) @@ -744,7 +775,7 @@ SYM_DATA_END(__entry_tramp_data_start) * Exception vectors for spectre mitigations on entry from EL1 when * kpti is not in use. */ - .macro generate_el1_vector + .macro generate_el1_vector, bhb .Lvector_start\@: kernel_ventry 1, t, 64, sync // Synchronous EL1t kernel_ventry 1, t, 64, irq // IRQ EL1t @@ -757,17 +788,21 @@ SYM_DATA_END(__entry_tramp_data_start) kernel_ventry 1, h, 64, error // Error EL1h .rept 4 - tramp_ventry .Lvector_start\@, 64, kpti=0 + tramp_ventry .Lvector_start\@, 64, 0, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, kpti=0 + tramp_ventry .Lvector_start\@, 32, 0, \bhb .endr .endm +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.text", "ax" .align 11 SYM_CODE_START(__bp_harden_el1_vectors) - generate_el1_vector +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ SYM_CODE_END(__bp_harden_el1_vectors) .popsection diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 9394f21d7566..6a5eeb8beea3 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -770,3 +770,19 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return -ENODEV; } } + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); +} diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 63ccb5252190..220c8c60e021 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -92,6 +92,11 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_ARCH_WORKAROUND_3 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, 0x3fff) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ From b28a8eebe81c186fdb1a0078263b30576c8e1f42 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 25 Nov 2021 14:25:34 +0000 Subject: [PATCH 165/773] arm64: entry: Add macro for reading symbol addresses from the trampoline The trampoline code needs to use the address of symbols in the wider kernel, e.g. vectors. PC-relative addressing wouldn't work as the trampoline code doesn't run at the address the linker expected. tramp_ventry uses a literal pool, unless CONFIG_RANDOMIZE_BASE is set, in which case it uses the data page as a literal pool because the data page can be unmapped when running in user-space, which is required for CPUs vulnerable to meltdown. Pull this logic out as a macro, instead of adding a third copy of it. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/entry.S | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2ceb0c3647b4..8da732fefd8f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,6 +646,15 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm + .macro tramp_data_read_var dst, var +#ifdef CONFIG_RANDOMIZE_BASE + tramp_data_page \dst + add \dst, \dst, #:lo12:__entry_tramp_data_\var + ldr \dst, [\dst] +#else + ldr \dst, =\var +#endif + .endm #define BHB_MITIGATION_NONE 0 #define BHB_MITIGATION_LOOP 1 @@ -676,13 +685,8 @@ alternative_else_nop_endif b . 2: tramp_map_kernel x30 -#ifdef CONFIG_RANDOMIZE_BASE - tramp_data_page x30 alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 - ldr x30, [x30] -#else - ldr x30, =vectors -#endif + tramp_data_read_var x30, vectors alternative_if_not ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM prfm plil1strm, [x30, #(1b - \vector_start)] alternative_else_nop_endif @@ -765,7 +769,12 @@ SYM_CODE_END(tramp_exit_compat) .pushsection ".rodata", "a" .align PAGE_SHIFT SYM_DATA_START(__entry_tramp_data_start) +__entry_tramp_data_vectors: .quad vectors +#ifdef CONFIG_ARM_SDE_INTERFACE +__entry_tramp_data___sdei_asm_handler: + .quad __sdei_asm_handler +#endif /* CONFIG_ARM_SDE_INTERFACE */ SYM_DATA_END(__entry_tramp_data_start) .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ @@ -932,14 +941,7 @@ SYM_CODE_START(__sdei_asm_entry_trampoline) * Remember whether to unmap the kernel on exit. */ 1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] - -#ifdef CONFIG_RANDOMIZE_BASE - tramp_data_page x4 - add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler - ldr x4, [x4] -#else - ldr x4, =__sdei_asm_handler -#endif + tramp_data_read_var x4, __sdei_asm_handler br x4 SYM_CODE_END(__sdei_asm_entry_trampoline) NOKPROBE(__sdei_asm_entry_trampoline) @@ -962,13 +964,6 @@ SYM_CODE_END(__sdei_asm_exit_trampoline) NOKPROBE(__sdei_asm_exit_trampoline) .ltorg .popsection // .entry.tramp.text -#ifdef CONFIG_RANDOMIZE_BASE -.pushsection ".rodata", "a" -SYM_DATA_START(__sdei_asm_trampoline_next_handler) - .quad __sdei_asm_handler -SYM_DATA_END(__sdei_asm_trampoline_next_handler) -.popsection // .rodata -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* From bd09128d16fac3c34b80bd6a29088ac632e8ce09 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Nov 2021 18:29:25 +0000 Subject: [PATCH 166/773] arm64: Add percpu vectors for EL1 The Spectre-BHB workaround adds a firmware call to the vectors. This is needed on some CPUs, but not others. To avoid the unaffected CPU in a big/little pair from making the firmware call, create per cpu vectors. The per-cpu vectors only apply when returning from EL0. Systems using KPTI can use the canonical 'full-fat' vectors directly at EL1, the trampoline exit code will switch to this_cpu_vector on exit to EL0. Systems not using KPTI should always use this_cpu_vector. this_cpu_vector will point at a vector in tramp_vecs or __bp_harden_el1_vectors, depending on whether KPTI is in use. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/vectors.h | 29 ++++++++++++++++++++++++++++- arch/arm64/kernel/cpufeature.c | 11 +++++++++++ arch/arm64/kernel/entry.S | 12 ++++++------ arch/arm64/kvm/hyp/vhe/switch.c | 10 ++++++++-- 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index bac53fad037d..3f76dfd9e074 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -5,6 +5,15 @@ #ifndef __ASM_VECTORS_H #define __ASM_VECTORS_H +#include +#include + +#include + +extern char vectors[]; +extern char tramp_vectors[]; +extern char __bp_harden_el1_vectors[]; + /* * Note: the order of this enum corresponds to two arrays in entry.S: * tramp_vecs and __bp_harden_el1_vectors. By default the canonical @@ -29,6 +38,24 @@ enum arm64_bp_harden_el1_vectors { * Remap the kernel before branching to the canonical vectors. */ EL1_VECTOR_KPTI, -+}; +}; + +/* The vectors to use on return from EL0. e.g. to remap the kernel */ +DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); + +#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_VALIAS 0 +#endif + +static inline const char * +arm64_get_bp_hardening_vector(enum arm64_bp_harden_el1_vectors slot) +{ + if (arm64_kernel_unmapped_at_el0()) + return (char *)TRAMP_VALIAS + SZ_2K * slot; + + WARN_ON_ONCE(slot == EL1_VECTOR_KPTI); + + return __bp_harden_el1_vectors + SZ_2K * slot; +} #endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e5f23dab1c8d..45fed4974c44 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -73,6 +73,8 @@ #include #include #include +#include + #include #include #include @@ -85,6 +87,7 @@ #include #include #include +#include #include /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ @@ -110,6 +113,8 @@ DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); bool arm64_use_ng_mappings = false; EXPORT_SYMBOL(arm64_use_ng_mappings); +DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; + /* * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs * support it? @@ -1590,6 +1595,12 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) int cpu = smp_processor_id(); + if (__this_cpu_read(this_cpu_vector) == vectors) { + const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); + + __this_cpu_write(this_cpu_vector, v); + } + /* * We don't need to rewrite the page-tables if either we've done * it already or we have KASLR enabled and therefore have not diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8da732fefd8f..a62fee121138 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -38,7 +38,6 @@ .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 .Lventry_start\@: -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 .if \el == 0 /* * This must be the first instruction of the EL0 vector entries. It is @@ -53,7 +52,6 @@ .endif .Lskip_tramp_vectors_cleanup\@: .endif -#endif sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK @@ -712,10 +710,10 @@ alternative_else_nop_endif .endm .macro tramp_exit, regsize = 64 - adr x30, tramp_vectors -#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY - add x30, x30, SZ_4K -#endif + tramp_data_read_var x30, this_cpu_vector + get_this_cpu_offset x29 + ldr x30, [x30, x29] + msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -775,6 +773,8 @@ __entry_tramp_data_vectors: __entry_tramp_data___sdei_asm_handler: .quad __sdei_asm_handler #endif /* CONFIG_ARM_SDE_INTERFACE */ +__entry_tramp_data_this_cpu_vector: + .quad this_cpu_vector SYM_DATA_END(__entry_tramp_data_start) .popsection // .rodata #endif /* CONFIG_RANDOMIZE_BASE */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 11d053fdd604..54af47005e45 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,8 @@ #include #include #include +#include +#include /* VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); @@ -67,7 +70,7 @@ NOKPROBE_SYMBOL(__activate_traps); static void __deactivate_traps(struct kvm_vcpu *vcpu) { - extern char vectors[]; /* kernel exception vectors */ + const char *host_vectors = vectors; ___deactivate_traps(vcpu); @@ -81,7 +84,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); - write_sysreg(vectors, vbar_el1); + + if (!arm64_kernel_unmapped_at_el0()) + host_vectors = __this_cpu_read(this_cpu_vector); + write_sysreg(host_vectors, vbar_el1); } NOKPROBE_SYMBOL(__deactivate_traps); From dee435be76f4117410bbd90573a881fd33488f37 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Feb 2022 16:08:13 +0000 Subject: [PATCH 167/773] arm64: proton-pack: Report Spectre-BHB vulnerabilities as part of Spectre-v2 Speculation attacks against some high-performance processors can make use of branch history to influence future speculation as part of a spectre-v2 attack. This is not mitigated by CSV2, meaning CPUs that previously reported 'Not affected' are now moderately mitigated by CSV2. Update the value in /sys/devices/system/cpu/vulnerabilities/spectre_v2 to also show the state of the BHB mitigation. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/spectre.h | 2 ++ arch/arm64/kernel/proton-pack.c | 36 ++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index f62ca39da6c5..c617fe90a843 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -93,5 +93,7 @@ void spectre_v4_enable_task_mitigation(struct task_struct *tsk); enum mitigation_state arm64_get_meltdown_state(void); +enum mitigation_state arm64_get_spectre_bhb_state(void); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 6a5eeb8beea3..8a499b8373c0 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -96,14 +96,39 @@ static bool spectre_v2_mitigations_off(void) return ret; } +static const char *get_bhb_affected_string(enum mitigation_state bhb_state) +{ + switch (bhb_state) { + case SPECTRE_UNAFFECTED: + return ""; + default: + case SPECTRE_VULNERABLE: + return ", but not BHB"; + case SPECTRE_MITIGATED: + return ", BHB"; + } +} + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { + enum mitigation_state bhb_state = arm64_get_spectre_bhb_state(); + const char *bhb_str = get_bhb_affected_string(bhb_state); + const char *v2_str = "Branch predictor hardening"; + switch (spectre_v2_state) { case SPECTRE_UNAFFECTED: - return sprintf(buf, "Not affected\n"); + if (bhb_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "Not affected\n"); + + /* + * Platforms affected by Spectre-BHB can't report + * "Not affected" for Spectre-v2. + */ + v2_str = "CSV2"; + fallthrough; case SPECTRE_MITIGATED: - return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); case SPECTRE_VULNERABLE: fallthrough; default: @@ -771,6 +796,13 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) } } +static enum mitigation_state spectre_bhb_state; + +enum mitigation_state arm64_get_spectre_bhb_state(void) +{ + return spectre_bhb_state; +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, From 610d086d6df0b15c3732a7b4a5b0f1c3e1b84d4c Mon Sep 17 00:00:00 2001 From: Deren Wu Date: Sun, 13 Feb 2022 00:20:15 +0800 Subject: [PATCH 168/773] mac80211: fix EAPoL rekey fail in 802.3 rx path mac80211 set capability NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211 to upper layer by default. That means we should pass EAPoL packets through nl80211 path only, and should not send the EAPoL skb to netdevice diretly. At the meanwhile, wpa_supplicant would not register sock to listen EAPoL skb on the netdevice. However, there is no control_port_protocol handler in mac80211 for 802.3 RX packets, mac80211 driver would pass up the EAPoL rekey frame to netdevice and wpa_supplicant would be never interactive with this kind of packets, if SUPPORTS_RX_DECAP_OFFLOAD is enabled. This causes STA always rekey fail if EAPoL frame go through 802.3 path. To avoid this problem, align the same process as 802.11 type to handle this frame before put it into network stack. This also addresses a potential security issue in 802.3 RX mode that was previously fixed in commit a8c4d76a8dd4 ("mac80211: do not accept/forward invalid EAPOL frames"). Cc: stable@vger.kernel.org # 5.12+ Fixes: 80a915ec4427 ("mac80211: add rx decapsulation offload support") Signed-off-by: Deren Wu Link: https://lore.kernel.org/r/6889c9fced5859ebb088564035f84fd0fa792a49.1644680751.git.deren.wu@mediatek.com [fix typos, update comment and add note about security issue] Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 93680af62c47..7b699b2f4d89 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2607,7 +2607,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, * address, so that the authenticator (e.g. hostapd) will see * the frame, but bridge won't forward it anywhere else. Note * that due to earlier filtering, the only other address can - * be the PAE group address. + * be the PAE group address, unless the hardware allowed them + * through in 802.3 offloaded mode. */ if (unlikely(skb->protocol == sdata->control_port_protocol && !ether_addr_equal(ehdr->h_dest, sdata->vif.addr))) @@ -4514,12 +4515,7 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, /* deliver to local stack */ skb->protocol = eth_type_trans(skb, fast_rx->dev); - memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->list) - list_add_tail(&skb->list, rx->list); - else - netif_receive_skb(skb); - + ieee80211_deliver_skb_to_local_stack(skb, rx); } static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, From a6bce78262f5dd4b50510f0aa47f3995f7b185f3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 3 Feb 2022 20:15:29 +0100 Subject: [PATCH 169/773] mac80211: refuse aggregations sessions before authorized If an MFP station isn't authorized, the receiver will (or at least should) drop the action frame since it's a robust management frame, but if we're not authorized we haven't installed keys yet. Refuse attempts to start a session as they'd just time out. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220203201528.ff4d5679dce9.I34bb1f2bc341e161af2d6faf74f91b332ba11285@changeid Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 74a878f213d3..1deb3d874a4b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include @@ -626,6 +626,14 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; } + if (test_sta_flag(sta, WLAN_STA_MFP) && + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { + ht_dbg(sdata, + "MFP STA not authorized - deny BA session request %pM tid %d\n", + sta->sta.addr, tid); + return -EINVAL; + } + /* * 802.11n-2009 11.5.1.1: If the initiating STA is an HT STA, is a * member of an IBSS, and has no other existing Block Ack agreement From 859ae7018316daa4adbc496012dcbbb458d7e510 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Mon, 14 Feb 2022 18:32:14 +0100 Subject: [PATCH 170/773] mac80211: fix forwarded mesh frames AC & queue selection There are two problems with the current code that have been highlighted with the AQL feature that is now enbaled by default. First problem is in ieee80211_rx_h_mesh_fwding(), ieee80211_select_queue_80211() is used on received packets to choose the sending AC queue of the forwarding packet although this function should only be called on TX packet (it uses ieee80211_tx_info). This ends with forwarded mesh packets been sent on unrelated random AC queue. To fix that, AC queue can directly be infered from skb->priority which has been extracted from QOS info (see ieee80211_parse_qos()). Second problem is the value of queue_mapping set on forwarded mesh frames via skb_set_queue_mapping() is not the AC of the packet but a hardware queue index. This may or may not work depending on AC to HW queue mapping which is driver specific. Both of these issues lead to improper AC selection while forwarding mesh packets but more importantly due to improper airtime accounting (which is done on a per STA, per AC basis) caused traffic stall with the introduction of AQL. Fixes: cf44012810cc ("mac80211: fix unnecessary frame drops in mesh fwding") Fixes: d3c1597b8d1b ("mac80211: fix forwarded mesh frame queue mapping") Co-developed-by: Remi Pommarel Signed-off-by: Remi Pommarel Signed-off-by: Nicolas Escande Link: https://lore.kernel.org/r/20220214173214.368862-1-nico.escande@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7b699b2f4d89..48d9553dafe3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2923,13 +2923,13 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) ether_addr_equal(sdata->vif.addr, hdr->addr3)) return RX_CONTINUE; - ac = ieee80211_select_queue_80211(sdata, skb, hdr); + ac = ieee802_1d_to_ac[skb->priority]; q = sdata->vif.hw_queue[ac]; if (ieee80211_queue_stopped(&local->hw, q)) { IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion); return RX_DROP_MONITOR; } - skb_set_queue_mapping(skb, q); + skb_set_queue_mapping(skb, ac); if (!--mesh_hdr->ttl) { if (!is_multicast_ether_addr(hdr->addr1)) From 7920af5c826cb4a7ada1ae26fdd317642805adc2 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sat, 12 Feb 2022 14:50:48 -0600 Subject: [PATCH 171/773] gpio: rockchip: Reset int_bothedge when changing trigger With v2 hardware, an IRQ can be configured to trigger on both edges via a bit in the int_bothedge register. Currently, the driver sets this bit when changing the trigger type to IRQ_TYPE_EDGE_BOTH, but fails to reset this bit if the trigger type is later changed to something else. This causes spurious IRQs, and when using gpio-keys with wakeup-event-action set to EV_ACT_(DE)ASSERTED, those IRQs translate into spurious wakeups. Fixes: 3bcbd1a85b68 ("gpio/rockchip: support next version gpio controller") Reported-by: Guillaume Savaton Tested-by: Guillaume Savaton Signed-off-by: Samuel Holland Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-rockchip.c | 56 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index a4c4e4584f5b..099e358d2491 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -410,10 +410,8 @@ static int rockchip_irq_set_type(struct irq_data *d, unsigned int type) level = rockchip_gpio_readl(bank, bank->gpio_regs->int_type); polarity = rockchip_gpio_readl(bank, bank->gpio_regs->int_polarity); - switch (type) { - case IRQ_TYPE_EDGE_BOTH: + if (type == IRQ_TYPE_EDGE_BOTH) { if (bank->gpio_type == GPIO_TYPE_V2) { - bank->toggle_edge_mode &= ~mask; rockchip_gpio_writel_bit(bank, d->hwirq, 1, bank->gpio_regs->int_bothedge); goto out; @@ -431,30 +429,34 @@ static int rockchip_irq_set_type(struct irq_data *d, unsigned int type) else polarity |= mask; } - break; - case IRQ_TYPE_EDGE_RISING: - bank->toggle_edge_mode &= ~mask; - level |= mask; - polarity |= mask; - break; - case IRQ_TYPE_EDGE_FALLING: - bank->toggle_edge_mode &= ~mask; - level |= mask; - polarity &= ~mask; - break; - case IRQ_TYPE_LEVEL_HIGH: - bank->toggle_edge_mode &= ~mask; - level &= ~mask; - polarity |= mask; - break; - case IRQ_TYPE_LEVEL_LOW: - bank->toggle_edge_mode &= ~mask; - level &= ~mask; - polarity &= ~mask; - break; - default: - ret = -EINVAL; - goto out; + } else { + if (bank->gpio_type == GPIO_TYPE_V2) { + rockchip_gpio_writel_bit(bank, d->hwirq, 0, + bank->gpio_regs->int_bothedge); + } else { + bank->toggle_edge_mode &= ~mask; + } + switch (type) { + case IRQ_TYPE_EDGE_RISING: + level |= mask; + polarity |= mask; + break; + case IRQ_TYPE_EDGE_FALLING: + level |= mask; + polarity &= ~mask; + break; + case IRQ_TYPE_LEVEL_HIGH: + level &= ~mask; + polarity |= mask; + break; + case IRQ_TYPE_LEVEL_LOW: + level &= ~mask; + polarity &= ~mask; + break; + default: + ret = -EINVAL; + goto out; + } } rockchip_gpio_writel(bank, level, bank->gpio_regs->int_type); From 25666e8ccd952627899b09b68f7c9b68cfeaf028 Mon Sep 17 00:00:00 2001 From: Lucas Zampieri Date: Wed, 26 Jan 2022 11:44:00 -0300 Subject: [PATCH 172/773] HID: logitech-dj: add new lightspeed receiver id As of logitech lightspeed receiver fw version 04.02.B0009, HIDPP_PARAM_DEVICE_INFO is being reported as 0x11. With patch "HID: logitech-dj: add support for the new lightspeed receiver iteration", the mouse starts to error out with: logitech-djreceiver: unusable device of type UNKNOWN (0x011) connected on slot 1 and becomes unusable. This has been noticed on a Logitech G Pro X Superlight fw MPM 25.01.B0018. Signed-off-by: Lucas Zampieri Acked-by: Nestor Lopez Casado Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-dj.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index 7106b921b53c..c358778e070b 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -1068,6 +1068,7 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, workitem.reports_supported |= STD_KEYBOARD; break; case 0x0f: + case 0x11: device_type = "eQUAD Lightspeed 1.2"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; From 0a5a587501b54e8c6d86960b047d4491fd40dcf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20H=C3=BCbner?= Date: Thu, 20 Jan 2022 08:40:48 +0100 Subject: [PATCH 173/773] HID: Add support for open wheel and no attachment to T300 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Different add ons to the wheel base report different models. Having no wheel mounted to the base and using the open wheel attachment is added here. Signed-off-by: Michael Hübner Signed-off-by: Jiri Kosina --- drivers/hid/hid-thrustmaster.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index 03b935ff02d5..a4e20f9e598b 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -64,7 +64,9 @@ struct tm_wheel_info { */ static const struct tm_wheel_info tm_wheels_infos[] = { {0x0306, 0x0006, "Thrustmaster T150RS"}, + {0x0200, 0x0005, "Thrustmaster T300RS (Missing Attachment)"}, {0x0206, 0x0005, "Thrustmaster T300RS"}, + {0x0209, 0x0005, "Thrustmaster T300RS (Open Wheel Attachment)"}, {0x0204, 0x0005, "Thrustmaster T300 Ferrari Alcantara Edition"}, {0x0002, 0x0002, "Thrustmaster T500RS"} //{0x0407, 0x0001, "Thrustmaster TMX"} From 184b58fa816fb5ee1854daf0d430766422bf2a77 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Mon, 7 Feb 2022 15:19:31 +0200 Subject: [PATCH 174/773] gpu: host1x: Always return syncpoint value when waiting The new TegraDRM UAPI uses syncpoint waiting with timeout set to zero to indicate reading the syncpoint value. To support that we need to return the syncpoint value always when waiting. Fixes: 44e961381354 ("drm/tegra: Implement syncpoint wait UAPI") Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding --- drivers/gpu/host1x/syncpt.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c index 83a16e46146b..f87a8705f518 100644 --- a/drivers/gpu/host1x/syncpt.c +++ b/drivers/gpu/host1x/syncpt.c @@ -234,27 +234,12 @@ int host1x_syncpt_wait(struct host1x_syncpt *sp, u32 thresh, long timeout, void *ref; struct host1x_waitlist *waiter; int err = 0, check_count = 0; - u32 val; if (value) - *value = 0; - - /* first check cache */ - if (host1x_syncpt_is_expired(sp, thresh)) { - if (value) - *value = host1x_syncpt_load(sp); + *value = host1x_syncpt_load(sp); + if (host1x_syncpt_is_expired(sp, thresh)) return 0; - } - - /* try to read from register */ - val = host1x_hw_syncpt_load(sp->host, sp); - if (host1x_syncpt_is_expired(sp, thresh)) { - if (value) - *value = val; - - goto done; - } if (!timeout) { err = -EAGAIN; From 9bdd10d57a8807dba0003af0325191f3cec0f11c Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 15 Feb 2022 14:06:45 +0100 Subject: [PATCH 175/773] ASoC: ops: Shift tested values in snd_soc_put_volsw() by +min While the $val/$val2 values passed in from userspace are always >= 0 integers, the limits of the control can be signed integers and the $min can be non-zero and less than zero. To correctly validate $val/$val2 against platform_max, add the $min offset to val first. Fixes: 817f7c9335ec0 ("ASoC: ops: Reject out of bounds values in snd_soc_put_volsw()") Signed-off-by: Marek Vasut Cc: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220215130645.164025-1-marex@denx.de Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 03ea9591fb16..a0ca58ba1627 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -319,7 +319,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (ucontrol->value.integer.value[0] < 0) return -EINVAL; val = ucontrol->value.integer.value[0]; - if (mc->platform_max && val > mc->platform_max) + if (mc->platform_max && ((int)val + min) > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; @@ -332,7 +332,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (ucontrol->value.integer.value[1] < 0) return -EINVAL; val2 = ucontrol->value.integer.value[1]; - if (mc->platform_max && val2 > mc->platform_max) + if (mc->platform_max && ((int)val2 + min) > mc->platform_max) return -EINVAL; if (val2 > max - min) return -EINVAL; From c5487b9cdea5c1ede38a7ec94db0fc59963c8e86 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 15 Feb 2022 09:05:14 -0300 Subject: [PATCH 176/773] ASoC: cs4265: Fix the duplicated control name Currently, the following error messages are seen during boot: asoc-simple-card sound: control 2:0:0:SPDIF Switch:0 is already present cs4265 1-004f: ASoC: failed to add widget SPDIF dapm kcontrol SPDIF Switch: -16 Quoting Mark Brown: "The driver is just plain buggy, it defines both a regular SPIDF Switch control and a SND_SOC_DAPM_SWITCH() called SPDIF both of which will create an identically named control, it can never have loaded without error. One or both of those has to be renamed or they need to be merged into one thing." Fix the duplicated control name by combining the two SPDIF controls here and move the register bits onto the DAPM widget and have DAPM control them. Fixes: f853d6b3ba34 ("ASoC: cs4265: Add a S/PDIF enable switch") Signed-off-by: Fabio Estevam Acked-by: Charles Keepax Link: https://lore.kernel.org/r/20220215120514.1760628-1-festevam@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs4265.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c index 4aaee1873a11..4415fb364d4d 100644 --- a/sound/soc/codecs/cs4265.c +++ b/sound/soc/codecs/cs4265.c @@ -150,7 +150,6 @@ static const struct snd_kcontrol_new cs4265_snd_controls[] = { SOC_SINGLE("E to F Buffer Disable Switch", CS4265_SPDIF_CTL1, 6, 1, 0), SOC_ENUM("C Data Access", cam_mode_enum), - SOC_SINGLE("SPDIF Switch", CS4265_SPDIF_CTL2, 5, 1, 1), SOC_SINGLE("Validity Bit Control Switch", CS4265_SPDIF_CTL2, 3, 1, 0), SOC_ENUM("SPDIF Mono/Stereo", spdif_mono_stereo_enum), @@ -186,7 +185,7 @@ static const struct snd_soc_dapm_widget cs4265_dapm_widgets[] = { SND_SOC_DAPM_SWITCH("Loopback", SND_SOC_NOPM, 0, 0, &loopback_ctl), - SND_SOC_DAPM_SWITCH("SPDIF", SND_SOC_NOPM, 0, 0, + SND_SOC_DAPM_SWITCH("SPDIF", CS4265_SPDIF_CTL2, 5, 1, &spdif_switch), SND_SOC_DAPM_SWITCH("DAC", CS4265_PWRCTL, 1, 1, &dac_switch), From bfe55a1f7fd6bfede16078bf04c6250fbca11588 Mon Sep 17 00:00:00 2001 From: Woody Suwalski Date: Wed, 9 Feb 2022 16:05:09 -0500 Subject: [PATCH 177/773] ACPI: processor: idle: fix lockup regression on 32-bit ThinkPad T40 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add and ACPI idle power level limit for 32-bit ThinkPad T40. There is a regression on T40 introduced by commit d6b88ce2, starting with kernel 5.16: commit d6b88ce2eb9d2698eb24451eb92c0a1649b17bb1 Author: Richard Gong Date:   Wed Sep 22 08:31:16 2021 -0500 ACPI: processor idle: Allow playing dead in C3 state The above patch is trying to enter C3 state during init, what is causing a T40 system freeze. I have not found a similar issue on any other of my 32-bit machines. The fix is to add another exception to the processor_power_dmi_table[] list. As a result the dmesg shows as expected: [2.155398] ACPI: IBM ThinkPad T40 detected - limiting to C2 max_cstate. Override with "processor.max_cstate=9" [2.155404] ACPI: processor limited to max C-state 2 The fix is trivial and affects only vintage T40 systems. Fixes: d6b88ce2eb9d ("CPI: processor idle: Allow playing dead in C3 state") Signed-off-by: Woody Suwalski Reviewed-by: Hans de Goede Cc: 5.16+ # 5.16+ [ rjw: New subject ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_idle.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 86560a28751b..f8e9fa82cb9b 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -96,6 +96,11 @@ static const struct dmi_system_id processor_power_dmi_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), DMI_MATCH(DMI_PRODUCT_NAME,"L8400B series Notebook PC")}, (void *)1}, + /* T40 can not handle C3 idle state */ + { set_max_cstate, "IBM ThinkPad T40", { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "23737CU")}, + (void *)2}, {}, }; From cc19db8b312a6c75645645f5cc1b45166b109006 Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Fri, 11 Feb 2022 08:13:44 +0800 Subject: [PATCH 178/773] MIPS: ralink: mt7621: do memory detection on KSEG1 It's reported that current memory detection code occasionally detects larger memory under some bootloaders. Current memory detection code tests whether address space wraps around on KSEG0, which is unreliable because it's cached. Rewrite memory size detection to perform the same test on KSEG1 instead. While at it, this patch also does the following two things: 1. use a fixed pattern instead of a random function pointer as the magic value. 2. add an additional memory write and a second comparison as part of the test to prevent possible smaller memory detection result due to leftover values in memory. Fixes: 139c949f7f0a MIPS: ("ralink: mt7621: add memory detection support") Reported-by: Rui Salvaterra Signed-off-by: Chuanhong Guo Tested-by: Sergio Paracuellos Tested-by: Rui Salvaterra Signed-off-by: Thomas Bogendoerfer --- arch/mips/ralink/mt7621.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index d6efffd4dd20..12c8808e0dea 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -22,7 +22,9 @@ #include "common.h" -static void *detect_magic __initdata = detect_memory_region; +#define MT7621_MEM_TEST_PATTERN 0xaa5555aa + +static u32 detect_magic __initdata; int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { @@ -58,24 +60,32 @@ phys_addr_t mips_cpc_default_phys_base(void) panic("Cannot detect cpc address"); } +static bool __init mt7621_addr_wraparound_test(phys_addr_t size) +{ + void *dm = (void *)KSEG1ADDR(&detect_magic); + + if (CPHYSADDR(dm + size) >= MT7621_LOWMEM_MAX_SIZE) + return true; + __raw_writel(MT7621_MEM_TEST_PATTERN, dm); + if (__raw_readl(dm) != __raw_readl(dm + size)) + return false; + __raw_writel(!MT7621_MEM_TEST_PATTERN, dm); + return __raw_readl(dm) == __raw_readl(dm + size); +} + static void __init mt7621_memory_detect(void) { - void *dm = &detect_magic; phys_addr_t size; - for (size = 32 * SZ_1M; size < 256 * SZ_1M; size <<= 1) { - if (!__builtin_memcmp(dm, dm + size, sizeof(detect_magic))) - break; + for (size = 32 * SZ_1M; size <= 256 * SZ_1M; size <<= 1) { + if (mt7621_addr_wraparound_test(size)) { + memblock_add(MT7621_LOWMEM_BASE, size); + return; + } } - if ((size == 256 * SZ_1M) && - (CPHYSADDR(dm + size) < MT7621_LOWMEM_MAX_SIZE) && - __builtin_memcmp(dm, dm + size, sizeof(detect_magic))) { - memblock_add(MT7621_LOWMEM_BASE, MT7621_LOWMEM_MAX_SIZE); - memblock_add(MT7621_HIGHMEM_BASE, MT7621_HIGHMEM_SIZE); - } else { - memblock_add(MT7621_LOWMEM_BASE, size); - } + memblock_add(MT7621_LOWMEM_BASE, MT7621_LOWMEM_MAX_SIZE); + memblock_add(MT7621_HIGHMEM_BASE, MT7621_HIGHMEM_SIZE); } void __init ralink_of_remap(void) From f2703def339c793674010cc9f01bfe4980231808 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 12 Feb 2022 22:21:11 +0000 Subject: [PATCH 179/773] MIPS: smp: fill in sibling and core maps earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After enabling CONFIG_SCHED_CORE (landed during 5.14 cycle), 2-core 2-thread-per-core interAptiv (CPS-driven) started emitting the following: [ 0.025698] CPU1 revision is: 0001a120 (MIPS interAptiv (multi)) [ 0.048183] ------------[ cut here ]------------ [ 0.048187] WARNING: CPU: 1 PID: 0 at kernel/sched/core.c:6025 sched_core_cpu_starting+0x198/0x240 [ 0.048220] Modules linked in: [ 0.048233] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.17.0-rc3+ #35 b7b319f24073fd9a3c2aa7ad15fb7993eec0b26f [ 0.048247] Stack : 817f0000 00000004 327804c8 810eb050 00000000 00000004 00000000 c314fdd1 [ 0.048278] 830cbd64 819c0000 81800000 817f0000 83070bf4 00000001 830cbd08 00000000 [ 0.048307] 00000000 00000000 815fcbc4 00000000 00000000 00000000 00000000 00000000 [ 0.048334] 00000000 00000000 00000000 00000000 817f0000 00000000 00000000 817f6f34 [ 0.048361] 817f0000 818a3c00 817f0000 00000004 00000000 00000000 4dc33260 0018c933 [ 0.048389] ... [ 0.048396] Call Trace: [ 0.048399] [<8105a7bc>] show_stack+0x3c/0x140 [ 0.048424] [<8131c2a0>] dump_stack_lvl+0x60/0x80 [ 0.048440] [<8108b5c0>] __warn+0xc0/0xf4 [ 0.048454] [<8108b658>] warn_slowpath_fmt+0x64/0x10c [ 0.048467] [<810bd418>] sched_core_cpu_starting+0x198/0x240 [ 0.048483] [<810c6514>] sched_cpu_starting+0x14/0x80 [ 0.048497] [<8108c0f8>] cpuhp_invoke_callback_range+0x78/0x140 [ 0.048510] [<8108d914>] notify_cpu_starting+0x94/0x140 [ 0.048523] [<8106593c>] start_secondary+0xbc/0x280 [ 0.048539] [ 0.048543] ---[ end trace 0000000000000000 ]--- [ 0.048636] Synchronize counters for CPU 1: done. ...for each but CPU 0/boot. Basic debug printks right before the mentioned line say: [ 0.048170] CPU: 1, smt_mask: So smt_mask, which is sibling mask obviously, is empty when entering the function. This is critical, as sched_core_cpu_starting() calculates core-scheduling parameters only once per CPU start, and it's crucial to have all the parameters filled in at that moment (at least it uses cpu_smt_mask() which in fact is `&cpu_sibling_map[cpu]` on MIPS). A bit of debugging led me to that set_cpu_sibling_map() performing the actual map calculation, was being invocated after notify_cpu_start(), and exactly the latter function starts CPU HP callback round (sched_core_cpu_starting() is basically a CPU HP callback). While the flow is same on ARM64 (maps after the notifier, although before calling set_cpu_online()), x86 started calculating sibling maps earlier than starting the CPU HP callbacks in Linux 4.14 (see [0] for the reference). Neither me nor my brief tests couldn't find any potential caveats in calculating the maps right after performing delay calibration, but the WARN splat is now gone. The very same debug prints now yield exactly what I expected from them: [ 0.048433] CPU: 1, smt_mask: 0-1 [0] https://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git/commit/?id=76ce7cfe35ef Signed-off-by: Alexander Lobakin Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index d542fb7af3ba..1986d1309410 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -351,6 +351,9 @@ asmlinkage void start_secondary(void) cpu = smp_processor_id(); cpu_data[cpu].udelay_val = loops_per_jiffy; + set_cpu_sibling_map(cpu); + set_cpu_core_map(cpu); + cpumask_set_cpu(cpu, &cpu_coherent_mask); notify_cpu_starting(cpu); @@ -362,9 +365,6 @@ asmlinkage void start_secondary(void) /* The CPU is running and counters synchronised, now mark it online */ set_cpu_online(cpu, true); - set_cpu_sibling_map(cpu); - set_cpu_core_map(cpu); - calculate_cpu_foreign_map(); /* From d19e0183a88306acda07f4a01fedeeffe2a2a06b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Feb 2022 18:05:18 -0500 Subject: [PATCH 180/773] NFS: Do not report writeback errors in nfs_getattr() The result of the writeback, whether it is an ENOSPC or an EIO, or anything else, does not inhibit the NFS client from reporting the correct file timestamps. Fixes: 79566ef018f5 ("NFS: Getattr doesn't require data sync semantics") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a918c3a834b6..d96baa4450e3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -853,12 +853,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, } /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME|STATX_MTIME)) && - S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; - } + if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + S_ISREG(inode->i_mode)) + filemap_write_and_wait(inode->i_mapping); /* * We may force a getattr if the user cares about atime. From 45ce4b4f9009102cd9f581196d480a59208690c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 17 Feb 2022 01:49:43 +0530 Subject: [PATCH 181/773] bpf: Fix crash due to out of bounds access into reg2btf_ids. When commit e6ac2450d6de ("bpf: Support bpf program calling kernel function") added kfunc support, it defined reg2btf_ids as a cheap way to translate the verifier reg type to the appropriate btf_vmlinux BTF ID, however commit c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") moved the __BPF_REG_TYPE_MAX from the last member of bpf_reg_type enum to after the base register types, and defined other variants using type flag composition. However, now, the direct usage of reg->type to index into reg2btf_ids may no longer fall into __BPF_REG_TYPE_MAX range, and hence lead to out of bounds access and kernel crash on dereference of bad pointer. Fixes: c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220216201943.624869-1-memxor@gmail.com --- kernel/bpf/btf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e16dafeb2450..3e23b3fa79ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5688,7 +5688,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } if (check_ptr_off_reg(env, reg, regno)) return -EINVAL; - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || + (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5706,7 +5707,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_id = reg->btf_id; } else { reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[reg->type]; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, From 53923e0fe2098f90f339510aeaa0e1413ae99a16 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 16 Feb 2022 13:23:53 -0600 Subject: [PATCH 182/773] cifs: fix confusing unneeded warning message on smb2.1 and earlier When mounting with SMB2.1 or earlier, even with nomultichannel, we log the confusing warning message: "CIFS: VFS: multichannel is not supported on this protocol version, use 3.0 or above" Fix this so that we don't log this unless they really are trying to mount with multichannel. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215608 Reported-by: Kim Scarborough Cc: stable@vger.kernel.org # 5.11+ Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/sess.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5723d50340e5..32f478c7a66d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -127,11 +127,6 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) struct cifs_server_iface *ifaces = NULL; size_t iface_count; - if (ses->server->dialect < SMB30_PROT_ID) { - cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); - return 0; - } - spin_lock(&ses->chan_lock); new_chan_count = old_chan_count = ses->chan_count; @@ -145,6 +140,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) return 0; } + if (ses->server->dialect < SMB30_PROT_ID) { + spin_unlock(&ses->chan_lock); + cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); + return 0; + } + if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { ses->chan_max = 1; spin_unlock(&ses->chan_lock); From 16693c1b2d98cebc8dedf03b49d1053cf1826c86 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Wed, 16 Feb 2022 09:17:43 +0000 Subject: [PATCH 183/773] drm/tegra: Fix cast to restricted __le32 Sparse warns about the following cast in the function falcon_copy_firmware_image() ... drivers/gpu/drm/tegra/falcon.c:66:27: warning: cast to restricted __le32 Fix this by casting the firmware data array to __le32 instead of u32. Signed-off-by: Jon Hunter Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/falcon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tegra/falcon.c b/drivers/gpu/drm/tegra/falcon.c index 223ab2ceb7e6..3762d87759d9 100644 --- a/drivers/gpu/drm/tegra/falcon.c +++ b/drivers/gpu/drm/tegra/falcon.c @@ -63,7 +63,7 @@ static void falcon_copy_firmware_image(struct falcon *falcon, /* copy the whole thing taking into account endianness */ for (i = 0; i < firmware->size / sizeof(u32); i++) - virt[i] = le32_to_cpu(((u32 *)firmware->data)[i]); + virt[i] = le32_to_cpu(((__le32 *)firmware->data)[i]); } static int falcon_parse_firmware_image(struct falcon *falcon) From acd289e04a0a1f52bea7ff1129b365626059e3c2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Feb 2022 14:27:54 +0100 Subject: [PATCH 184/773] ALSA: hda: Set max DMA segment size The recent code refactoring to use the standard DMA helper requires the max DMA segment size setup for SG list management. Without it, the kernel may spew warnings when a large buffer is allocated. This patch sets up dma_set_max_seg_size() for avoiding spurious warnings. Fixes: 2c95b92ecd92 ("ALSA: memalloc: Unify x86 SG-buffer handling (take#3)") Cc: BugLink: https://github.com/thesofproject/linux/issues/3430 Link: https://lore.kernel.org/r/20220215132756.31236-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 917ad9d375b1..572ff0d1fafe 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1941,6 +1941,7 @@ static int azx_first_init(struct azx *chip) dma_bits = 32; if (dma_set_mask_and_coherent(&pci->dev, DMA_BIT_MASK(dma_bits))) dma_set_mask_and_coherent(&pci->dev, DMA_BIT_MASK(32)); + dma_set_max_seg_size(&pci->dev, UINT_MAX); /* read number of streams from GCAP register instead of using * hardcoded value From 8872fc0d04592925b74ad9ab1b5686f4e016befe Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Feb 2022 14:27:55 +0100 Subject: [PATCH 185/773] ASoC: SOF: hda: Set max DMA segment size The recent code refactoring to use the standard DMA helper requires the max DMA segment size setup for SG list management. Without it, the kernel may spew warnings when a large buffer is allocated. This patch sets up dma_set_max_seg_size() for avoiding spurious warnings. Fixes: 2c95b92ecd92 ("ALSA: memalloc: Unify x86 SG-buffer handling (take#3)") Acked-by: Mark Brown Cc: BugLink: https://github.com/thesofproject/linux/issues/3430 Link: https://lore.kernel.org/r/20220215132756.31236-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/soc/sof/intel/hda.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index c8fb082209ce..1385695d7745 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -956,6 +956,7 @@ int hda_dsp_probe(struct snd_sof_dev *sdev) dev_dbg(sdev->dev, "DMA mask is 32 bit\n"); dma_set_mask_and_coherent(&pci->dev, DMA_BIT_MASK(32)); } + dma_set_max_seg_size(&pci->dev, UINT_MAX); /* init streams */ ret = hda_dsp_stream_init(sdev); From c22a8086b384025ab97ce07465420a219697d3f2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Feb 2022 14:27:56 +0100 Subject: [PATCH 186/773] ASoC: intel: skylake: Set max DMA segment size The recent code refactoring to use the standard DMA helper requires the max DMA segment size setup for SG list management. Without it, the kernel may spew warnings when a large buffer is allocated. This patch sets up dma_set_max_seg_size() for avoiding spurious warnings. Fixes: 2c95b92ecd92 ("ALSA: memalloc: Unify x86 SG-buffer handling (take#3)") Acked-by: Cezary Rojewski Acked-by: Mark Brown Cc: BugLink: https://github.com/thesofproject/linux/issues/3430 Link: https://lore.kernel.org/r/20220215132756.31236-4-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/soc/intel/skylake/skl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/intel/skylake/skl.c b/sound/soc/intel/skylake/skl.c index 148ddf4cace0..aeca58246fc7 100644 --- a/sound/soc/intel/skylake/skl.c +++ b/sound/soc/intel/skylake/skl.c @@ -952,6 +952,7 @@ static int skl_first_init(struct hdac_bus *bus) /* allow 64bit DMA address if supported by H/W */ if (dma_set_mask_and_coherent(bus->dev, DMA_BIT_MASK(64))) dma_set_mask_and_coherent(bus->dev, DMA_BIT_MASK(32)); + dma_set_max_seg_size(bus->dev, UINT_MAX); /* initialize streams */ snd_hdac_ext_stream_init_all From ac89895213d8950dba6ab342863a0959f73142a7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 17 Feb 2022 14:13:49 +0100 Subject: [PATCH 187/773] HID: elo: Revert USB reference counting Commit 817b8b9c539 ("HID: elo: fix memory leak in elo_probe") introduced memory leak on error path, but more importantly the whole USB reference counting is not needed at all in the first place, as the driver itself doesn't change the reference counting in any way, and the associated usb_device is guaranteed to be kept around by USB core as long as the driver binding exists. Reported-by: Alan Stern Reported-by: Dan Carpenter Fixes: fbf42729d0e ("HID: elo: update the reference count of the usb device structure") Fixes: 817b8b9c539 ("HID: elo: fix memory leak in elo_probe") Signed-off-by: Jiri Kosina --- drivers/hid/hid-elo.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/hid/hid-elo.c b/drivers/hid/hid-elo.c index 9b42b0cdeef0..2876cb6a7dca 100644 --- a/drivers/hid/hid-elo.c +++ b/drivers/hid/hid-elo.c @@ -228,7 +228,6 @@ static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct elo_priv *priv; int ret; - struct usb_device *udev; if (!hid_is_usb(hdev)) return -EINVAL; @@ -238,8 +237,7 @@ static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) return -ENOMEM; INIT_DELAYED_WORK(&priv->work, elo_work); - udev = interface_to_usbdev(to_usb_interface(hdev->dev.parent)); - priv->usbdev = usb_get_dev(udev); + priv->usbdev = interface_to_usbdev(to_usb_interface(hdev->dev.parent)); hid_set_drvdata(hdev, priv); @@ -262,7 +260,6 @@ static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) return 0; err_free: - usb_put_dev(udev); kfree(priv); return ret; } @@ -271,8 +268,6 @@ static void elo_remove(struct hid_device *hdev) { struct elo_priv *priv = hid_get_drvdata(hdev); - usb_put_dev(priv->usbdev); - hid_hw_stop(hdev); cancel_delayed_work_sync(&priv->work); kfree(priv); From a867e9d0cc15039a6ef72e17e2603303dcd1783f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 17 Feb 2022 10:12:42 +0000 Subject: [PATCH 188/773] KVM: arm64: Don't miss pending interrupts for suspended vCPU In order to properly emulate the WFI instruction, KVM reads back ICH_VMCR_EL2 and enables doorbells for GICv4. These preparations are necessary in order to recognize pending interrupts in kvm_arch_vcpu_runnable() and return to the guest. Until recently, this work was done by kvm_arch_vcpu_{blocking,unblocking}(). Since commit 6109c5a6ab7f ("KVM: arm64: Move vGIC v4 handling for WFI out arch callback hook"), these callbacks were gutted and superseded by kvm_vcpu_wfi(). It is important to note that KVM implements PSCI CPU_SUSPEND calls as a WFI within the guest. However, the implementation calls directly into kvm_vcpu_halt(), which skips the needed work done in kvm_vcpu_wfi() to detect pending interrupts. Fix the issue by calling the WFI helper. Fixes: 6109c5a6ab7f ("KVM: arm64: Move vGIC v4 handling for WFI out arch callback hook") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220217101242.3013716-1-oupton@google.com --- arch/arm64/kvm/psci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 3eae32876897..2ce60fecd861 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -46,8 +46,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) * specification (ARM DEN 0022A). This means all suspend states * for KVM will preserve the register state. */ - kvm_vcpu_halt(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); + kvm_vcpu_wfi(vcpu); return PSCI_RET_SUCCESS; } From 127770ac0d043435375ab86434f31a93efa88215 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 14 Feb 2022 21:29:51 +0000 Subject: [PATCH 189/773] KVM: x86: Add KVM_CAP_ENABLE_CAP to x86 Follow the precedent set by other architectures that support the VCPU ioctl, KVM_ENABLE_CAP, and advertise the VM extension, KVM_CAP_ENABLE_CAP. This way, userspace can ensure that KVM_ENABLE_CAP is available on a vcpu before using it. Fixes: 5c919412fe61 ("kvm/x86: Hyper-V synthetic interrupt controller") Signed-off-by: Aaron Lewis Message-Id: <20220214212950.1776943-1-aaronlewis@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 +- arch/x86/kvm/x86.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a4267104db50..3b4da6c7b25f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1394,7 +1394,7 @@ documentation when it pops into existence). ------------------- :Capability: KVM_CAP_ENABLE_CAP -:Architectures: mips, ppc, s390 +:Architectures: mips, ppc, s390, x86 :Type: vcpu ioctl :Parameters: struct kvm_enable_cap (in) :Returns: 0 on success; -1 on error diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 641044db415d..b8bd8da32045 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4233,6 +4233,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_ENABLE_CAP: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: From 4cb9a998b1ce25fad74a82f5a5c45a4ef40de337 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 15 Feb 2022 02:15:42 -0800 Subject: [PATCH 190/773] KVM: Fix lockdep false negative during host resume I saw the below splatting after the host suspended and resumed. WARNING: CPU: 0 PID: 2943 at kvm/arch/x86/kvm/../../../virt/kvm/kvm_main.c:5531 kvm_resume+0x2c/0x30 [kvm] CPU: 0 PID: 2943 Comm: step_after_susp Tainted: G W IOE 5.17.0-rc3+ #4 RIP: 0010:kvm_resume+0x2c/0x30 [kvm] Call Trace: syscore_resume+0x90/0x340 suspend_devices_and_enter+0xaee/0xe90 pm_suspend.cold+0x36b/0x3c2 state_store+0x82/0xf0 kernfs_fop_write_iter+0x1b6/0x260 new_sync_write+0x258/0x370 vfs_write+0x33f/0x510 ksys_write+0xc9/0x160 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae lockdep_is_held() can return -1 when lockdep is disabled which triggers this warning. Let's use lockdep_assert_not_held() which can detect incorrect calls while holding a lock and it also avoids false negatives when lockdep is disabled. Signed-off-by: Wanpeng Li Message-Id: <1644920142-81249-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 58d31da8a2f7..0afc016cc54d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5528,9 +5528,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { if (kvm_usage_count) { -#ifdef CONFIG_LOCKDEP - WARN_ON(lockdep_is_held(&kvm_count_lock)); -#endif + lockdep_assert_not_held(&kvm_count_lock); hardware_enable_nolock(NULL); } } From 3a55f729240a686aa8af00af436306c0cd532522 Mon Sep 17 00:00:00 2001 From: Anton Romanov Date: Wed, 16 Feb 2022 18:26:54 +0000 Subject: [PATCH 191/773] kvm: x86: Disable KVM_HC_CLOCK_PAIRING if tsc is in always catchup mode If vcpu has tsc_always_catchup set each request updates pvclock data. KVM_HC_CLOCK_PAIRING consumers such as ptp_kvm_x86 rely on tsc read on host's side and do hypercall inside pvclock_read_retry loop leading to infinite loop in such situation. v3: Removed warn Changed return code to KVM_EFAULT v2: Added warn Signed-off-by: Anton Romanov Message-Id: <20220216182653.506850-1-romanton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b8bd8da32045..3471efe47e5f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8943,6 +8943,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) return -KVM_EOPNOTSUPP; + /* + * When tsc is in permanent catchup mode guests won't be able to use + * pvclock_read_retry loop to get consistent view of pvclock + */ + if (vcpu->arch.tsc_always_catchup) + return -KVM_EOPNOTSUPP; + if (!kvm_get_walltime_and_clockread(&ts, &cycle)) return -KVM_EOPNOTSUPP; From cc8f7fe1f5eab010191aa4570f27641876fa1267 Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Wed, 16 Feb 2022 16:40:38 +0800 Subject: [PATCH 192/773] block-map: add __GFP_ZERO flag for alloc_page in function bio_copy_kern Add __GFP_ZERO flag for alloc_page in function bio_copy_kern to initialize the buffer of a bio. Signed-off-by: Haimin Zhang Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220216084038.15635-1-tcs.kernel@gmail.com Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-map.c b/block/blk-map.c index 4526adde0156..c7f71d83eff1 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -446,7 +446,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (bytes > len) bytes = len; - page = alloc_page(GFP_NOIO | gfp_mask); + page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; From 7a5428dcb7902700b830e912feee4e845df7c019 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Feb 2022 08:52:31 +0100 Subject: [PATCH 193/773] block: fix surprise removal for drivers calling blk_set_queue_dying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various block drivers call blk_set_queue_dying to mark a disk as dead due to surprise removal events, but since commit 8e141f9eb803 that doesn't work given that the GD_DEAD flag needs to be set to stop I/O. Replace the driver calls to blk_set_queue_dying with a new (and properly documented) blk_mark_disk_dead API, and fold blk_set_queue_dying into the only remaining caller. Fixes: 8e141f9eb803 ("block: drain file system I/O on del_gendisk") Reported-by: Markus Blöchl Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220217075231.1140-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 10 ++-------- block/genhd.c | 14 ++++++++++++++ drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/md/dm.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 2 +- include/linux/blkdev.h | 3 ++- 9 files changed, 24 insertions(+), 15 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d93e3bb9a769..1039515c99d6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -284,13 +284,6 @@ void blk_queue_start_drain(struct request_queue *q) wake_up_all(&q->mq_freeze_wq); } -void blk_set_queue_dying(struct request_queue *q) -{ - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - blk_queue_start_drain(q); -} -EXPORT_SYMBOL_GPL(blk_set_queue_dying); - /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -308,7 +301,8 @@ void blk_cleanup_queue(struct request_queue *q) WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ - blk_set_queue_dying(q); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); diff --git a/block/genhd.c b/block/genhd.c index 626c8406f21a..9eca1f7d35c9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -548,6 +548,20 @@ out_free_ext_minor: } EXPORT_SYMBOL(device_add_disk); +/** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) +{ + set_bit(GD_DEAD, &disk->state); + blk_queue_start_drain(disk->queue); +} +EXPORT_SYMBOL_GPL(blk_mark_disk_dead); + /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index e6005c232328..2b588b62cbbb 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4112,7 +4112,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } - blk_set_queue_dying(dd->queue); + blk_mark_disk_dead(dd->disk); set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); /* Clean up the block layer. */ diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4203cdab8abf..b844432bad20 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7185,7 +7185,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, * IO to complete/fail. */ blk_mq_freeze_queue(rbd_dev->disk->queue); - blk_set_queue_dying(rbd_dev->disk->queue); + blk_mark_disk_dead(rbd_dev->disk); } del_gendisk(rbd_dev->disk); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ccd0dd0c6b83..ca71a0585333 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2126,7 +2126,7 @@ static void blkfront_closing(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - blk_set_queue_dying(info->rq); + blk_mark_disk_dead(info->gd); set_capacity(info->gd, 0); for_each_rinfo(info, rinfo, i) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dcbd6d201619..997ace47bbd5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2077,7 +2077,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - blk_set_queue_dying(md->queue); + blk_mark_disk_dead(md->disk); /* * Take suspend_lock so that presuspend and postsuspend methods diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 79005ea1a33e..469f23186159 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4574,7 +4574,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; - blk_set_queue_dying(ns->queue); + blk_mark_disk_dead(ns->disk); nvme_start_ns_queue(ns); set_capacity_and_notify(ns->disk, 0); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f8bf6606eb2f..ff775235534c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -848,7 +848,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - blk_set_queue_dying(head->disk->queue); + blk_mark_disk_dead(head->disk); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); flush_work(&head->requeue_work); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f35aea98bc35..16b47035e4b0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -748,7 +748,8 @@ extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); -extern void blk_set_queue_dying(struct request_queue *); + +void blk_mark_disk_dead(struct gendisk *disk); #ifdef CONFIG_BLOCK /* From e92bc4cd34de2ce454bdea8cd198b8067ee4e123 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Sat, 22 Jan 2022 19:10:45 +0800 Subject: [PATCH 194/773] block/wbt: fix negative inflight counter when remove scsi device Now that we disable wbt by set WBT_STATE_OFF_DEFAULT in wbt_disable_default() when switch elevator to bfq. And when we remove scsi device, wbt will be enabled by wbt_enable_default. If it become false positive between wbt_wait() and wbt_track() when submit write request. The following is the scenario that triggered the problem. T1 T2 T3 elevator_switch_mq bfq_init_queue wbt_disable_default <= Set rwb->enable_state (OFF) Submit_bio blk_mq_make_request rq_qos_throttle <= rwb->enable_state (OFF) scsi_remove_device sd_remove del_gendisk blk_unregister_queue elv_unregister_queue wbt_enable_default <= Set rwb->enable_state (ON) q_qos_track <= rwb->enable_state (ON) ^^^^^^ this request will mark WBT_TRACKED without inflight add and will lead to drop rqw->inflight to -1 in wbt_done() which will trigger IO hung. Fix this by move wbt_enable_default() from elv_unregister to bfq_exit_queue(). Only re-enable wbt when bfq exit. Fixes: 76a8040817b4b ("blk-wbt: make sure throttle is enabled properly") Remove oneline stale comment, and kill one oneshot local variable. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/linux-block/20211214133103.551813-1-qiulaibin@huawei.com/ Signed-off-by: Laibin Qiu Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 ++ block/elevator.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0c612a911696..36a66e97e3c2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7018,6 +7018,8 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_unlock_irq(&bfqd->lock); #endif + wbt_enable_default(bfqd->queue); + kfree(bfqd); } diff --git a/block/elevator.c b/block/elevator.c index ec98aed39c4f..482df2a350fc 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -525,8 +525,6 @@ void elv_unregister_queue(struct request_queue *q) kobject_del(&e->kobj); e->registered = 0; - /* Re-enable throttling in case elevator disabled it */ - wbt_enable_default(q); } } From aba2081e0a9c977396124aa6df93b55ed5912b19 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Feb 2022 11:22:04 -0700 Subject: [PATCH 195/773] tps6598x: clear int mask on probe failure The interrupt mask is enabled before any potential failure points in the driver, which can leave a failure path where we exit with interrupts enabled but the device not live. This causes an infinite stream of interrupts on an Apple M1 Pro laptop on USB-C. Add a failure label that's used post enabling interrupts, where we mask them again before returning an error. Suggested-by: Sven Peter Cc: stable Reviewed-by: Heikki Krogerus Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/e6b80669-20f3-06e7-9ed5-8951a9c6db6f@kernel.dk Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tipd/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index 6d27a5b5e3ca..7ffcda94d323 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -761,12 +761,12 @@ static int tps6598x_probe(struct i2c_client *client) ret = tps6598x_read32(tps, TPS_REG_STATUS, &status); if (ret < 0) - return ret; + goto err_clear_mask; trace_tps6598x_status(status); ret = tps6598x_read32(tps, TPS_REG_SYSTEM_CONF, &conf); if (ret < 0) - return ret; + goto err_clear_mask; /* * This fwnode has a "compatible" property, but is never populated as a @@ -855,7 +855,8 @@ err_role_put: usb_role_switch_put(tps->role_sw); err_fwnode_put: fwnode_handle_put(fwnode); - +err_clear_mask: + tps6598x_write64(tps, TPS_REG_INT_MASK1, 0); return ret; } From ad856280ddea3401e1f5060ef20e6de9f6122c76 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 17 Feb 2022 02:30:29 -0300 Subject: [PATCH 196/773] x86/kvm/fpu: Limit guest user_xfeatures to supported bits of XCR0 During host/guest switch (like in kvm_arch_vcpu_ioctl_run()), the kernel swaps the fpu between host/guest contexts, by using fpu_swap_kvm_fpstate(). When xsave feature is available, the fpu swap is done by: - xsave(s) instruction, with guest's fpstate->xfeatures as mask, is used to store the current state of the fpu registers to a buffer. - xrstor(s) instruction, with (fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE) as mask, is used to put the buffer into fpu regs. For xsave(s) the mask is used to limit what parts of the fpu regs will be copied to the buffer. Likewise on xrstor(s), the mask is used to limit what parts of the fpu regs will be changed. The mask for xsave(s), the guest's fpstate->xfeatures, is defined on kvm_arch_vcpu_create(), which (in summary) sets it to all features supported by the cpu which are enabled on kernel config. This means that xsave(s) will save to guest buffer all the fpu regs contents the cpu has enabled when the guest is paused, even if they are not used. This would not be an issue, if xrstor(s) would also do that. xrstor(s)'s mask for host/guest swap is basically every valid feature contained in kernel config, except XFEATURE_MASK_PKRU. Accordingto kernel src, it is instead switched in switch_to() and flush_thread(). Then, the following happens with a host supporting PKRU starts a guest that does not support it: 1 - Host has XFEATURE_MASK_PKRU set. 1st switch to guest, 2 - xsave(s) fpu regs to host fpustate (buffer has XFEATURE_MASK_PKRU) 3 - xrstor(s) guest fpustate to fpu regs (fpu regs have XFEATURE_MASK_PKRU) 4 - guest runs, then switch back to host, 5 - xsave(s) fpu regs to guest fpstate (buffer now have XFEATURE_MASK_PKRU) 6 - xrstor(s) host fpstate to fpu regs. 7 - kvm_vcpu_ioctl_x86_get_xsave() copy guest fpstate to userspace (with XFEATURE_MASK_PKRU, which should not be supported by guest vcpu) On 5, even though the guest does not support PKRU, it does have the flag set on guest fpstate, which is transferred to userspace via vcpu ioctl KVM_GET_XSAVE. This becomes a problem when the user decides on migrating the above guest to another machine that does not support PKRU: the new host restores guest's fpu regs to as they were before (xrstor(s)), but since the new host don't support PKRU, a general-protection exception ocurs in xrstor(s) and that crashes the guest. This can be solved by making the guest's fpstate->user_xfeatures hold a copy of guest_supported_xcr0. This way, on 7 the only flags copied to userspace will be the ones compatible to guest requirements, and thus there will be no issue during migration. As a bonus, it will also fail if userspace tries to set fpu features (with the KVM_SET_XSAVE ioctl) that are not compatible to the guest configuration. Such features will never be returned by KVM_GET_XSAVE or KVM_GET_XSAVE2. Also, since kvm_vcpu_after_set_cpuid() now sets fpstate->user_xfeatures, there is not need to set it in kvm_check_cpuid(). So, change fpstate_realloc() so it does not touch fpstate->user_xfeatures if a non-NULL guest_fpu is passed, which is the case when kvm_check_cpuid() calls it. Signed-off-by: Leonardo Bras Message-Id: <20220217053028.96432-2-leobras@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/fpu/xstate.c | 5 ++++- arch/x86/kvm/cpuid.c | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 02b3ddaf4f75..7c7824ae7862 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1558,7 +1558,10 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; - newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + + if (!guest_fpu) + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 494d4d351859..71125291c578 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -296,6 +296,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); + vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0; + kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); From 988896bb61827345c6d074dd5f2af1b7b008193f Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 17 Feb 2022 02:30:30 -0300 Subject: [PATCH 197/773] x86/kvm/fpu: Remove kvm_vcpu_arch.guest_supported_xcr0 kvm_vcpu_arch currently contains the guest supported features in both guest_supported_xcr0 and guest_fpu.fpstate->user_xfeatures field. Currently both fields are set to the same value in kvm_vcpu_after_set_cpuid() and are not changed anywhere else after that. Since it's not good to keep duplicated data, remove guest_supported_xcr0. To keep the code more readable, introduce kvm_guest_supported_xcr() and kvm_guest_supported_xfd() to replace the previous usages of guest_supported_xcr0. Signed-off-by: Leonardo Bras Message-Id: <20220217053028.96432-3-leobras@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 5 +++-- arch/x86/kvm/x86.c | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6dcccb304775..ec9830d2aabf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -703,7 +703,6 @@ struct kvm_vcpu_arch { struct fpu_guest guest_fpu; u64 xcr0; - u64 guest_supported_xcr0; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 71125291c578..b8f8d268d058 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -282,6 +282,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + u64 guest_supported_xcr0; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best && apic) { @@ -293,10 +294,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - vcpu->arch.guest_supported_xcr0 = + guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); - vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0; + vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0; kvm_update_pv_runtime(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3471efe47e5f..20ad6fe391bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -984,6 +984,16 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); +static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_fpu.fpstate->user_xfeatures; +} + +static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) +{ + return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC; +} + static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; @@ -1003,7 +1013,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) * saving. However, xcr0 bit 0 is always set, even if the * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ - valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; + valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) return 1; @@ -3706,8 +3716,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) return 1; - if (data & ~(XFEATURE_MASK_USER_DYNAMIC & - vcpu->arch.guest_supported_xcr0)) + if (data & ~kvm_guest_supported_xfd(vcpu)) return 1; fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); @@ -3717,8 +3726,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) return 1; - if (data & ~(XFEATURE_MASK_USER_DYNAMIC & - vcpu->arch.guest_supported_xcr0)) + if (data & ~kvm_guest_supported_xfd(vcpu)) return 1; vcpu->arch.guest_fpu.xfd_err = data; From c16bdeb5a39ffa3f32b32f812831a2092d2a3061 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 11 Feb 2022 13:57:44 -0600 Subject: [PATCH 198/773] rlimit: Fix RLIMIT_NPROC enforcement failure caused by capability calls in set_user Solar Designer wrote: > I'm not aware of anyone actually running into this issue and reporting > it. The systems that I personally know use suexec along with rlimits > still run older/distro kernels, so would not yet be affected. > > So my mention was based on my understanding of how suexec works, and > code review. Specifically, Apache httpd has the setting RLimitNPROC, > which makes it set RLIMIT_NPROC: > > https://httpd.apache.org/docs/2.4/mod/core.html#rlimitnproc > > The above documentation for it includes: > > "This applies to processes forked from Apache httpd children servicing > requests, not the Apache httpd children themselves. This includes CGI > scripts and SSI exec commands, but not any processes forked from the > Apache httpd parent, such as piped logs." > > In code, there are: > > ./modules/generators/mod_cgid.c: ( (cgid_req.limits.limit_nproc_set) && ((rc = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, > ./modules/generators/mod_cgi.c: ((rc = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, > ./modules/filters/mod_ext_filter.c: rv = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, conf->limit_nproc); > > For example, in mod_cgi.c this is in run_cgi_child(). > > I think this means an httpd child sets RLIMIT_NPROC shortly before it > execs suexec, which is a SUID root program. suexec then switches to the > target user and execs the CGI script. > > Before 2863643fb8b9, the setuid() in suexec would set the flag, and the > target user's process count would be checked against RLIMIT_NPROC on > execve(). After 2863643fb8b9, the setuid() in suexec wouldn't set the > flag because setuid() is (naturally) called when the process is still > running as root (thus, has those limits bypass capabilities), and > accordingly execve() would not check the target user's process count > against RLIMIT_NPROC. In commit 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds") capable calls were added to set_user to make it more consistent with fork. Unfortunately because of call site differences those capable calls were checking the credentials of the user before set*id() instead of after set*id(). This breaks enforcement of RLIMIT_NPROC for applications that set the rlimit and then call set*id() while holding a full set of capabilities. The capabilities are only changed in the new credential in security_task_fix_setuid(). The code in apache suexec appears to follow this pattern. Commit 909cc4ae86f3 ("[PATCH] Fix two bugs with process limits (RLIMIT_NPROC)") where this check was added describes the targes of this capability check as: 2/ When a root-owned process (e.g. cgiwrap) sets up process limits and then calls setuid, the setuid should fail if the user would then be running more than rlim_cur[RLIMIT_NPROC] processes, but it doesn't. This patch adds an appropriate test. With this patch, and per-user process limit imposed in cgiwrap really works. So the original use case of this check also appears to match the broken pattern. Restore the enforcement of RLIMIT_NPROC by removing the bad capable checks added in set_user. This unfortunately restores the inconsistent state the code has been in for the last 11 years, but dealing with the inconsistencies looks like a larger problem. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20210907213042.GA22626@openwall.com/ Link: https://lkml.kernel.org/r/20220212221412.GA29214@openwall.com Link: https://lkml.kernel.org/r/20220216155832.680775-1-ebiederm@xmission.com Fixes: 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds") History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Reviewed-by: Solar Designer Signed-off-by: "Eric W. Biederman" --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index ecc4cf019242..8dd938a3d2bf 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -480,8 +480,7 @@ static int set_user(struct cred *new) * failure to the execve() stage. */ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) + new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; From 62e3f0afe246720f7646eb1b034a6897dac34405 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 13 Feb 2022 14:05:17 +0100 Subject: [PATCH 199/773] usb: dwc3: pci: Fix Bay Trail phy GPIO mappings When the Bay Trail phy GPIO mappings where added cs and reset were swapped, this did not cause any issues sofar, because sofar they were always driven high/low at the same time. Note the new mapping has been verified both in /sys/kernel/debug/gpio output on Android factory images on multiple devices, as well as in the schematics for some devices. Fixes: 5741022cbdf3 ("usb: dwc3: pci: Add GPIO lookup table on platforms without ACPI GPIO resources") Cc: stable Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220213130524.18748-3-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 18ab49b8e66e..06d0e88ec8af 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -86,8 +86,8 @@ static const struct acpi_gpio_mapping acpi_dwc3_byt_gpios[] = { static struct gpiod_lookup_table platform_bytcr_gpios = { .dev_id = "0000:00:16.0", .table = { - GPIO_LOOKUP("INT33FC:00", 54, "reset", GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("INT33FC:02", 14, "cs", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("INT33FC:00", 54, "cs", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("INT33FC:02", 14, "reset", GPIO_ACTIVE_HIGH), {} }, }; From 32fde84362c40961726a5c91f35ad37355ccc0c6 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Wed, 16 Feb 2022 09:12:15 +0100 Subject: [PATCH 200/773] usb: dwc2: drd: fix soft connect when gadget is unconfigured When the gadget driver hasn't been (yet) configured, and the cable is connected to a HOST, the SFTDISCON gets cleared unconditionally, so the HOST tries to enumerate it. At the host side, this can result in a stuck USB port or worse. When getting lucky, some dmesg can be observed at the host side: new high-speed USB device number ... device descriptor read/64, error -110 Fix it in drd, by checking the enabled flag before calling dwc2_hsotg_core_connect(). It will be called later, once configured, by the normal flow: - udc_bind_to_driver - usb_gadget_connect - dwc2_hsotg_pullup - dwc2_hsotg_core_connect Fixes: 17f934024e84 ("usb: dwc2: override PHY input signals with usb role switch support") Cc: stable Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/1644999135-13478-1-git-send-email-fabrice.gasnier@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/core.h | 2 ++ drivers/usb/dwc2/drd.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc2/core.h b/drivers/usb/dwc2/core.h index 8a63da3ab39d..88c337bf564f 100644 --- a/drivers/usb/dwc2/core.h +++ b/drivers/usb/dwc2/core.h @@ -1418,6 +1418,7 @@ void dwc2_hsotg_core_connect(struct dwc2_hsotg *hsotg); void dwc2_hsotg_disconnect(struct dwc2_hsotg *dwc2); int dwc2_hsotg_set_test_mode(struct dwc2_hsotg *hsotg, int testmode); #define dwc2_is_device_connected(hsotg) (hsotg->connected) +#define dwc2_is_device_enabled(hsotg) (hsotg->enabled) int dwc2_backup_device_registers(struct dwc2_hsotg *hsotg); int dwc2_restore_device_registers(struct dwc2_hsotg *hsotg, int remote_wakeup); int dwc2_gadget_enter_hibernation(struct dwc2_hsotg *hsotg); @@ -1454,6 +1455,7 @@ static inline int dwc2_hsotg_set_test_mode(struct dwc2_hsotg *hsotg, int testmode) { return 0; } #define dwc2_is_device_connected(hsotg) (0) +#define dwc2_is_device_enabled(hsotg) (0) static inline int dwc2_backup_device_registers(struct dwc2_hsotg *hsotg) { return 0; } static inline int dwc2_restore_device_registers(struct dwc2_hsotg *hsotg, diff --git a/drivers/usb/dwc2/drd.c b/drivers/usb/dwc2/drd.c index 1b39c4776369..d8d6493bc457 100644 --- a/drivers/usb/dwc2/drd.c +++ b/drivers/usb/dwc2/drd.c @@ -130,8 +130,10 @@ static int dwc2_drd_role_sw_set(struct usb_role_switch *sw, enum usb_role role) already = dwc2_ovr_avalid(hsotg, true); } else if (role == USB_ROLE_DEVICE) { already = dwc2_ovr_bvalid(hsotg, true); - /* This clear DCTL.SFTDISCON bit */ - dwc2_hsotg_core_connect(hsotg); + if (dwc2_is_device_enabled(hsotg)) { + /* This clear DCTL.SFTDISCON bit */ + dwc2_hsotg_core_connect(hsotg); + } } else { if (dwc2_is_device_mode(hsotg)) { if (!dwc2_ovr_bvalid(hsotg, false)) From 8f2f9c4d82f24f172ae439e5035fc1e0e4c229dd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Feb 2022 20:03:19 -0600 Subject: [PATCH 201/773] ucounts: Enforce RLIMIT_NPROC not RLIMIT_NPROC+1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michal Koutný wrote: > It was reported that v5.14 behaves differently when enforcing > RLIMIT_NPROC limit, namely, it allows one more task than previously. > This is consequence of the commit 21d1c5e386bc ("Reimplement > RLIMIT_NPROC on top of ucounts") that missed the sharpness of > equality in the forking path. This can be fixed either by fixing the test or by moving the increment to be before the test. Fix it my moving copy_creds which contains the increment before is_ucounts_overlimit. In the case of CLONE_NEWUSER the ucounts in the task_cred changes. The function is_ucounts_overlimit needs to use the final version of the ucounts for the new process. Which means moving the is_ucounts_overlimit test after copy_creds is necessary. Both the test in fork and the test in set_user were semantically changed when the code moved to ucounts. The change of the test in fork was bad because it was before the increment. The test in set_user was wrong and the change to ucounts fixed it. So this fix only restores the old behavior in one lcation not two. Link: https://lkml.kernel.org/r/20220204181144.24462-1-mkoutny@suse.com Link: https://lkml.kernel.org/r/20220216155832.680775-2-ebiederm@xmission.com Cc: stable@vger.kernel.org Reported-by: Michal Koutný Reviewed-by: Michal Koutný Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..17d8a8c85e3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2021,18 +2021,18 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; + retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) - goto bad_fork_free; + goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; - retval = copy_creds(p, clone_flags); - if (retval < 0) - goto bad_fork_free; - /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there From a55d07294f1e9b576093bdfa95422f8119941e83 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Feb 2022 16:22:20 -0600 Subject: [PATCH 202/773] ucounts: Base set_cred_ucounts changes on the real user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michal Koutný wrote: > Tasks are associated to multiple users at once. Historically and as per > setrlimit(2) RLIMIT_NPROC is enforce based on real user ID. > > The commit 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") > made the accounting structure "indexed" by euid and hence potentially > account tasks differently. > > The effective user ID may be different e.g. for setuid programs but > those are exec'd into already existing task (i.e. below limit), so > different accounting is moot. > > Some special setresuid(2) users may notice the difference, justifying > this fix. I looked at cred->ucount and it is only used for rlimit operations that were previously stored in cred->user. Making the fact cred->ucount can refer to a different user from cred->user a bug, affecting all uses of cred->ulimit not just RLIMIT_NPROC. Fix set_cred_ucounts to always use the real uid not the effective uid. Further simplify set_cred_ucounts by noticing that set_cred_ucounts somehow retained a draft version of the check to see if alloc_ucounts was needed that checks the new->user and new->user_ns against the current_real_cred(). Remove that draft version of the check. All that matters for setting the cred->ucounts are the user_ns and uid fields in the cred. Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220207121800.5079-4-mkoutny@suse.com Link: https://lkml.kernel.org/r/20220216155832.680775-3-ebiederm@xmission.com Reported-by: Michal Koutný Reviewed-by: Michal Koutný Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Signed-off-by: "Eric W. Biederman" --- kernel/cred.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/cred.c b/kernel/cred.c index 473d17c431f3..933155c96922 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { - struct task_struct *task = current; - const struct cred *old = task->real_cred; struct ucounts *new_ucounts, *old_ucounts = new->ucounts; - if (new->user == old->user && new->user_ns == old->user_ns) - return 0; - /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ - if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; - if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; From c923a8e7edb010da67424077cbf1a6f1396ebd2e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Feb 2022 09:40:25 -0600 Subject: [PATCH 203/773] ucounts: Move RLIMIT_NPROC handling after set_user During set*id() which cred->ucounts to charge the the current process to is not known until after set_cred_ucounts. So move the RLIMIT_NPROC checking into a new helper flag_nproc_exceeded and call flag_nproc_exceeded after set_cred_ucounts. This is very much an arbitrary subset of the places where we currently change the RLIMIT_NPROC accounting, designed to preserve the existing logic. Fixing the existing logic will be the subject of another series of changes. Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220216155832.680775-4-ebiederm@xmission.com Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Signed-off-by: "Eric W. Biederman" --- kernel/sys.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 8dd938a3d2bf..97dc9e5d6bf9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -472,6 +472,16 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + free_uid(new->user); + new->user = new_user; + return 0; +} + +static void flag_nproc_exceeded(struct cred *new) +{ + if (new->ucounts == current_ucounts()) + return; + /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming @@ -480,14 +490,10 @@ static int set_user(struct cred *new) * failure to the execve() stage. */ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER) + new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; - - free_uid(new->user); - new->user = new_user; - return 0; } /* @@ -562,6 +568,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -624,6 +631,7 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -703,6 +711,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: From 0cbae9e24fa7d6c6e9f828562f084da82217a0c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Feb 2022 18:09:41 -0600 Subject: [PATCH 204/773] ucounts: Handle wrapping in is_ucounts_overlimit While examining is_ucounts_overlimit and reading the various messages I realized that is_ucounts_overlimit fails to deal with counts that may have wrapped. Being wrapped should be a transitory state for counts and they should never be wrapped for long, but it can happen so handle it. Cc: stable@vger.kernel.org Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Link: https://lkml.kernel.org/r/20220216155832.680775-5-ebiederm@xmission.com Reviewed-by: Shuah Khan Signed-off-by: "Eric W. Biederman" --- kernel/ucount.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/ucount.c b/kernel/ucount.c index 65b597431c86..06ea04d44685 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -350,7 +350,8 @@ bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsign if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - if (get_ucounts_value(iter, type) > max) + long val = get_ucounts_value(iter, type); + if (val < 0 || val > max) return true; max = READ_ONCE(iter->ns->ucount_max[type]); } From d04ad245d67a3991dfea5e108e4c452c2ab39bac Mon Sep 17 00:00:00 2001 From: Prasad Kumpatla Date: Thu, 17 Feb 2022 14:20:07 +0530 Subject: [PATCH 205/773] regmap-irq: Update interrupt clear register for proper reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the existing logic where clear_ack is true (HW doesn’t support auto clear for ICR), interrupt clear register reset is not handled properly. Due to this only the first interrupts get processed properly and further interrupts are blocked due to not resetting interrupt clear register. Example for issue case where Invert_ack is false and clear_ack is true: Say Default ISR=0x00 & ICR=0x00 and ISR is triggered with 2 interrupts making ISR = 0x11. Step 1: Say ISR is set 0x11 (store status_buff = ISR). ISR needs to be cleared with the help of ICR once the Interrupt is processed. Step 2: Write ICR = 0x11 (status_buff), this will clear the ISR to 0x00. Step 3: Issue - In the existing code, ICR is written with ICR = ~(status_buff) i.e ICR = 0xEE -> This will block all the interrupts from raising except for interrupts 0 and 4. So expectation here is to reset ICR, which will unblock all the interrupts. if (chip->clear_ack) { if (chip->ack_invert && !ret) ........ else if (!ret) ret = regmap_write(map, reg, ~data->status_buf[i]); So writing 0 and 0xff (when ack_invert is true) should have no effect, other than clearing the ACKs just set. Fixes: 3a6f0fb7b8eb ("regmap: irq: Add support to clear ack registers") Signed-off-by: Prasad Kumpatla Reviewed-by: Charles Keepax Tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/20220217085007.30218-1-quic_pkumpatl@quicinc.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index d2656581a608..4a446259a184 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -189,11 +189,9 @@ static void regmap_irq_sync_unlock(struct irq_data *data) ret = regmap_write(map, reg, d->mask_buf[i]); if (d->chip->clear_ack) { if (d->chip->ack_invert && !ret) - ret = regmap_write(map, reg, - d->mask_buf[i]); + ret = regmap_write(map, reg, UINT_MAX); else if (!ret) - ret = regmap_write(map, reg, - ~d->mask_buf[i]); + ret = regmap_write(map, reg, 0); } if (ret != 0) dev_err(d->map->dev, "Failed to ack 0x%x: %d\n", @@ -556,11 +554,9 @@ static irqreturn_t regmap_irq_thread(int irq, void *d) data->status_buf[i]); if (chip->clear_ack) { if (chip->ack_invert && !ret) - ret = regmap_write(map, reg, - data->status_buf[i]); + ret = regmap_write(map, reg, UINT_MAX); else if (!ret) - ret = regmap_write(map, reg, - ~data->status_buf[i]); + ret = regmap_write(map, reg, 0); } if (ret != 0) dev_err(map->dev, "Failed to ack 0x%x: %d\n", @@ -817,13 +813,9 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, d->status_buf[i] & d->mask_buf[i]); if (chip->clear_ack) { if (chip->ack_invert && !ret) - ret = regmap_write(map, reg, - (d->status_buf[i] & - d->mask_buf[i])); + ret = regmap_write(map, reg, UINT_MAX); else if (!ret) - ret = regmap_write(map, reg, - ~(d->status_buf[i] & - d->mask_buf[i])); + ret = regmap_write(map, reg, 0); } if (ret != 0) { dev_err(map->dev, "Failed to ack 0x%x: %d\n", From 9382df0a98aad5bbcd4d634790305a1d786ad224 Mon Sep 17 00:00:00 2001 From: Jon Lin Date: Wed, 16 Feb 2022 09:40:23 +0800 Subject: [PATCH 206/773] spi: rockchip: Fix error in getting num-cs property Get num-cs u32 from dts of_node property rather than u16. Signed-off-by: Jon Lin Link: https://lore.kernel.org/r/20220216014028.8123-2-jon.lin@rock-chips.com Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 553b6b9d0222..4f65ba3dd19c 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -654,7 +654,7 @@ static int rockchip_spi_probe(struct platform_device *pdev) struct spi_controller *ctlr; struct resource *mem; struct device_node *np = pdev->dev.of_node; - u32 rsd_nsecs; + u32 rsd_nsecs, num_cs; bool slave_mode; slave_mode = of_property_read_bool(np, "spi-slave"); @@ -764,8 +764,9 @@ static int rockchip_spi_probe(struct platform_device *pdev) * rk spi0 has two native cs, spi1..5 one cs only * if num-cs is missing in the dts, default to 1 */ - if (of_property_read_u16(np, "num-cs", &ctlr->num_chipselect)) - ctlr->num_chipselect = 1; + if (of_property_read_u32(np, "num-cs", &num_cs)) + num_cs = 1; + ctlr->num_chipselect = num_cs; ctlr->use_gpio_descriptors = true; } ctlr->dev.of_node = pdev->dev.of_node; From 80808768e41324d2e23de89972b5406c1020e6e4 Mon Sep 17 00:00:00 2001 From: Jon Lin Date: Wed, 16 Feb 2022 09:40:24 +0800 Subject: [PATCH 207/773] spi: rockchip: terminate dma transmission when slave abort After slave abort, all DMA should be stopped, or it will affect the next transmission and maybe abort again. Signed-off-by: Jon Lin Link: https://lore.kernel.org/r/20220216014028.8123-3-jon.lin@rock-chips.com Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 4f65ba3dd19c..c6a1bb09be05 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -585,6 +585,12 @@ static int rockchip_spi_slave_abort(struct spi_controller *ctlr) { struct rockchip_spi *rs = spi_controller_get_devdata(ctlr); + if (atomic_read(&rs->state) & RXDMA) + dmaengine_terminate_sync(ctlr->dma_rx); + if (atomic_read(&rs->state) & TXDMA) + dmaengine_terminate_sync(ctlr->dma_tx); + atomic_set(&rs->state, 0); + spi_enable_chip(rs, false); rs->slave_abort = true; spi_finalize_current_transfer(ctlr); From 32f57cb1b2c8d6f20aefec7052b1bfeb7e3b69d4 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 17 Feb 2022 08:59:32 -0500 Subject: [PATCH 208/773] IB/qib: Fix duplicate sysfs directory name The qib driver load has been failing with the following message: sysfs: cannot create duplicate filename '/devices/pci0000:80/0000:80:02.0/0000:81:00.0/infiniband/qib0/ports/1/linkcontrol' The patch below has two "linkcontrol" names causing the duplication. Fix by using the correct "diag_counters" name on the second instance. Fixes: 4a7aaf88c89f ("RDMA/qib: Use attributes for the port sysfs") Link: https://lore.kernel.org/r/1645106372-23004-1-git-send-email-mike.marciniszyn@cornelisnetworks.com Cc: Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 0a3b28142c05..41c272980f91 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -541,7 +541,7 @@ static struct attribute *port_diagc_attributes[] = { }; static const struct attribute_group port_diagc_group = { - .name = "linkcontrol", + .name = "diag_counters", .attrs = port_diagc_attributes, }; From e40945ab7c7f966d0c37b7bd7b0596497dfe228d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 27 Jan 2022 12:14:52 +0100 Subject: [PATCH 209/773] drm/vc4: hdmi: Unregister codec device on unbind On bind we will register the HDMI codec device but we don't unregister it on unbind, leading to a device leakage. Unregister our device at unbind. Signed-off-by: Maxime Ripard Reviewed-by: Javier Martinez Canillas Link: https://patchwork.freedesktop.org/patch/msgid/20220127111452.222002-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_hdmi.c | 8 ++++++++ drivers/gpu/drm/vc4/vc4_hdmi.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index b30500405fa7..3a1626f261e5 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -1749,6 +1749,7 @@ static int vc4_hdmi_audio_init(struct vc4_hdmi *vc4_hdmi) dev_err(dev, "Couldn't register the HDMI codec: %ld\n", PTR_ERR(codec_pdev)); return PTR_ERR(codec_pdev); } + vc4_hdmi->audio.codec_pdev = codec_pdev; dai_link->cpus = &vc4_hdmi->audio.cpu; dai_link->codecs = &vc4_hdmi->audio.codec; @@ -1788,6 +1789,12 @@ static int vc4_hdmi_audio_init(struct vc4_hdmi *vc4_hdmi) } +static void vc4_hdmi_audio_exit(struct vc4_hdmi *vc4_hdmi) +{ + platform_device_unregister(vc4_hdmi->audio.codec_pdev); + vc4_hdmi->audio.codec_pdev = NULL; +} + static irqreturn_t vc4_hdmi_hpd_irq_thread(int irq, void *priv) { struct vc4_hdmi *vc4_hdmi = priv; @@ -2660,6 +2667,7 @@ static void vc4_hdmi_unbind(struct device *dev, struct device *master, kfree(vc4_hdmi->hdmi_regset.regs); kfree(vc4_hdmi->hd_regset.regs); + vc4_hdmi_audio_exit(vc4_hdmi); vc4_hdmi_cec_exit(vc4_hdmi); vc4_hdmi_hotplug_exit(vc4_hdmi); vc4_hdmi_connector_destroy(&vc4_hdmi->connector); diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.h b/drivers/gpu/drm/vc4/vc4_hdmi.h index 31b77a94c526..6ffdd4ec5fb6 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.h +++ b/drivers/gpu/drm/vc4/vc4_hdmi.h @@ -116,6 +116,7 @@ struct vc4_hdmi_audio { struct snd_soc_dai_link_component platform; struct snd_dmaengine_dai_dma_data dma_data; struct hdmi_audio_infoframe infoframe; + struct platform_device *codec_pdev; bool streaming; }; From 6764eb690e77ecded48587d6d4e346ba2e196546 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 3 Feb 2022 11:20:03 +0100 Subject: [PATCH 210/773] drm/vc4: crtc: Fix runtime_pm reference counting At boot on the BCM2711, if the HDMI controllers are running, the CRTC driver will disable itself and its associated HDMI controller to work around a hardware bug that would leave some pixels stuck in a FIFO. In order to avoid that issue, we need to run some operations in lockstep between the CRTC and HDMI controller, and we need to make sure the HDMI controller will be powered properly. However, since we haven't enabled it through KMS, the runtime_pm state is off at this point so we need to make sure the device is powered through pm_runtime_resume_and_get, and once the operations are complete, we call pm_runtime_put. However, the HDMI controller will do that itself in its post_crtc_powerdown, which means we'll end up calling pm_runtime_put for a single pm_runtime_get, throwing the reference counting off. Let's remove the pm_runtime_put call in the CRTC code in order to have the proper counting. Fixes: bca10db67bda ("drm/vc4: crtc: Make sure the HDMI controller is powered when disabling") Signed-off-by: Maxime Ripard Reviewed-by: Javier Martinez Canillas Link: https://patchwork.freedesktop.org/patch/msgid/20220203102003.1114673-1-maxime@cerno.tech --- drivers/gpu/drm/vc4/vc4_crtc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index e6cc47470e03..783890e8d43a 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -525,9 +525,11 @@ int vc4_crtc_disable_at_boot(struct drm_crtc *crtc) if (ret) return ret; - ret = pm_runtime_put(&vc4_hdmi->pdev->dev); - if (ret) - return ret; + /* + * post_crtc_powerdown will have called pm_runtime_put, so we + * don't need it here otherwise we'll get the reference counting + * wrong. + */ return 0; } From e5733d8c89c3b57c8fcd40b8acf508388fabaa42 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 2 Feb 2022 11:41:12 -0800 Subject: [PATCH 211/773] x86/sgx: Fix missing poison handling in reclaimer The SGX reclaimer code lacks page poison handling in its main free path. This can lead to avoidable machine checks if a poisoned page is freed and reallocated instead of being isolated. A troublesome scenario is: 1. Machine check (#MC) occurs (asynchronous, !MF_ACTION_REQUIRED) 2. arch_memory_failure() is eventually called 3. (SGX) page->poison set to 1 4. Page is reclaimed 5. Page added to normal free lists by sgx_reclaim_pages() ^ This is the bug (poison pages should be isolated on the sgx_poison_page_list instead) 6. Page is reallocated by some innocent enclave, a second (synchronous) in-kernel #MC is induced, probably during EADD instruction. ^ This is the fallout from the bug (6) is unfortunate and can be avoided by replacing the open coded enclave page freeing code in the reclaimer with sgx_free_epc_page() to obtain support for poison page handling that includes placing the poisoned page on the correct list. Fixes: d6d261bded8a ("x86/sgx: Add new sgx_epc_page flag bit to mark free pages") Fixes: 992801ae9243 ("x86/sgx: Initial poison handling for dirty and free pages") Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/dcc95eb2aaefb042527ac50d0a50738c7c160dac.1643830353.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/main.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 4b41efc9e367..8e4bc6453d26 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -344,10 +344,8 @@ static void sgx_reclaim_pages(void) { struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; struct sgx_backing backing[SGX_NR_TO_SCAN]; - struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; - struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -418,13 +416,7 @@ skip: kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; - section = &sgx_epc_sections[epc_page->section]; - node = section->node; - - spin_lock(&node->lock); - list_add_tail(&epc_page->list, &node->free_page_list); - spin_unlock(&node->lock); - atomic_long_inc(&sgx_nr_free_pages); + sgx_free_epc_page(epc_page); } } From 75134f16e7dd0007aa474b281935c5f42e79f2c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 10:19:02 -0800 Subject: [PATCH 212/773] bpf: Add schedule points in batch ops syzbot reported various soft lockups caused by bpf batch operations. INFO: task kworker/1:1:27 blocked for more than 140 seconds. INFO: task hung in rcu_barrier Nothing prevents batch ops to process huge amount of data, we need to add schedule points in them. Note that maybe_wait_bpf_programs(map) calls from generic_map_delete_batch() can be factorized by moving the call after the loop. This will be done later in -next tree once we get this fix merged, unless there is strong opinion doing this optimization sooner. Fixes: aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Acked-by: Brian Vazquez Link: https://lore.kernel.org/bpf/20220217181902.808742-1-eric.dumazet@gmail.com --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fa4505f9b611..ca70fe6fba38 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1355,6 +1355,7 @@ int generic_map_delete_batch(struct bpf_map *map, maybe_wait_bpf_programs(map); if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; @@ -1412,6 +1413,7 @@ int generic_map_update_batch(struct bpf_map *map, if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) @@ -1509,6 +1511,7 @@ int generic_map_lookup_batch(struct bpf_map *map, swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; + cond_resched(); } if (err == -EFAULT) From 132507ed04ce0c5559be04dd378fec4f3bbc00e8 Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Fri, 28 Jan 2022 09:53:21 +0530 Subject: [PATCH 213/773] of/fdt: move elfcorehdr reservation early for crash dump kernel elfcorehdr_addr is fixed address passed to Second kernel which may be conflicted with potential reserved memory in Second kernel,so fdt_reserve_elfcorehdr() ahead of fdt_init_reserved_mem() can relieve this situation. Signed-off-by: Nikhil Gupta Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220128042321.15228-1-nikhil.gupta@nxp.com --- drivers/of/fdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index ad85ff6474ff..ec315b060cd5 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -648,8 +648,8 @@ void __init early_init_fdt_scan_reserved_mem(void) } fdt_scan_reserved_mem(); - fdt_init_reserved_mem(); fdt_reserve_elfcorehdr(); + fdt_init_reserved_mem(); } /** From 3494894afff4ad11f25d8342cc99699be496d082 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 18 Feb 2022 00:24:08 +0100 Subject: [PATCH 214/773] clk: qcom: gcc-msm8994: Remove NoC clocks Just like in commit 05cf3ec00d46 ("clk: qcom: gcc-msm8996: Drop (again) gcc_aggre1_pnoc_ahb_clk") adding NoC clocks turned out to be a huge mistake, as they cause a lot of issues at little benefit (basically letting Linux know about their children's frequencies), especially when mishandled or misconfigured. Adding these ones broke SDCC approx 99 out of 100 times, but that somehow went unnoticed. To prevent further issues like this one, remove them. This commit is effectively a revert of 74a33fac3aab ("clk: qcom: gcc-msm8994: Add missing NoC clocks") with ABI preservation. Fixes: 74a33fac3aab ("clk: qcom: gcc-msm8994: Add missing NoC clocks") Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20220217232408.78932-1-konrad.dybcio@somainline.org Signed-off-by: Stephen Boyd --- drivers/clk/qcom/gcc-msm8994.c | 106 +++------------------------------ 1 file changed, 9 insertions(+), 97 deletions(-) diff --git a/drivers/clk/qcom/gcc-msm8994.c b/drivers/clk/qcom/gcc-msm8994.c index 71aa630fa4bd..f09499999eb3 100644 --- a/drivers/clk/qcom/gcc-msm8994.c +++ b/drivers/clk/qcom/gcc-msm8994.c @@ -108,42 +108,6 @@ static const struct clk_parent_data gcc_xo_gpll0_gpll4[] = { { .hw = &gpll4.clkr.hw }, }; -static struct clk_rcg2 system_noc_clk_src = { - .cmd_rcgr = 0x0120, - .hid_width = 5, - .parent_map = gcc_xo_gpll0_map, - .clkr.hw.init = &(struct clk_init_data){ - .name = "system_noc_clk_src", - .parent_data = gcc_xo_gpll0, - .num_parents = ARRAY_SIZE(gcc_xo_gpll0), - .ops = &clk_rcg2_ops, - }, -}; - -static struct clk_rcg2 config_noc_clk_src = { - .cmd_rcgr = 0x0150, - .hid_width = 5, - .parent_map = gcc_xo_gpll0_map, - .clkr.hw.init = &(struct clk_init_data){ - .name = "config_noc_clk_src", - .parent_data = gcc_xo_gpll0, - .num_parents = ARRAY_SIZE(gcc_xo_gpll0), - .ops = &clk_rcg2_ops, - }, -}; - -static struct clk_rcg2 periph_noc_clk_src = { - .cmd_rcgr = 0x0190, - .hid_width = 5, - .parent_map = gcc_xo_gpll0_map, - .clkr.hw.init = &(struct clk_init_data){ - .name = "periph_noc_clk_src", - .parent_data = gcc_xo_gpll0, - .num_parents = ARRAY_SIZE(gcc_xo_gpll0), - .ops = &clk_rcg2_ops, - }, -}; - static struct freq_tbl ftbl_ufs_axi_clk_src[] = { F(50000000, P_GPLL0, 12, 0, 0), F(100000000, P_GPLL0, 6, 0, 0), @@ -1150,8 +1114,6 @@ static struct clk_branch gcc_blsp1_ahb_clk = { .enable_mask = BIT(17), .hw.init = &(struct clk_init_data){ .name = "gcc_blsp1_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -1435,8 +1397,6 @@ static struct clk_branch gcc_blsp2_ahb_clk = { .enable_mask = BIT(15), .hw.init = &(struct clk_init_data){ .name = "gcc_blsp2_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -1764,8 +1724,6 @@ static struct clk_branch gcc_lpass_q6_axi_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_lpass_q6_axi_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -1778,8 +1736,6 @@ static struct clk_branch gcc_mss_q6_bimc_axi_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_mss_q6_bimc_axi_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -1807,9 +1763,6 @@ static struct clk_branch gcc_pcie_0_cfg_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pcie_0_cfg_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &config_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -1822,9 +1775,6 @@ static struct clk_branch gcc_pcie_0_mstr_axi_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pcie_0_mstr_axi_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -1854,9 +1804,6 @@ static struct clk_branch gcc_pcie_0_slv_axi_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pcie_0_slv_axi_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -1884,9 +1831,6 @@ static struct clk_branch gcc_pcie_1_cfg_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pcie_1_cfg_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &config_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -1899,9 +1843,6 @@ static struct clk_branch gcc_pcie_1_mstr_axi_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pcie_1_mstr_axi_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -1930,9 +1871,6 @@ static struct clk_branch gcc_pcie_1_slv_axi_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pcie_1_slv_axi_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -1960,8 +1898,6 @@ static struct clk_branch gcc_pdm_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_pdm_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -1989,9 +1925,6 @@ static struct clk_branch gcc_sdcc1_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_sdcc1_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -2004,9 +1937,6 @@ static struct clk_branch gcc_sdcc2_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_sdcc2_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -2034,9 +1964,6 @@ static struct clk_branch gcc_sdcc3_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_sdcc3_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -2064,9 +1991,6 @@ static struct clk_branch gcc_sdcc4_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_sdcc4_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, - .flags = CLK_SET_RATE_PARENT, .ops = &clk_branch2_ops, }, }, @@ -2124,8 +2048,6 @@ static struct clk_branch gcc_tsif_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_tsif_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2153,8 +2075,6 @@ static struct clk_branch gcc_ufs_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_ufs_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &config_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2198,8 +2118,6 @@ static struct clk_branch gcc_ufs_rx_symbol_0_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_ufs_rx_symbol_0_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2213,8 +2131,6 @@ static struct clk_branch gcc_ufs_rx_symbol_1_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_ufs_rx_symbol_1_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2243,8 +2159,6 @@ static struct clk_branch gcc_ufs_tx_symbol_0_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_ufs_tx_symbol_0_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2258,8 +2172,6 @@ static struct clk_branch gcc_ufs_tx_symbol_1_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_ufs_tx_symbol_1_clk", - .parent_hws = (const struct clk_hw *[]){ &system_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2364,8 +2276,6 @@ static struct clk_branch gcc_usb_hs_ahb_clk = { .enable_mask = BIT(0), .hw.init = &(struct clk_init_data){ .name = "gcc_usb_hs_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2488,8 +2398,6 @@ static struct clk_branch gcc_boot_rom_ahb_clk = { .enable_mask = BIT(10), .hw.init = &(struct clk_init_data){ .name = "gcc_boot_rom_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &config_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2503,8 +2411,6 @@ static struct clk_branch gcc_prng_ahb_clk = { .enable_mask = BIT(13), .hw.init = &(struct clk_init_data){ .name = "gcc_prng_ahb_clk", - .parent_hws = (const struct clk_hw *[]){ &periph_noc_clk_src.clkr.hw }, - .num_parents = 1, .ops = &clk_branch2_ops, }, }, @@ -2547,9 +2453,6 @@ static struct clk_regmap *gcc_msm8994_clocks[] = { [GPLL0] = &gpll0.clkr, [GPLL4_EARLY] = &gpll4_early.clkr, [GPLL4] = &gpll4.clkr, - [CONFIG_NOC_CLK_SRC] = &config_noc_clk_src.clkr, - [PERIPH_NOC_CLK_SRC] = &periph_noc_clk_src.clkr, - [SYSTEM_NOC_CLK_SRC] = &system_noc_clk_src.clkr, [UFS_AXI_CLK_SRC] = &ufs_axi_clk_src.clkr, [USB30_MASTER_CLK_SRC] = &usb30_master_clk_src.clkr, [BLSP1_QUP1_I2C_APPS_CLK_SRC] = &blsp1_qup1_i2c_apps_clk_src.clkr, @@ -2696,6 +2599,15 @@ static struct clk_regmap *gcc_msm8994_clocks[] = { [USB_SS_PHY_LDO] = &usb_ss_phy_ldo.clkr, [GCC_BOOT_ROM_AHB_CLK] = &gcc_boot_rom_ahb_clk.clkr, [GCC_PRNG_AHB_CLK] = &gcc_prng_ahb_clk.clkr, + + /* + * The following clocks should NOT be managed by this driver, but they once were + * mistakengly added. Now they are only here to indicate that they are not defined + * on purpose, even though the names will stay in the header file (for ABI sanity). + */ + [CONFIG_NOC_CLK_SRC] = NULL, + [PERIPH_NOC_CLK_SRC] = NULL, + [SYSTEM_NOC_CLK_SRC] = NULL, }; static struct gdsc *gcc_msm8994_gdscs[] = { From 2f0754f27a230fee6e6d753f07585cee03bedfe3 Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sat, 5 Feb 2022 20:18:49 +0300 Subject: [PATCH 215/773] clk: jz4725b: fix mmc0 clock gating The mmc0 clock gate bit was mistakenly assigned to "i2s" clock. You can find that the same bit is assigned to "mmc0" too. It leads to mmc0 hang for a long time after any sound activity also it prevented PM_SLEEP to work properly. I guess it was introduced by copy-paste from jz4740 driver where it is really controls I2S clock gate. Fixes: 226dfa4726eb ("clk: Add Ingenic jz4725b CGU driver") Signed-off-by: Siarhei Volkau Tested-by: Siarhei Volkau Reviewed-by: Paul Cercueil Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220205171849.687805-2-lis8215@gmail.com Signed-off-by: Stephen Boyd --- drivers/clk/ingenic/jz4725b-cgu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/clk/ingenic/jz4725b-cgu.c b/drivers/clk/ingenic/jz4725b-cgu.c index 744d136b721b..15d61793f53b 100644 --- a/drivers/clk/ingenic/jz4725b-cgu.c +++ b/drivers/clk/ingenic/jz4725b-cgu.c @@ -139,11 +139,10 @@ static const struct ingenic_cgu_clk_info jz4725b_cgu_clocks[] = { }, [JZ4725B_CLK_I2S] = { - "i2s", CGU_CLK_MUX | CGU_CLK_DIV | CGU_CLK_GATE, + "i2s", CGU_CLK_MUX | CGU_CLK_DIV, .parents = { JZ4725B_CLK_EXT, JZ4725B_CLK_PLL_HALF, -1, -1 }, .mux = { CGU_REG_CPCCR, 31, 1 }, .div = { CGU_REG_I2SCDR, 0, 1, 9, -1, -1, -1 }, - .gate = { CGU_REG_CLKGR, 6 }, }, [JZ4725B_CLK_SPI] = { From 64324ef337d0caa5798fa8fa3f6bbfbd3245868a Mon Sep 17 00:00:00 2001 From: Anthoine Bourgeois Date: Tue, 25 Jan 2022 20:11:38 +0100 Subject: [PATCH 216/773] ARM: dts: switch timer config to common devkit8000 devicetree This patch allow lcd43 and lcd70 flavors to benefit from timer evolution. Fixes: e428e250fde6 ("ARM: dts: Configure system timers for omap3") Signed-off-by: Anthoine Bourgeois Signed-off-by: Tony Lindgren --- .../arm/boot/dts/omap3-devkit8000-common.dtsi | 33 +++++++++++++++++++ arch/arm/boot/dts/omap3-devkit8000.dts | 33 ------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi index 5e55198e4576..f5197bb31ed8 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi +++ b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi @@ -158,6 +158,39 @@ status = "disabled"; }; +/* Unusable as clocksource because of unreliable oscillator */ +&counter32k { + status = "disabled"; +}; + +/* Unusable as clockevent because if unreliable oscillator, allow to idle */ +&timer1_target { + /delete-property/ti,no-reset-on-init; + /delete-property/ti,no-idle; + timer@0 { + /delete-property/ti,timer-alwon; + }; +}; + +/* Preferred always-on timer for clocksource */ +&timer12_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + /* Always clocked by secure_32k_fck */ + }; +}; + +/* Preferred timer for clockevent */ +&timer2_target { + ti,no-reset-on-init; + ti,no-idle; + timer@0 { + assigned-clocks = <&gpt2_fck>; + assigned-clock-parents = <&sys_ck>; + }; +}; + &twl_gpio { ti,use-leds; /* diff --git a/arch/arm/boot/dts/omap3-devkit8000.dts b/arch/arm/boot/dts/omap3-devkit8000.dts index c2995a280729..162d0726b008 100644 --- a/arch/arm/boot/dts/omap3-devkit8000.dts +++ b/arch/arm/boot/dts/omap3-devkit8000.dts @@ -14,36 +14,3 @@ display2 = &tv0; }; }; - -/* Unusable as clocksource because of unreliable oscillator */ -&counter32k { - status = "disabled"; -}; - -/* Unusable as clockevent because if unreliable oscillator, allow to idle */ -&timer1_target { - /delete-property/ti,no-reset-on-init; - /delete-property/ti,no-idle; - timer@0 { - /delete-property/ti,timer-alwon; - }; -}; - -/* Preferred always-on timer for clocksource */ -&timer12_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - /* Always clocked by secure_32k_fck */ - }; -}; - -/* Preferred timer for clockevent */ -&timer2_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - assigned-clocks = <&gpt2_fck>; - assigned-clock-parents = <&sys_ck>; - }; -}; From 8840f5460a23759403f1f2860429dcbcc2f04a65 Mon Sep 17 00:00:00 2001 From: Anthoine Bourgeois Date: Tue, 25 Jan 2022 20:11:39 +0100 Subject: [PATCH 217/773] ARM: dts: Use 32KiHz oscillator on devkit8000 Devkit8000 board seems to always used 32k_counter as clocksource. Restore this behavior. If clocksource is back to 32k_counter, timer12 is now the clockevent source (as before) and timer2 is not longer needed here. This commit fixes the same issue observed with commit 23885389dbbb ("ARM: dts: Fix timer regression for beagleboard revision c") when sleep is blocked until hitting keys over serial console. Fixes: aba1ad05da08 ("clocksource/drivers/timer-ti-dm: Add clockevent and clocksource support") Fixes: e428e250fde6 ("ARM: dts: Configure system timers for omap3") Signed-off-by: Anthoine Bourgeois Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-devkit8000-common.dtsi | 17 +---------------- drivers/clocksource/timer-ti-dm-systimer.c | 3 +-- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi index f5197bb31ed8..54cd37336be7 100644 --- a/arch/arm/boot/dts/omap3-devkit8000-common.dtsi +++ b/arch/arm/boot/dts/omap3-devkit8000-common.dtsi @@ -158,11 +158,6 @@ status = "disabled"; }; -/* Unusable as clocksource because of unreliable oscillator */ -&counter32k { - status = "disabled"; -}; - /* Unusable as clockevent because if unreliable oscillator, allow to idle */ &timer1_target { /delete-property/ti,no-reset-on-init; @@ -172,7 +167,7 @@ }; }; -/* Preferred always-on timer for clocksource */ +/* Preferred timer for clockevent */ &timer12_target { ti,no-reset-on-init; ti,no-idle; @@ -181,16 +176,6 @@ }; }; -/* Preferred timer for clockevent */ -&timer2_target { - ti,no-reset-on-init; - ti,no-idle; - timer@0 { - assigned-clocks = <&gpt2_fck>; - assigned-clock-parents = <&sys_ck>; - }; -}; - &twl_gpio { ti,use-leds; /* diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c index 5c40ca1d4740..1fccb457fcc5 100644 --- a/drivers/clocksource/timer-ti-dm-systimer.c +++ b/drivers/clocksource/timer-ti-dm-systimer.c @@ -241,8 +241,7 @@ static void __init dmtimer_systimer_assign_alwon(void) bool quirk_unreliable_oscillator = false; /* Quirk unreliable 32 KiHz oscillator with incomplete dts */ - if (of_machine_is_compatible("ti,omap3-beagle-ab4") || - of_machine_is_compatible("timll,omap3-devkit8000")) { + if (of_machine_is_compatible("ti,omap3-beagle-ab4")) { quirk_unreliable_oscillator = true; counter_32k = -ENODEV; } From ba1f77c546966c12d86220d078e5838dcaeab348 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Fri, 18 Feb 2022 00:41:00 -0300 Subject: [PATCH 218/773] x86/kvm: Fix compilation warning in non-x86_64 builds On non-x86_64 builds, helpers gtod_is_based_on_tsc() and kvm_guest_supported_xfd() are defined but never used. Because these are static inline but are in a .c file, some compilers do warn for them with -Wunused-function, which becomes an error if -Werror is present. Add #ifdef so they are only defined in x86_64 builds. Reported-by: kernel test robot Signed-off-by: Leonardo Bras Message-Id: <20220218034100.115702-1-leobras@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 20ad6fe391bf..82a9dcd8c67f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -989,10 +989,12 @@ static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) return vcpu->arch.guest_fpu.fpstate->user_xfeatures; } +#ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) { return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC; } +#endif static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -2361,10 +2363,12 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } +#ifdef CONFIG_X86_64 static inline int gtod_is_based_on_tsc(int mode) { return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } +#endif static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { From ec756e40e271866f951d77c5e923d8deb6002b15 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 18 Feb 2022 00:10:38 -0800 Subject: [PATCH 219/773] x86/kvm: Don't use pv tlb/ipi/sched_yield if on 1 vCPU Inspired by commit 3553ae5690a (x86/kvm: Don't use pvqspinlock code if only 1 vCPU), on a VM with only 1 vCPU, there is no need to enable pv tlb/ipi/sched_yield and we can save the memory for __pv_cpu_mask. Signed-off-by: Wanpeng Li Message-Id: <1645171838-2855-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a438217cbfac..f734e3b0cfec 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -462,19 +462,22 @@ static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + (num_possible_cpus() != 1)); } static bool pv_ipi_supported(void) { - return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); + return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && + (num_possible_cpus() != 1)); } static bool pv_sched_yield_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + (num_possible_cpus() != 1)); } #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) From 1aae05754fca861ce17f1f17fddcfbf8c0fc25b6 Mon Sep 17 00:00:00 2001 From: Rudi Heitbaum Date: Wed, 16 Feb 2022 21:22:28 +0000 Subject: [PATCH 220/773] drm/imx/dcss: i.MX8MQ DCSS select DRM_GEM_CMA_HELPER Without DRM_GEM_CMA_HELPER i.MX8MQ DCSS won't build. This needs to be there. Signed-off-by: Rudi Heitbaum Reviewed-by: Laurentiu Palcu Signed-off-by: Laurentiu Palcu Link: https://patchwork.freedesktop.org/patch/msgid/20220216212228.1217831-1-rudi@heitbaum.com --- drivers/gpu/drm/imx/dcss/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/imx/dcss/Kconfig b/drivers/gpu/drm/imx/dcss/Kconfig index 7374f1952762..5c2b2277afbf 100644 --- a/drivers/gpu/drm/imx/dcss/Kconfig +++ b/drivers/gpu/drm/imx/dcss/Kconfig @@ -2,6 +2,7 @@ config DRM_IMX_DCSS tristate "i.MX8MQ DCSS" select IMX_IRQSTEER select DRM_KMS_HELPER + select DRM_GEM_CMA_HELPER select VIDEOMODE_HELPERS depends on DRM && ARCH_MXC && ARM64 help From 834cea3a252ed4847db076a769ad9efe06afe2d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 15 Feb 2022 08:27:35 +0100 Subject: [PATCH 221/773] i2c: brcmstb: fix support for DSL and CM variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DSL and CM (Cable Modem) support 8 B max transfer size and have a custom DT binding for that reason. This driver was checking for a wrong "compatible" however which resulted in an incorrect setup. Fixes: e2e5a2c61837 ("i2c: brcmstb: Adding support for CM and DSL SoCs") Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-brcmstb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-brcmstb.c b/drivers/i2c/busses/i2c-brcmstb.c index 490ee3962645..b00f35c0b066 100644 --- a/drivers/i2c/busses/i2c-brcmstb.c +++ b/drivers/i2c/busses/i2c-brcmstb.c @@ -673,7 +673,7 @@ static int brcmstb_i2c_probe(struct platform_device *pdev) /* set the data in/out register size for compatible SoCs */ if (of_device_is_compatible(dev->device->of_node, - "brcmstb,brcmper-i2c")) + "brcm,brcmper-i2c")) dev->data_regsz = sizeof(u8); else dev->data_regsz = sizeof(u32); From 44cad52cc14ae10062f142ec16ede489bccf4469 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 14 Feb 2022 13:05:49 +0100 Subject: [PATCH 222/773] x86/ptrace: Fix xfpregs_set()'s incorrect xmm clearing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xfpregs_set() handles 32-bit REGSET_XFP and 64-bit REGSET_FP. The actual code treats these regsets as modern FX state (i.e. the beginning part of XSTATE). The declarations of the regsets thought they were the legacy i387 format. The code thought they were the 32-bit (no xmm8..15) variant of XSTATE and, for good measure, made the high bits disappear by zeroing the wrong part of the buffer. The latter broke ptrace, and everything else confused anyone trying to understand the code. In particular, the nonsense definitions of the regsets confused me when I wrote this code. Clean this all up. Change the declarations to match reality (which shouldn't change the generated code, let alone the ABI) and fix xfpregs_set() to clear the correct bits and to only do so for 32-bit callers. Fixes: 6164331d15f7 ("x86/fpu: Rewrite xfpregs_set()") Reported-by: Luís Ferreira Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=215524 Link: https://lore.kernel.org/r/YgpFnZpF01WwR8wU@zn.tnic --- arch/x86/kernel/fpu/regset.c | 9 ++++----- arch/x86/kernel/ptrace.c | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 437d7c930c0b..75ffaef8c299 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -91,11 +91,9 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct user32_fxsr_struct newstate; + struct fxregs_state newstate; int ret; - BUILD_BUG_ON(sizeof(newstate) != sizeof(struct fxregs_state)); - if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -116,9 +114,10 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, /* Copy the state */ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); - /* Clear xmm8..15 */ + /* Clear xmm8..15 for 32-bit callers */ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); - memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); + if (in_ia32_syscall()) + memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 6d2244c94799..8d2f2f995539 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1224,7 +1224,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_struct) / sizeof(long), + .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, @@ -1271,7 +1271,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, - .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), + .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, From a679a61520d8a7b0211a1da990404daf5cc80b72 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 18 Feb 2022 11:47:51 +0100 Subject: [PATCH 223/773] fuse: fix fileattr op failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fileattr API conversion broke lsattr on ntfs3g. Previously the ioctl(... FS_IOC_GETFLAGS) returned an EINVAL error, but after the conversion the error returned by the fuse filesystem was not propagated back to the ioctl() system call, resulting in success being returned with bogus values. Fix by checking for outarg.result in fuse_priv_ioctl(), just as generic ioctl code does. Reported-by: Jean-Pierre André Fixes: 72227eac177d ("fuse: convert to fileattr") Cc: # v5.13 Signed-off-by: Miklos Szeredi --- fs/fuse/ioctl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index fbc09dab1f85..df58966bc874 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].value = ptr; err = fuse_simple_request(fm, &args); - if (!err && outarg.flags & FUSE_IOCTL_RETRY) - err = -EIO; - + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } return err; } From efe4186e6a1b54bf38b9e05450d43b0da1fd7739 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 17 Feb 2022 09:43:03 +0800 Subject: [PATCH 224/773] drivers: hamradio: 6pack: fix UAF bug caused by mod_timer() When a 6pack device is detaching, the sixpack_close() will act to cleanup necessary resources. Although del_timer_sync() in sixpack_close() won't return if there is an active timer, one could use mod_timer() in sp_xmit_on_air() to wake up timer again by calling userspace syscall such as ax25_sendmsg(), ax25_connect() and ax25_ioctl(). This unexpected waked handler, sp_xmit_on_air(), realizes nothing about the undergoing cleanup and may still call pty_write() to use driver layer resources that have already been released. One of the possible race conditions is shown below: (USE) | (FREE) ax25_sendmsg() | ax25_queue_xmit() | ... | sp_xmit() | sp_encaps() | sixpack_close() sp_xmit_on_air() | del_timer_sync(&sp->tx_t) mod_timer(&sp->tx_t,...) | ... | unregister_netdev() | ... (wait a while) | tty_release() | tty_release_struct() | release_tty() sp_xmit_on_air() | tty_kref_put(tty_struct) //FREE pty_write(tty_struct) //USE | ... The corresponding fail log is shown below: =============================================================== BUG: KASAN: use-after-free in __run_timers.part.0+0x170/0x470 Write of size 8 at addr ffff88800a652ab8 by task swapper/2/0 ... Call Trace: ... queue_work_on+0x3f/0x50 pty_write+0xcd/0xe0pty_write+0xcd/0xe0 sp_xmit_on_air+0xb2/0x1f0 call_timer_fn+0x28/0x150 __run_timers.part.0+0x3c2/0x470 run_timer_softirq+0x3b/0x80 __do_softirq+0xf1/0x380 ... This patch reorders the del_timer_sync() after the unregister_netdev() to avoid UAF bugs. Because the unregister_netdev() is well synchronized, it flushs out any pending queues, waits the refcount of net_device decreases to zero and removes net_device from kernel. There is not any running routines after executing unregister_netdev(). Therefore, we could not arouse timer from userspace again. Signed-off-by: Duoming Zhou Reviewed-by: Lin Ma Signed-off-by: David S. Miller --- drivers/net/hamradio/6pack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index b1fc153125d9..45c3c4a1101b 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -668,11 +668,11 @@ static void sixpack_close(struct tty_struct *tty) */ netif_stop_queue(sp->dev); + unregister_netdev(sp->dev); + del_timer_sync(&sp->tx_t); del_timer_sync(&sp->resync_t); - unregister_netdev(sp->dev); - /* Free all 6pack frame buffers after unreg. */ kfree(sp->rbuff); kfree(sp->xbuff); From 4224cfd7fb6523f7a9d1c8bb91bb5df1e38eb624 Mon Sep 17 00:00:00 2001 From: suresh kumar Date: Thu, 17 Feb 2022 07:25:18 +0530 Subject: [PATCH 225/773] net-sysfs: add check for netdevice being present to speed_show When bringing down the netdevice or system shutdown, a panic can be triggered while accessing the sysfs path because the device is already removed. [ 755.549084] mlx5_core 0000:12:00.1: Shutdown was called [ 756.404455] mlx5_core 0000:12:00.0: Shutdown was called ... [ 757.937260] BUG: unable to handle kernel NULL pointer dereference at (null) [ 758.031397] IP: [] dma_pool_alloc+0x1ab/0x280 crash> bt ... PID: 12649 TASK: ffff8924108f2100 CPU: 1 COMMAND: "amsd" ... #9 [ffff89240e1a38b0] page_fault at ffffffff8f38c778 [exception RIP: dma_pool_alloc+0x1ab] RIP: ffffffff8ee11acb RSP: ffff89240e1a3968 RFLAGS: 00010046 RAX: 0000000000000246 RBX: ffff89243d874100 RCX: 0000000000001000 RDX: 0000000000000000 RSI: 0000000000000246 RDI: ffff89243d874090 RBP: ffff89240e1a39c0 R8: 000000000001f080 R9: ffff8905ffc03c00 R10: ffffffffc04680d4 R11: ffffffff8edde9fd R12: 00000000000080d0 R13: ffff89243d874090 R14: ffff89243d874080 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #10 [ffff89240e1a39c8] mlx5_alloc_cmd_msg at ffffffffc04680f3 [mlx5_core] #11 [ffff89240e1a3a18] cmd_exec at ffffffffc046ad62 [mlx5_core] #12 [ffff89240e1a3ab8] mlx5_cmd_exec at ffffffffc046b4fb [mlx5_core] #13 [ffff89240e1a3ae8] mlx5_core_access_reg at ffffffffc0475434 [mlx5_core] #14 [ffff89240e1a3b40] mlx5e_get_fec_caps at ffffffffc04a7348 [mlx5_core] #15 [ffff89240e1a3bb0] get_fec_supported_advertised at ffffffffc04992bf [mlx5_core] #16 [ffff89240e1a3c08] mlx5e_get_link_ksettings at ffffffffc049ab36 [mlx5_core] #17 [ffff89240e1a3ce8] __ethtool_get_link_ksettings at ffffffff8f25db46 #18 [ffff89240e1a3d48] speed_show at ffffffff8f277208 #19 [ffff89240e1a3dd8] dev_attr_show at ffffffff8f0b70e3 #20 [ffff89240e1a3df8] sysfs_kf_seq_show at ffffffff8eedbedf #21 [ffff89240e1a3e18] kernfs_seq_show at ffffffff8eeda596 #22 [ffff89240e1a3e28] seq_read at ffffffff8ee76d10 #23 [ffff89240e1a3e98] kernfs_fop_read at ffffffff8eedaef5 #24 [ffff89240e1a3ed8] vfs_read at ffffffff8ee4e3ff #25 [ffff89240e1a3f08] sys_read at ffffffff8ee4f27f #26 [ffff89240e1a3f50] system_call_fastpath at ffffffff8f395f92 crash> net_device.state ffff89443b0c0000 state = 0x5 (__LINK_STATE_START| __LINK_STATE_NOCARRIER) To prevent this scenario, we also make sure that the netdevice is present. Signed-off-by: suresh kumar Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 53ea262ecafd..fbddf966206b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -213,7 +213,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev)) { + if (netif_running(netdev) && netif_device_present(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) From 2f131de361f6d0eaff17db26efdb844c178432f8 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 17 Feb 2022 11:30:48 +0200 Subject: [PATCH 226/773] net/sched: act_ct: Fix flow table lookup after ct clear or switching zones Flow table lookup is skipped if packet either went through ct clear action (which set the IP_CT_UNTRACKED flag on the packet), or while switching zones and there is already a connection associated with the packet. This will result in no SW offload of the connection, and the and connection not being removed from flow table with TCP teardown (fin/rst packet). To fix the above, remove these unneccary checks in flow table lookup. Fixes: 46475bb20f4b ("net/sched: act_ct: Software offload of established flows") Signed-off-by: Paul Blakey Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sched/act_ct.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index f99247fc6468..33e70d60f0bf 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -533,11 +533,6 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, struct nf_conn *ct; u8 dir; - /* Previously seen or loopback */ - ct = nf_ct_get(skb, &ctinfo); - if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) - return false; - switch (family) { case NFPROTO_IPV4: if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph)) From e9da0b56fe27206b49f39805f7dcda8a89379062 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 17 Feb 2022 14:10:44 +0100 Subject: [PATCH 227/773] sr9700: sanity check for packet length A malicious device can leak heap data to user space providing bogus frame lengths. Introduce a sanity check. Signed-off-by: Oliver Neukum Reviewed-by: Grant Grundler Signed-off-by: David S. Miller --- drivers/net/usb/sr9700.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index b658510cc9a4..5a53e63d33a6 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -413,7 +413,7 @@ static int sr9700_rx_fixup(struct usbnet *dev, struct sk_buff *skb) /* ignore the CRC length */ len = (skb->data[1] | (skb->data[2] << 8)) - 4; - if (len > ETH_FRAME_LEN) + if (len > ETH_FRAME_LEN || len > skb->len) return 0; /* the last packet of current skb */ From a1cdec57e03a1352e92fbbe7974039dda4efcec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 09:05:02 -0800 Subject: [PATCH 228/773] net-timestamp: convert sk->sk_tskey to atomic_t UDP sendmsg() can be lockless, this is causing all kinds of data races. This patch converts sk->sk_tskey to remove one of these races. BUG: KCSAN: data-race in __ip_append_data / __ip_append_data read to 0xffff8881035d4b6c of 4 bytes by task 8877 on cpu 1: __ip_append_data+0x1c1/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881035d4b6c of 4 bytes by task 8880 on cpu 0: __ip_append_data+0x1d8/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000054d -> 0x0000054e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8880 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00167-gdcb85f85fa6f-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- net/can/j1939/transport.c | 2 +- net/core/skbuff.c | 2 +- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..50aecd28b355 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,7 +507,7 @@ struct sock { #endif u16 sk_tsflags; u8 sk_shutdown; - u32 sk_tskey; + atomic_t sk_tskey; atomic_t sk_zckey; u8 sk_clockid; @@ -2667,7 +2667,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index a271688780a2..307ee1174a6e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2006,7 +2006,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; - skcb->tskey = session->sk->sk_tskey++; + skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9d0388bed0c1..6a15ce3eb1d3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4730,7 +4730,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) - serr->ee.ee_data -= sk->sk_tskey; + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 4ff806d71921..6eb174805bf0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -879,9 +879,9 @@ int sock_set_timestamping(struct sock *sk, int optname, if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - sk->sk_tskey = tcp_sk(sk)->snd_una; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { - sk->sk_tskey = 0; + atomic_set(&sk->sk_tskey, 0); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 139cec29ed06..7911916a480b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -991,7 +991,7 @@ static int __ip_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..304a295de84f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1465,7 +1465,7 @@ static int __ip6_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); From b352c3465bb808ab700d03f5bac2f7a6f37c5350 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Fri, 18 Feb 2022 10:19:39 +0800 Subject: [PATCH 229/773] net: ll_temac: check the return value of devm_kmalloc() devm_kmalloc() returns a pointer to allocated memory on success, NULL on failure. While lp->indirect_lock is allocated by devm_kmalloc() without proper check. It is better to check the value of it to prevent potential wrong memory access. Fixes: f14f5c11f051 ("net: ll_temac: Support indirect_mutex share within TEMAC IP") Signed-off-by: Xiaoke Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index b900ab5aef2a..64c7e26c3b75 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1433,6 +1433,8 @@ static int temac_probe(struct platform_device *pdev) lp->indirect_lock = devm_kmalloc(&pdev->dev, sizeof(*lp->indirect_lock), GFP_KERNEL); + if (!lp->indirect_lock) + return -ENOMEM; spin_lock_init(lp->indirect_lock); } From f268088f65af69aa6ae1fb65e696cbb6478bcc9a Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 18 Feb 2022 17:33:33 +0800 Subject: [PATCH 230/773] perf test: Skip failing sigtrap test for arm+aarch64 Skip the Sigtrap test for arm + arm64, same as was done for s390 in commit a840974e96fd ("perf test: Test 73 Sig_trap fails on s390"). For this, reuse BP_SIGNAL_IS_SUPPORTED - meaning that the arch can use BP to generate signals - instead of BP_ACCOUNT_IS_SUPPORTED, which is appropriate. As described by Will at [0], in the test we get stuck in a loop of handling the HW breakpoint exception and never making progress. GDB handles this by stepping over the faulting instruction, but with perf the kernel is expected to handle the step (which it doesn't for arm). Dmitry made an attempt to get this work, also mentioned in the same thread as [0], which was appreciated. But the best thing to do is skip the test for now. [0] https://lore.kernel.org/linux-perf-users/20220118124343.GC98966@leoy-ThinkPad-X240s/T/#m13b06c39d2a5100d340f009435df6f4d8ee57b5a Fixes: 5504f67944484495 ("perf test sigtrap: Add basic stress test for sigtrap handling") Signed-off-by: John Garry Tested-by: Leo Yan Acked-by: Marco Elver Cc: Dmitriy Vyukov Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Marco Elver Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Richter Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux@armlinux.org.uk Link: https://lore.kernel.org/r/1645176813-202756-1-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sigtrap.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c index 1f147fe6595f..e32ece90e164 100644 --- a/tools/perf/tests/sigtrap.c +++ b/tools/perf/tests/sigtrap.c @@ -22,19 +22,6 @@ #include "tests.h" #include "../perf-sys.h" -/* - * PowerPC and S390 do not support creation of instruction breakpoints using the - * perf_event interface. - * - * Just disable the test for these architectures until these issues are - * resolved. - */ -#if defined(__powerpc__) || defined(__s390x__) -#define BP_ACCOUNT_IS_SUPPORTED 0 -#else -#define BP_ACCOUNT_IS_SUPPORTED 1 -#endif - #define NUM_THREADS 5 static struct { @@ -135,7 +122,7 @@ static int test__sigtrap(struct test_suite *test __maybe_unused, int subtest __m char sbuf[STRERR_BUFSIZE]; int i, fd, ret = TEST_FAIL; - if (!BP_ACCOUNT_IS_SUPPORTED) { + if (!BP_SIGNAL_IS_SUPPORTED) { pr_debug("Test not supported on this architecture"); return TEST_SKIP; } From 8a3d2ee0de3828e0d01f9682d35ee53704659bd0 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Fri, 18 Feb 2022 17:31:27 +0800 Subject: [PATCH 231/773] perf evlist: Fix failed to use cpu list for uncore events The 'perf record' and 'perf stat' commands have supported the option '-C/--cpus' to count or collect only on the list of CPUs provided. Commit 1d3351e631fc34d7 ("perf tools: Enable on a list of CPUs for hybrid") add it to be supported for hybrid. For hybrid support, it checks the cpu list are available on hybrid PMU. But when we test only uncore events(or events not in cpu_core and cpu_atom), there is a bug: Before: # perf stat -C0 -e uncore_clock/clockticks/ sleep 1 failed to use cpu list 0 In this case, for uncore event, its pmu_name is not cpu_core or cpu_atom, so in evlist__fix_hybrid_cpus, perf_pmu__find_hybrid_pmu should return NULL,both events_nr and unmatched_count should be 0 ,then the cpu list check function evlist__fix_hybrid_cpus return -1 and the error "failed to use cpu list 0" will happen. Bypass "events_nr=0" case then the issue is fixed. After: # perf stat -C0 -e uncore_clock/clockticks/ sleep 1 Performance counter stats for 'CPU(s) 0': 195,476,873 uncore_clock/clockticks/ 1.004518677 seconds time elapsed When testing with at least one core event and uncore events, it has no issue. # perf stat -C0 -e cpu_core/cpu-cycles/,uncore_clock/clockticks/ sleep 1 Performance counter stats for 'CPU(s) 0': 5,993,774 cpu_core/cpu-cycles/ 301,025,912 uncore_clock/clockticks/ 1.003964934 seconds time elapsed Fixes: 1d3351e631fc34d7 ("perf tools: Enable on a list of CPUs for hybrid") Reviewed-by: Kan Liang Signed-off-by: Zhengjun Xing Cc: Adrian Hunter Cc: alexander.shishkin@intel.com Cc: Andi Kleen Cc: Ian Rogers Cc: Jin Yao Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20220218093127.1844241-1-zhengjun.xing@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist-hybrid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index 7f234215147d..57f02beef023 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -154,8 +154,8 @@ int evlist__fix_hybrid_cpus(struct evlist *evlist, const char *cpu_list) perf_cpu_map__put(matched_cpus); perf_cpu_map__put(unmatched_cpus); } - - ret = (unmatched_count == events_nr) ? -1 : 0; + if (events_nr) + ret = (unmatched_count == events_nr) ? -1 : 0; out: perf_cpu_map__put(cpus); return ret; From 8700af2cc18c919b2a83e74e0479038fd113c15d Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 17 Feb 2022 04:09:28 +0100 Subject: [PATCH 232/773] RDMA/rtrs-clt: Fix possible double free in error case Callback function rtrs_clt_dev_release() for put_device() calls kfree(clt) to free memory. We shouldn't call kfree(clt) again, and we can't use the clt after kfree too. Replace device_register() with device_initialize() and device_add() so that dev_set_name can() be used appropriately. Move mutex_destroy() to the release function so it can be called in the alloc_clt err path. Fixes: eab098246625 ("RDMA/rtrs-clt: Refactor the failure cases in alloc_clt") Link: https://lore.kernel.org/r/20220217030929.323849-1-haris.iqbal@ionos.com Reported-by: Miaoqian Lin Signed-off-by: Md Haris Iqbal Reviewed-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 37 ++++++++++++++------------ 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 7c3f98e57889..6164b07a1644 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2682,6 +2682,8 @@ static void rtrs_clt_dev_release(struct device *dev) struct rtrs_clt_sess *clt = container_of(dev, struct rtrs_clt_sess, dev); + mutex_destroy(&clt->paths_ev_mutex); + mutex_destroy(&clt->paths_mutex); kfree(clt); } @@ -2711,6 +2713,8 @@ static struct rtrs_clt_sess *alloc_clt(const char *sessname, size_t paths_num, return ERR_PTR(-ENOMEM); } + clt->dev.class = rtrs_clt_dev_class; + clt->dev.release = rtrs_clt_dev_release; uuid_gen(&clt->paths_uuid); INIT_LIST_HEAD_RCU(&clt->paths_list); clt->paths_num = paths_num; @@ -2727,43 +2731,41 @@ static struct rtrs_clt_sess *alloc_clt(const char *sessname, size_t paths_num, init_waitqueue_head(&clt->permits_wait); mutex_init(&clt->paths_ev_mutex); mutex_init(&clt->paths_mutex); + device_initialize(&clt->dev); - clt->dev.class = rtrs_clt_dev_class; - clt->dev.release = rtrs_clt_dev_release; err = dev_set_name(&clt->dev, "%s", sessname); if (err) - goto err; + goto err_put; + /* * Suppress user space notification until * sysfs files are created */ dev_set_uevent_suppress(&clt->dev, true); - err = device_register(&clt->dev); - if (err) { - put_device(&clt->dev); - goto err; - } + err = device_add(&clt->dev); + if (err) + goto err_put; clt->kobj_paths = kobject_create_and_add("paths", &clt->dev.kobj); if (!clt->kobj_paths) { err = -ENOMEM; - goto err_dev; + goto err_del; } err = rtrs_clt_create_sysfs_root_files(clt); if (err) { kobject_del(clt->kobj_paths); kobject_put(clt->kobj_paths); - goto err_dev; + goto err_del; } dev_set_uevent_suppress(&clt->dev, false); kobject_uevent(&clt->dev.kobj, KOBJ_ADD); return clt; -err_dev: - device_unregister(&clt->dev); -err: +err_del: + device_del(&clt->dev); +err_put: free_percpu(clt->pcpu_path); - kfree(clt); + put_device(&clt->dev); return ERR_PTR(err); } @@ -2771,9 +2773,10 @@ static void free_clt(struct rtrs_clt_sess *clt) { free_permits(clt); free_percpu(clt->pcpu_path); - mutex_destroy(&clt->paths_ev_mutex); - mutex_destroy(&clt->paths_mutex); - /* release callback will free clt in last put */ + + /* + * release callback will free clt and destroy mutexes in last put + */ device_unregister(&clt->dev); } From c46fa8911b17e3f808679061a8af8bee219f4602 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 17 Feb 2022 04:09:29 +0100 Subject: [PATCH 233/773] RDMA/rtrs-clt: Move free_permit from free_clt to rtrs_clt_close Error path of rtrs_clt_open() calls free_clt(), where free_permit is called. This is wrong since error path of rtrs_clt_open() does not need to call free_permit(). Also, moving free_permits() call to rtrs_clt_close(), makes it more aligned with the call to alloc_permit() in rtrs_clt_open(). Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Link: https://lore.kernel.org/r/20220217030929.323849-2-haris.iqbal@ionos.com Signed-off-by: Md Haris Iqbal Reviewed-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 6164b07a1644..759b85f03331 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2771,7 +2771,6 @@ err_put: static void free_clt(struct rtrs_clt_sess *clt) { - free_permits(clt); free_percpu(clt->pcpu_path); /* @@ -2893,6 +2892,7 @@ void rtrs_clt_close(struct rtrs_clt_sess *clt) rtrs_clt_destroy_path_files(clt_path, NULL); kobject_put(&clt_path->kobj); } + free_permits(clt); free_clt(clt); } EXPORT_SYMBOL(rtrs_clt_close); From b70bc066d77b460a63a8c3fb2ea0d811ce862a83 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 17 Dec 2021 12:36:25 +0100 Subject: [PATCH 234/773] ice: Match on all profiles in slow-path In switchdev mode, slow-path rules need to match all protocols, in order to correctly redirect unfiltered or missed packets to the uplink. To set this up for the virtual function to uplink flow, the rule that redirects packets to the control VSI must have the tunnel type set to ICE_SW_TUN_AND_NON_TUN. As a result of that new tunnel type being set, ice_get_compat_fv_bitmap will select ICE_PROF_ALL. At that point all profiles would be selected for this rule, resulting in the desired behavior. Without this change slow-path would not work with tunnel protocols. Fixes: 8b032a55c1bd ("ice: low level support for tunnels") Signed-off-by: Wojciech Drewek Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 1 + drivers/net/ethernet/intel/ice/ice_protocol_type.h | 1 + drivers/net/ethernet/intel/ice/ice_switch.c | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 864692b157b6..73edc24d81d5 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -44,6 +44,7 @@ ice_eswitch_add_vf_mac_rule(struct ice_pf *pf, struct ice_vf *vf, const u8 *mac) ctrl_vsi->rxq_map[vf->vf_id]; rule_info.flags_info.act |= ICE_SINGLE_ACT_LB_ENABLE; rule_info.flags_info.act_valid = true; + rule_info.tun_type = ICE_SW_TUN_AND_NON_TUN; err = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, vf->repr->mac_rule); diff --git a/drivers/net/ethernet/intel/ice/ice_protocol_type.h b/drivers/net/ethernet/intel/ice/ice_protocol_type.h index dc1b0e9e6df5..695b6dd61dc2 100644 --- a/drivers/net/ethernet/intel/ice/ice_protocol_type.h +++ b/drivers/net/ethernet/intel/ice/ice_protocol_type.h @@ -47,6 +47,7 @@ enum ice_protocol_type { enum ice_sw_tunnel_type { ICE_NON_TUN = 0, + ICE_SW_TUN_AND_NON_TUN, ICE_SW_TUN_VXLAN, ICE_SW_TUN_GENEVE, ICE_SW_TUN_NVGRE, diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 11ae0bee3590..475ec2afa210 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -4537,6 +4537,7 @@ ice_get_compat_fv_bitmap(struct ice_hw *hw, struct ice_adv_rule_info *rinfo, case ICE_SW_TUN_NVGRE: prof_type = ICE_PROF_TUN_GRE; break; + case ICE_SW_TUN_AND_NON_TUN: default: prof_type = ICE_PROF_ALL; break; @@ -5305,7 +5306,8 @@ ice_add_adv_rule(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, if (status) goto err_ice_add_adv_rule; - if (rinfo->tun_type != ICE_NON_TUN) { + if (rinfo->tun_type != ICE_NON_TUN && + rinfo->tun_type != ICE_SW_TUN_AND_NON_TUN) { status = ice_fill_adv_packet_tun(hw, rinfo->tun_type, s_rule->pdata.lkup_tx_rx.hdr, pkt_offsets); From 932645c298c41aad64ef13016ff4c2034eef5aed Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Mon, 3 Jan 2022 07:41:21 +0100 Subject: [PATCH 235/773] ice: fix setting l4 port flag when adding filter Accidentally filter flag for none encapsulated l4 port field is always set. Even if user wants to add encapsulated l4 port field. Remove this unnecessary flag setting. Fixes: 9e300987d4a81 ("ice: VXLAN and Geneve TC support") Signed-off-by: Michal Swiatkowski Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index e8aab664270a..65cf32eb4046 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -709,7 +709,7 @@ ice_tc_set_port(struct flow_match_ports match, fltr->flags |= ICE_TC_FLWR_FIELD_ENC_DEST_L4_PORT; else fltr->flags |= ICE_TC_FLWR_FIELD_DEST_L4_PORT; - fltr->flags |= ICE_TC_FLWR_FIELD_DEST_L4_PORT; + headers->l4_key.dst_port = match.key->dst; headers->l4_mask.dst_port = match.mask->dst; } @@ -718,7 +718,7 @@ ice_tc_set_port(struct flow_match_ports match, fltr->flags |= ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT; else fltr->flags |= ICE_TC_FLWR_FIELD_SRC_L4_PORT; - fltr->flags |= ICE_TC_FLWR_FIELD_SRC_L4_PORT; + headers->l4_key.src_port = match.key->src; headers->l4_mask.src_port = match.mask->src; } From fadead80fe4c033b5e514fcbadd20b55c4494112 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 7 Feb 2022 10:23:29 -0800 Subject: [PATCH 236/773] ice: fix concurrent reset and removal of VFs Commit c503e63200c6 ("ice: Stop processing VF messages during teardown") introduced a driver state flag, ICE_VF_DEINIT_IN_PROGRESS, which is intended to prevent some issues with concurrently handling messages from VFs while tearing down the VFs. This change was motivated by crashes caused while tearing down and bringing up VFs in rapid succession. It turns out that the fix actually introduces issues with the VF driver caused because the PF no longer responds to any messages sent by the VF during its .remove routine. This results in the VF potentially removing its DMA memory before the PF has shut down the device queues. Additionally, the fix doesn't actually resolve concurrency issues within the ice driver. It is possible for a VF to initiate a reset just prior to the ice driver removing VFs. This can result in the remove task concurrently operating while the VF is being reset. This results in similar memory corruption and panics purportedly fixed by that commit. Fix this concurrency at its root by protecting both the reset and removal flows using the existing VF cfg_lock. This ensures that we cannot remove the VF while any outstanding critical tasks such as a virtchnl message or a reset are occurring. This locking change also fixes the root cause originally fixed by commit c503e63200c6 ("ice: Stop processing VF messages during teardown"), so we can simply revert it. Note that I kept these two changes together because simply reverting the original commit alone would leave the driver vulnerable to worse race conditions. Fixes: c503e63200c6 ("ice: Stop processing VF messages during teardown") Signed-off-by: Jacob Keller Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 - drivers/net/ethernet/intel/ice/ice_main.c | 2 + .../net/ethernet/intel/ice/ice_virtchnl_pf.c | 42 +++++++++++-------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index a9fa701aaa95..473b1f6be9de 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -280,7 +280,6 @@ enum ice_pf_state { ICE_VFLR_EVENT_PENDING, ICE_FLTR_OVERFLOW_PROMISC, ICE_VF_DIS, - ICE_VF_DEINIT_IN_PROGRESS, ICE_CFG_BUSY, ICE_SERVICE_SCHED, ICE_SERVICE_DIS, diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 17a9bb461dc3..f3c346e13b7a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1799,7 +1799,9 @@ static void ice_handle_mdd_event(struct ice_pf *pf) * reset, so print the event prior to reset. */ ice_print_vf_rx_mdd_event(vf); + mutex_lock(&pf->vf[i].cfg_lock); ice_reset_vf(&pf->vf[i], false); + mutex_unlock(&pf->vf[i].cfg_lock); } } } diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 39b80124d282..408f78e3eb13 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -500,8 +500,6 @@ void ice_free_vfs(struct ice_pf *pf) struct ice_hw *hw = &pf->hw; unsigned int tmp, i; - set_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state); - if (!pf->vf) return; @@ -519,22 +517,26 @@ void ice_free_vfs(struct ice_pf *pf) else dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n"); - /* Avoid wait time by stopping all VFs at the same time */ - ice_for_each_vf(pf, i) - ice_dis_vf_qs(&pf->vf[i]); - tmp = pf->num_alloc_vfs; pf->num_qps_per_vf = 0; pf->num_alloc_vfs = 0; for (i = 0; i < tmp; i++) { - if (test_bit(ICE_VF_STATE_INIT, pf->vf[i].vf_states)) { + struct ice_vf *vf = &pf->vf[i]; + + mutex_lock(&vf->cfg_lock); + + ice_dis_vf_qs(vf); + + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) { /* disable VF qp mappings and set VF disable state */ - ice_dis_vf_mappings(&pf->vf[i]); - set_bit(ICE_VF_STATE_DIS, pf->vf[i].vf_states); - ice_free_vf_res(&pf->vf[i]); + ice_dis_vf_mappings(vf); + set_bit(ICE_VF_STATE_DIS, vf->vf_states); + ice_free_vf_res(vf); } - mutex_destroy(&pf->vf[i].cfg_lock); + mutex_unlock(&vf->cfg_lock); + + mutex_destroy(&vf->cfg_lock); } if (ice_sriov_free_msix_res(pf)) @@ -570,7 +572,6 @@ void ice_free_vfs(struct ice_pf *pf) i); clear_bit(ICE_VF_DIS, pf->state); - clear_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state); clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags); } @@ -1498,6 +1499,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) ice_for_each_vf(pf, v) { vf = &pf->vf[v]; + mutex_lock(&vf->cfg_lock); + vf->driver_caps = 0; ice_vc_set_default_allowlist(vf); @@ -1512,6 +1515,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) ice_vf_pre_vsi_rebuild(vf); ice_vf_rebuild_vsi(vf); ice_vf_post_vsi_rebuild(vf); + + mutex_unlock(&vf->cfg_lock); } if (ice_is_eswitch_mode_switchdev(pf)) @@ -1562,6 +1567,8 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) u32 reg; int i; + lockdep_assert_held(&vf->cfg_lock); + dev = ice_pf_to_dev(pf); if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) { @@ -2061,9 +2068,12 @@ void ice_process_vflr_event(struct ice_pf *pf) bit_idx = (hw->func_caps.vf_base_id + vf_id) % 32; /* read GLGEN_VFLRSTAT register to find out the flr VFs */ reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx)); - if (reg & BIT(bit_idx)) + if (reg & BIT(bit_idx)) { /* GLGEN_VFLRSTAT bit will be cleared in ice_reset_vf */ + mutex_lock(&vf->cfg_lock); ice_reset_vf(vf, true); + mutex_unlock(&vf->cfg_lock); + } } } @@ -2140,7 +2150,9 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event) if (!vf) return; + mutex_lock(&vf->cfg_lock); ice_vc_reset_vf(vf); + mutex_unlock(&vf->cfg_lock); } /** @@ -4625,10 +4637,6 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) struct device *dev; int err = 0; - /* if de-init is underway, don't process messages from VF */ - if (test_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state)) - return; - dev = ice_pf_to_dev(pf); if (ice_validate_vf_id(pf, vf_id)) { err = -EINVAL; From ed22d9c8d128293fc7b0b086c7d3654bcb99a8dd Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 14 Feb 2022 06:33:27 -0800 Subject: [PATCH 237/773] ice: check the return of ice_ptp_gettimex64 Clang static analysis reports this issue time64.h:69:50: warning: The left operand of '+' is a garbage value set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, ~~~~~~~~~~ ^ In ice_ptp_adjtime_nonatomic(), the timespec64 variable 'now' is set by ice_ptp_gettimex64(). This function can fail with -EBUSY, so 'now' can have a gargbage value. So check the return. Fixes: 06c16d89d2cb ("ice: register 1588 PTP clock device object for E810 devices") Signed-off-by: Tom Rix Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index ae291d442539..000c39d163a2 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1533,9 +1533,12 @@ exit: static int ice_ptp_adjtime_nonatomic(struct ptp_clock_info *info, s64 delta) { struct timespec64 now, then; + int ret; then = ns_to_timespec64(delta); - ice_ptp_gettimex64(info, &now, NULL); + ret = ice_ptp_gettimex64(info, &now, NULL); + if (ret) + return ret; now = timespec64_add(now, then); return ice_ptp_settime64(info, (const struct timespec64 *)&now); From 5950bdc88dd1d158f2845fdff8fb1de86476806c Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 14 Feb 2022 07:40:43 -0800 Subject: [PATCH 238/773] ice: initialize local variable 'tlv' Clang static analysis reports this issues ice_common.c:5008:21: warning: The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage ldo->phy_type_low |= ((u64)buf << (i * 16)); ~~~~~~~~~~~~~~~~~ ^ When called from ice_cfg_phy_fec() ldo is the uninitialized local variable tlv. So initialize. Fixes: ea78ce4dab05 ("ice: add link lenient and default override support") Signed-off-by: Tom Rix Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index a6d7d3eff186..e2af99a763ed 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -3340,7 +3340,7 @@ ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, if (fec == ICE_FEC_AUTO && ice_fw_supports_link_override(hw) && !ice_fw_supports_report_dflt_cfg(hw)) { - struct ice_link_default_override_tlv tlv; + struct ice_link_default_override_tlv tlv = { 0 }; status = ice_get_link_default_override(&tlv, pi); if (status) From 7b1f781f2d2460693f43d5f764198df558e3494b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 15 Feb 2022 13:32:26 -0800 Subject: [PATCH 239/773] Input: psmouse - set up dependency between PS/2 and SMBus companions When we switch from emulated PS/2 to native (RMI4 or Elan) protocols, we create SMBus companion devices that are attached to I2C/SMBus controllers. However, when suspending and resuming, we also need to make sure that we take into account the PS/2 device they are associated with, so that PS/2 device is suspended after the companion and resumed before it, otherwise companions will not work properly. Before I2C devices were marked for asynchronous suspend/resume, this ordering happened naturally, but now we need to enforce it by establishing device links, with PS/2 devices being suppliers and SMBus companions being consumers. Fixes: 172d931910e1 ("i2c: enable async suspend/resume on i2c client devices") Reported-and-tested-by: Hugh Dickins Tested-by: Jarkko Nikula Link: https://lore.kernel.org/r/89456fcd-a113-4c82-4b10-a9bcaefac68f@google.com Link: https://lore.kernel.org/r/YgwQN8ynO88CPMju@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/psmouse-smbus.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/input/mouse/psmouse-smbus.c b/drivers/input/mouse/psmouse-smbus.c index a472489ccbad..164f6c757f6b 100644 --- a/drivers/input/mouse/psmouse-smbus.c +++ b/drivers/input/mouse/psmouse-smbus.c @@ -75,6 +75,8 @@ static void psmouse_smbus_detach_i2c_client(struct i2c_client *client) "Marking SMBus companion %s as gone\n", dev_name(&smbdev->client->dev)); smbdev->dead = true; + device_link_remove(&smbdev->client->dev, + &smbdev->psmouse->ps2dev.serio->dev); serio_rescan(smbdev->psmouse->ps2dev.serio); } else { list_del(&smbdev->node); @@ -174,6 +176,8 @@ static void psmouse_smbus_disconnect(struct psmouse *psmouse) kfree(smbdev); } else { smbdev->dead = true; + device_link_remove(&smbdev->client->dev, + &psmouse->ps2dev.serio->dev); psmouse_dbg(smbdev->psmouse, "posting removal request for SMBus companion %s\n", dev_name(&smbdev->client->dev)); @@ -270,6 +274,12 @@ int psmouse_smbus_init(struct psmouse *psmouse, if (smbdev->client) { /* We have our companion device */ + if (!device_link_add(&smbdev->client->dev, + &psmouse->ps2dev.serio->dev, + DL_FLAG_STATELESS)) + psmouse_warn(psmouse, + "failed to set up link with iSMBus companion %s\n", + dev_name(&smbdev->client->dev)); return 0; } From efd12405f1801ef0458d908a844317fb1388c3bf Mon Sep 17 00:00:00 2001 From: Li Yang Date: Mon, 8 Nov 2021 18:07:51 -0600 Subject: [PATCH 240/773] dt-bindings: qoriq-clock: add missing compatible for lx2160a The compatible string is already in use, fix the binding to include it. Signed-off-by: Li Yang Acked-by: Rob Herring --- Documentation/devicetree/bindings/clock/qoriq-clock.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/clock/qoriq-clock.txt b/Documentation/devicetree/bindings/clock/qoriq-clock.txt index f7d48f23da44..10119d9ef4b1 100644 --- a/Documentation/devicetree/bindings/clock/qoriq-clock.txt +++ b/Documentation/devicetree/bindings/clock/qoriq-clock.txt @@ -44,6 +44,7 @@ Required properties: * "fsl,ls1046a-clockgen" * "fsl,ls1088a-clockgen" * "fsl,ls2080a-clockgen" + * "fsl,lx2160a-clockgen" Chassis-version clock strings include: * "fsl,qoriq-clockgen-1.0": for chassis 1.0 clocks * "fsl,qoriq-clockgen-2.0": for chassis 2.0 clocks From 6b4266b8deb857ce2dc2a9b769b242865b9a0bce Mon Sep 17 00:00:00 2001 From: Li Yang Date: Mon, 8 Nov 2021 18:10:18 -0600 Subject: [PATCH 241/773] dt-bindings: fsl,layerscape-dcfg: add missing compatible for lx2160a The compatible string is already in use, fix the chip list in binding to include it. Signed-off-by: Li Yang Acked-by: Rob Herring --- .../devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt index b5cb374dc47d..10a91cc8b997 100644 --- a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt +++ b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt @@ -8,7 +8,7 @@ Required properties: - compatible: Should contain a chip-specific compatible string, Chip-specific strings are of the form "fsl,-dcfg", The following s are known to be supported: - ls1012a, ls1021a, ls1043a, ls1046a, ls2080a. + ls1012a, ls1021a, ls1043a, ls1046a, ls2080a, lx2160a - reg : should contain base address and length of DCFG memory-mapped registers From 988f0a9045b0058a43ccee764a671dfab81e6d15 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:52 +0200 Subject: [PATCH 242/773] soc: fsl: Replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- include/soc/fsl/dpaa2-fd.h | 3 ++- include/soc/fsl/qe/immap_qe.h | 3 ++- include/soc/fsl/qe/qe_tdm.h | 4 +++- include/soc/fsl/qe/ucc_fast.h | 2 +- include/soc/fsl/qe/ucc_slow.h | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/soc/fsl/dpaa2-fd.h b/include/soc/fsl/dpaa2-fd.h index 90ae8d191f1a..bae490cac0aa 100644 --- a/include/soc/fsl/dpaa2-fd.h +++ b/include/soc/fsl/dpaa2-fd.h @@ -7,7 +7,8 @@ #ifndef __FSL_DPAA2_FD_H #define __FSL_DPAA2_FD_H -#include +#include +#include /** * DOC: DPAA2 FD - Frame Descriptor APIs for DPAA2 diff --git a/include/soc/fsl/qe/immap_qe.h b/include/soc/fsl/qe/immap_qe.h index 7614fee532f1..edd601f53f5d 100644 --- a/include/soc/fsl/qe/immap_qe.h +++ b/include/soc/fsl/qe/immap_qe.h @@ -13,7 +13,8 @@ #define _ASM_POWERPC_IMMAP_QE_H #ifdef __KERNEL__ -#include +#include + #include #define QE_IMMAP_SIZE (1024 * 1024) /* 1MB from 1MB+IMMR */ diff --git a/include/soc/fsl/qe/qe_tdm.h b/include/soc/fsl/qe/qe_tdm.h index b6febe225071..43ea830cfe1f 100644 --- a/include/soc/fsl/qe/qe_tdm.h +++ b/include/soc/fsl/qe/qe_tdm.h @@ -10,8 +10,8 @@ #ifndef _QE_TDM_H_ #define _QE_TDM_H_ -#include #include +#include #include #include @@ -19,6 +19,8 @@ #include #include +struct device_node; + /* SI RAM entries */ #define SIR_LAST 0x0001 #define SIR_BYTE 0x0002 diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h index 9696a5b9b5d1..ad60b87a3c69 100644 --- a/include/soc/fsl/qe/ucc_fast.h +++ b/include/soc/fsl/qe/ucc_fast.h @@ -10,7 +10,7 @@ #ifndef __UCC_FAST_H__ #define __UCC_FAST_H__ -#include +#include #include #include diff --git a/include/soc/fsl/qe/ucc_slow.h b/include/soc/fsl/qe/ucc_slow.h index 11a216e4e919..7548ce8a202d 100644 --- a/include/soc/fsl/qe/ucc_slow.h +++ b/include/soc/fsl/qe/ucc_slow.h @@ -11,7 +11,7 @@ #ifndef __UCC_SLOW_H__ #define __UCC_SLOW_H__ -#include +#include #include #include From f2b70418ec6f104981b54709a4cfe3a3c46b7d8f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:50 +0200 Subject: [PATCH 243/773] soc: fsl: Correct MAINTAINERS database (QUICC ENGINE LIBRARY) MAINTAINERS lacks of proper coverage for FSL headers. Fix it accordingly. Fixes: 7aa1aa6ecec2 ("QE: Move QE from arch/powerpc to drivers/soc") Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..9abac76757ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7736,8 +7736,7 @@ M: Qiang Zhao L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/soc/fsl/qe/ -F: include/soc/fsl/*qe*.h -F: include/soc/fsl/*ucc*.h +F: include/soc/fsl/qe/ FREESCALE QUICC ENGINE UCC ETHERNET DRIVER M: Li Yang From b80af7564446c8ab96438cac00e0575eb86154ad Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Nov 2021 12:59:51 +0200 Subject: [PATCH 244/773] soc: fsl: Correct MAINTAINERS database (SOC) MAINTAINERS lacks of proper coverage for FSL headers. Fix it accordingly. Fixes: 1b48706f027c ("MAINTAINERS: add entry for Freescale SoC drivers") Signed-off-by: Andy Shevchenko Signed-off-by: Li Yang --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9abac76757ad..1ab143aab365 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7767,6 +7767,7 @@ F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ F: include/linux/fsl/ +F: include/soc/fsl/ FREESCALE SOC FS_ENET DRIVER M: Pantelis Antoniou From b113737cf12964a20cc3ba1ddabe6229099661c6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 3 Nov 2021 21:00:17 +0100 Subject: [PATCH 245/773] soc: fsl: guts: Revert commit 3c0d64e867ed This reverts commit 3c0d64e867ed ("soc: fsl: guts: reuse machine name from device tree"). A following patch will fix the missing memory allocation failure check instead. Suggested-by: Tyrel Datwyler Signed-off-by: Christophe JAILLET Signed-off-by: Li Yang --- drivers/soc/fsl/guts.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c index 072473a16f4d..af7741eafc57 100644 --- a/drivers/soc/fsl/guts.c +++ b/drivers/soc/fsl/guts.c @@ -28,7 +28,6 @@ struct fsl_soc_die_attr { static struct guts *guts; static struct soc_device_attribute soc_dev_attr; static struct soc_device *soc_dev; -static struct device_node *root; /* SoC die attribute definition for QorIQ platform */ @@ -138,7 +137,7 @@ static u32 fsl_guts_get_svr(void) static int fsl_guts_probe(struct platform_device *pdev) { - struct device_node *np = pdev->dev.of_node; + struct device_node *root, *np = pdev->dev.of_node; struct device *dev = &pdev->dev; const struct fsl_soc_die_attr *soc_die; const char *machine; @@ -159,8 +158,9 @@ static int fsl_guts_probe(struct platform_device *pdev) root = of_find_node_by_path("/"); if (of_property_read_string(root, "model", &machine)) of_property_read_string_index(root, "compatible", 0, &machine); + of_node_put(root); if (machine) - soc_dev_attr.machine = machine; + soc_dev_attr.machine = devm_kstrdup(dev, machine, GFP_KERNEL); svr = fsl_guts_get_svr(); soc_die = fsl_soc_die_match(svr, fsl_soc_die); @@ -195,7 +195,6 @@ static int fsl_guts_probe(struct platform_device *pdev) static int fsl_guts_remove(struct platform_device *dev) { soc_device_unregister(soc_dev); - of_node_put(root); return 0; } From b9abe942cda43a1d46a0fd96efb54f1aa909f757 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 3 Nov 2021 21:00:33 +0100 Subject: [PATCH 246/773] soc: fsl: guts: Add a missing memory allocation failure check If 'devm_kstrdup()' fails, we should return -ENOMEM. While at it, move the 'of_node_put()' call in the error handling path and after the 'machine' has been copied. Better safe than sorry. Fixes: a6fc3b698130 ("soc: fsl: add GUTS driver for QorIQ platforms") Depends-on: fddacc7ff4dd ("soc: fsl: guts: Revert commit 3c0d64e867ed") Suggested-by: Tyrel Datwyler Signed-off-by: Christophe JAILLET Signed-off-by: Li Yang --- drivers/soc/fsl/guts.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c index af7741eafc57..5ed2fc1c53a0 100644 --- a/drivers/soc/fsl/guts.c +++ b/drivers/soc/fsl/guts.c @@ -158,9 +158,14 @@ static int fsl_guts_probe(struct platform_device *pdev) root = of_find_node_by_path("/"); if (of_property_read_string(root, "model", &machine)) of_property_read_string_index(root, "compatible", 0, &machine); - of_node_put(root); - if (machine) + if (machine) { soc_dev_attr.machine = devm_kstrdup(dev, machine, GFP_KERNEL); + if (!soc_dev_attr.machine) { + of_node_put(root); + return -ENOMEM; + } + } + of_node_put(root); svr = fsl_guts_get_svr(); soc_die = fsl_soc_die_match(svr, fsl_soc_die); From 6385960501d9e0248a8745714674e86bd077e198 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sat, 11 Dec 2021 17:08:45 +0800 Subject: [PATCH 247/773] soc: fsl: qe: fix typo in a comment The double `is' in the comment in line 150 is repeated. Remove one of them from the comment. Also removes a redundant tab in a new line. Signed-off-by: Jason Wang Signed-off-by: Li Yang --- drivers/soc/fsl/qe/qe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c index 4d38c80f8be8..b3c226eb5292 100644 --- a/drivers/soc/fsl/qe/qe.c +++ b/drivers/soc/fsl/qe/qe.c @@ -147,7 +147,7 @@ EXPORT_SYMBOL(qe_issue_cmd); * memory mapped space. * The BRG clock is the QE clock divided by 2. * It was set up long ago during the initial boot phase and is - * is given to us. + * given to us. * Baud rate clocks are zero-based in the driver code (as that maps * to port numbers). Documentation uses 1-based numbering. */ @@ -421,7 +421,7 @@ static void qe_upload_microcode(const void *base, for (i = 0; i < be32_to_cpu(ucode->count); i++) iowrite32be(be32_to_cpu(code[i]), &qe_immr->iram.idata); - + /* Set I-RAM Ready Register */ iowrite32be(QE_IRAM_READY, &qe_immr->iram.iready); } From a222fd8541394b36b13c89d1698d9530afd59a9c Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 30 Dec 2021 09:45:43 +0800 Subject: [PATCH 248/773] soc: fsl: qe: Check of ioremap return value As the possible failure of the ioremap(), the par_io could be NULL. Therefore it should be better to check it and return error in order to guarantee the success of the initiation. But, I also notice that all the caller like mpc85xx_qe_par_io_init() in `arch/powerpc/platforms/85xx/common.c` don't check the return value of the par_io_init(). Actually, par_io_init() needs to check to handle the potential error. I will submit another patch to fix that. Anyway, par_io_init() itsely should be fixed. Fixes: 7aa1aa6ecec2 ("QE: Move QE from arch/powerpc to drivers/soc") Signed-off-by: Jiasheng Jiang Signed-off-by: Li Yang --- drivers/soc/fsl/qe/qe_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/soc/fsl/qe/qe_io.c b/drivers/soc/fsl/qe/qe_io.c index e277c827bdf3..a5e2d0e5ab51 100644 --- a/drivers/soc/fsl/qe/qe_io.c +++ b/drivers/soc/fsl/qe/qe_io.c @@ -35,6 +35,8 @@ int par_io_init(struct device_node *np) if (ret) return ret; par_io = ioremap(res.start, resource_size(&res)); + if (!par_io) + return -ENOMEM; if (!of_property_read_u32(np, "num-ports", &num_ports)) num_par_io_ports = num_ports; From 64fd52a4d3ce63a327948cefc8e4e5c7ef35e813 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 11 Feb 2022 09:23:45 +0000 Subject: [PATCH 249/773] pinctrl: starfive: Use a static name for the GPIO irq_chip Drop the device name used for the GPIO irq_chip and replace it with something static. The information is still available from debugfs and carried as part of the irqdomain. Suggested-by: Emil Renner Berthing Signed-off-by: Marc Zyngier Cc: Linus Walleij Cc: Bartosz Golaszewski Link: https://lore.kernel.org/r/20220211092345.1093332-1-maz@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-starfive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-starfive.c b/drivers/pinctrl/pinctrl-starfive.c index 0b912152a405..266da41a6162 100644 --- a/drivers/pinctrl/pinctrl-starfive.c +++ b/drivers/pinctrl/pinctrl-starfive.c @@ -1164,6 +1164,7 @@ static int starfive_irq_set_type(struct irq_data *d, unsigned int trigger) } static struct irq_chip starfive_irq_chip = { + .name = "StarFive GPIO", .irq_ack = starfive_irq_ack, .irq_mask = starfive_irq_mask, .irq_mask_ack = starfive_irq_mask_ack, @@ -1308,7 +1309,6 @@ static int starfive_probe(struct platform_device *pdev) sfp->gc.ngpio = NR_GPIOS; starfive_irq_chip.parent_device = dev; - starfive_irq_chip.name = sfp->gc.label; sfp->gc.irq.chip = &starfive_irq_chip; sfp->gc.irq.parent_handler = starfive_gpio_irq_handler; From d1e972ace42390de739cde87d96043dcbe502286 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 11 Feb 2022 09:39:04 +0000 Subject: [PATCH 250/773] gpio: tegra186: Fix chip_data type confusion The tegra186 GPIO driver makes the assumption that the pointer returned by irq_data_get_irq_chip_data() is a pointer to a tegra_gpio structure. Unfortunately, it is actually a pointer to the inner gpio_chip structure, as mandated by the gpiolib infrastructure. Nice try. The saving grace is that the gpio_chip is the first member of tegra_gpio, so the bug has gone undetected since... forever. Fix it by performing a container_of() on the pointer. This results in no additional code, and makes it possible to understand how the whole thing works. Fixes: 5b2b135a87fc ("gpio: Add Tegra186 support") Signed-off-by: Marc Zyngier Cc: Thierry Reding Cc: Linus Walleij Cc: Bartosz Golaszewski Link: https://lore.kernel.org/r/20220211093904.1112679-1-maz@kernel.org Signed-off-by: Linus Walleij --- drivers/gpio/gpio-tegra186.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index 34b36a8c035f..8d298beffd86 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -343,9 +343,12 @@ static int tegra186_gpio_of_xlate(struct gpio_chip *chip, return offset + pin; } +#define to_tegra_gpio(x) container_of((x), struct tegra_gpio, gpio) + static void tegra186_irq_ack(struct irq_data *data) { - struct tegra_gpio *gpio = irq_data_get_irq_chip_data(data); + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct tegra_gpio *gpio = to_tegra_gpio(gc); void __iomem *base; base = tegra186_gpio_get_base(gpio, data->hwirq); @@ -357,7 +360,8 @@ static void tegra186_irq_ack(struct irq_data *data) static void tegra186_irq_mask(struct irq_data *data) { - struct tegra_gpio *gpio = irq_data_get_irq_chip_data(data); + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct tegra_gpio *gpio = to_tegra_gpio(gc); void __iomem *base; u32 value; @@ -372,7 +376,8 @@ static void tegra186_irq_mask(struct irq_data *data) static void tegra186_irq_unmask(struct irq_data *data) { - struct tegra_gpio *gpio = irq_data_get_irq_chip_data(data); + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct tegra_gpio *gpio = to_tegra_gpio(gc); void __iomem *base; u32 value; @@ -387,7 +392,8 @@ static void tegra186_irq_unmask(struct irq_data *data) static int tegra186_irq_set_type(struct irq_data *data, unsigned int type) { - struct tegra_gpio *gpio = irq_data_get_irq_chip_data(data); + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + struct tegra_gpio *gpio = to_tegra_gpio(gc); void __iomem *base; u32 value; From 5a2aba71cd2610d3ed08867a1b1bf617cd8f89b8 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Mon, 14 Feb 2022 17:18:52 -0600 Subject: [PATCH 251/773] net: mvpp2: always set port pcs ops Booting a MACCHIATObin with 5.17, the system OOPs with a null pointer deref when the network is started. This is caused by the pcs->ops structure being null in mcpp2_acpi_start() when it tries to call pcs_config(). Hoisting the code which sets pcs_gmac.ops and pcs_xlg.ops, assuring they are always set, fixes the problem. The OOPs looks like: [ 18.687760] Unable to handle kernel access to user memory outside uaccess routines at virtual address 0000000000000010 [ 18.698561] Mem abort info: [ 18.698564] ESR = 0x96000004 [ 18.698567] EC = 0x25: DABT (current EL), IL = 32 bits [ 18.709821] SET = 0, FnV = 0 [ 18.714292] EA = 0, S1PTW = 0 [ 18.718833] FSC = 0x04: level 0 translation fault [ 18.725126] Data abort info: [ 18.729408] ISV = 0, ISS = 0x00000004 [ 18.734655] CM = 0, WnR = 0 [ 18.738933] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000111bbf000 [ 18.745409] [0000000000000010] pgd=0000000000000000, p4d=0000000000000000 [ 18.752235] Internal error: Oops: 96000004 [#1] SMP [ 18.757134] Modules linked in: rfkill ip_set nf_tables nfnetlink qrtr sunrpc vfat fat omap_rng fuse zram xfs crct10dif_ce mvpp2 ghash_ce sbsa_gwdt phylink xhci_plat_hcd ahci_plam [ 18.773481] CPU: 0 PID: 681 Comm: NetworkManager Not tainted 5.17.0-0.rc3.89.fc36.aarch64 #1 [ 18.781954] Hardware name: Marvell Armada 7k/8k Family Board /Armada 7k/8k Family Board , BIOS EDK II Jun 4 2019 [ 18.795222] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 18.802213] pc : mvpp2_start_dev+0x2b0/0x300 [mvpp2] [ 18.807208] lr : mvpp2_start_dev+0x298/0x300 [mvpp2] [ 18.812197] sp : ffff80000b4732c0 [ 18.815522] x29: ffff80000b4732c0 x28: 0000000000000000 x27: ffffccab38ae57f8 [ 18.822689] x26: ffff6eeb03065a10 x25: ffff80000b473a30 x24: ffff80000b4735b8 [ 18.829855] x23: 0000000000000000 x22: 00000000000001e0 x21: ffff6eeb07b6ab68 [ 18.837021] x20: ffff6eeb07b6ab30 x19: ffff6eeb07b6a9c0 x18: 0000000000000014 [ 18.844187] x17: 00000000f6232bfe x16: ffffccab899b1dc0 x15: 000000006a30f9fa [ 18.851353] x14: 000000003b77bd50 x13: 000006dc896f0e8e x12: 001bbbfccfd0d3a2 [ 18.858519] x11: 0000000000001528 x10: 0000000000001548 x9 : ffffccab38ad0fb0 [ 18.865685] x8 : ffff80000b473330 x7 : 0000000000000000 x6 : 0000000000000000 [ 18.872851] x5 : 0000000000000000 x4 : 0000000000000000 x3 : ffff80000b4732f8 [ 18.880017] x2 : 000000000000001a x1 : 0000000000000002 x0 : ffff6eeb07b6ab68 [ 18.887183] Call trace: [ 18.889637] mvpp2_start_dev+0x2b0/0x300 [mvpp2] [ 18.894279] mvpp2_open+0x134/0x2b4 [mvpp2] [ 18.898483] __dev_open+0x128/0x1e4 [ 18.901988] __dev_change_flags+0x17c/0x1d0 [ 18.906187] dev_change_flags+0x30/0x70 [ 18.910038] do_setlink+0x278/0xa7c [ 18.913540] __rtnl_newlink+0x44c/0x7d0 [ 18.917391] rtnl_newlink+0x5c/0x8c [ 18.920892] rtnetlink_rcv_msg+0x254/0x314 [ 18.925006] netlink_rcv_skb+0x48/0x10c [ 18.928858] rtnetlink_rcv+0x24/0x30 [ 18.932449] netlink_unicast+0x290/0x2f4 [ 18.936386] netlink_sendmsg+0x1d0/0x41c [ 18.940323] sock_sendmsg+0x60/0x70 [ 18.943825] ____sys_sendmsg+0x248/0x260 [ 18.947762] ___sys_sendmsg+0x74/0xa0 [ 18.951438] __sys_sendmsg+0x64/0xcc [ 18.955027] __arm64_sys_sendmsg+0x30/0x40 [ 18.959140] invoke_syscall+0x50/0x120 [ 18.962906] el0_svc_common.constprop.0+0x4c/0xf4 [ 18.967629] do_el0_svc+0x30/0x9c [ 18.970958] el0_svc+0x28/0xb0 [ 18.974025] el0t_64_sync_handler+0x10c/0x140 [ 18.978400] el0t_64_sync+0x1a4/0x1a8 [ 18.982078] Code: 52800004 b9416262 aa1503e0 52800041 (f94008a5) [ 18.988196] ---[ end trace 0000000000000000 ]--- Fixes: cff056322372 ("net: mvpp2: use .mac_select_pcs() interface") Suggested-by: Russell King (Oracle) Signed-off-by: Jeremy Linton Reviewed-by: Marcin Wojtas Link: https://lore.kernel.org/r/20220214231852.3331430-1-jeremy.linton@arm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 7cdbf8b8bbf6..1a835b48791b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6870,6 +6870,9 @@ static int mvpp2_port_probe(struct platform_device *pdev, dev->max_mtu = MVPP2_BM_JUMBO_PKT_SIZE; dev->dev.of_node = port_node; + port->pcs_gmac.ops = &mvpp2_phylink_gmac_pcs_ops; + port->pcs_xlg.ops = &mvpp2_phylink_xlg_pcs_ops; + if (!mvpp2_use_acpi_compat_mode(port_fwnode)) { port->phylink_config.dev = &dev->dev; port->phylink_config.type = PHYLINK_NETDEV; @@ -6940,9 +6943,6 @@ static int mvpp2_port_probe(struct platform_device *pdev, port->phylink_config.supported_interfaces); } - port->pcs_gmac.ops = &mvpp2_phylink_gmac_pcs_ops; - port->pcs_xlg.ops = &mvpp2_phylink_xlg_pcs_ops; - phylink = phylink_create(&port->phylink_config, port_fwnode, phy_mode, &mvpp2_phylink_ops); if (IS_ERR(phylink)) { From ba88b5533728c54bdea68431988eff2d9a7a1237 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 18 Feb 2022 01:50:18 -0700 Subject: [PATCH 252/773] MAINTAINERS: rmnet: Update email addresses Switch to the quicinc.com ids. Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Link: https://lore.kernel.org/r/1645174218-32632-1-git-send-email-quic_subashab@quicinc.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6db79f3b209e..b57077b935ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16077,8 +16077,8 @@ F: Documentation/devicetree/bindings/mtd/qcom,nandc.yaml F: drivers/mtd/nand/raw/qcom_nandc.c QUALCOMM RMNET DRIVER -M: Subash Abhinov Kasiviswanathan -M: Sean Tranchetti +M: Subash Abhinov Kasiviswanathan +M: Sean Tranchetti L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst From 3a14d0888eb4b0045884126acc69abfb7b87814d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 18 Feb 2022 14:15:35 +0100 Subject: [PATCH 253/773] nfp: flower: Fix a potential leak in nfp_tunnel_add_shared_mac() ida_simple_get() returns an id between min (0) and max (NFP_MAX_MAC_INDEX) inclusive. So NFP_MAX_MAC_INDEX (0xff) is a valid id. In order for the error handling path to work correctly, the 'invalid' value for 'ida_idx' should not be in the 0..NFP_MAX_MAC_INDEX range, inclusive. So set it to -1. Fixes: 20cce8865098 ("nfp: flower: enable MAC address sharing for offloadable devs") Signed-off-by: Christophe JAILLET Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20220218131535.100258-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index 0a326e04e692..cb43651ea9ba 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -922,8 +922,8 @@ nfp_tunnel_add_shared_mac(struct nfp_app *app, struct net_device *netdev, int port, bool mod) { struct nfp_flower_priv *priv = app->priv; - int ida_idx = NFP_MAX_MAC_INDEX, err; struct nfp_tun_offloaded_mac *entry; + int ida_idx = -1, err; u16 nfp_mac_idx = 0; entry = nfp_tunnel_lookup_offloaded_macs(app, netdev->dev_addr); @@ -997,7 +997,7 @@ err_remove_hash: err_free_entry: kfree(entry); err_free_ida: - if (ida_idx != NFP_MAX_MAC_INDEX) + if (ida_idx != -1) ida_simple_remove(&priv->tun.mac_off_ids, ida_idx); return err; From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: [PATCH 254/773] sched: Fix yet more sched_fork() races Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net --- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 13 ++++++++++++- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index b9198a1b3a84..e84e54d1b490 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..c607d238fc23 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2266,6 +2266,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_put_pidfd; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do @@ -2376,7 +2387,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcf0c180617c..9745613d531c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); @@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* @@ -9446,7 +9454,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: From 2428766e201565a5fa964d7461d9f6608eb04d7d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 18 Feb 2022 11:49:04 +0100 Subject: [PATCH 255/773] MAINTAINERS: remove duplicate entry for i2c-qcom-geni The driver is already covered in the ARM/QUALCOMM section. Also, Akash Asthana's email bounces meanwhile and Mukesh Savaliya has never responded to mails regarding this driver. Signed-off-by: Wolfram Sang Acked-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- MAINTAINERS | 8 -------- 1 file changed, 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..f0485f61295d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15972,14 +15972,6 @@ F: Documentation/devicetree/bindings/misc/qcom,fastrpc.txt F: drivers/misc/fastrpc.c F: include/uapi/misc/fastrpc.h -QUALCOMM GENERIC INTERFACE I2C DRIVER -M: Akash Asthana -M: Mukesh Savaliya -L: linux-i2c@vger.kernel.org -L: linux-arm-msm@vger.kernel.org -S: Supported -F: drivers/i2c/busses/i2c-qcom-geni.c - QUALCOMM HEXAGON ARCHITECTURE M: Brian Cain L: linux-hexagon@vger.kernel.org From 0cd33c5ffec12bd77a1c02db2469fac08f840939 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:38 -0800 Subject: [PATCH 256/773] selftests: mptcp: fix diag instability Instead of waiting for an arbitrary amount of time for the MPTCP MP_CAPABLE handshake to complete, explicitly wait for the relevant socket to enter into the established status. Additionally let the data transfer application use the slowest transfer mode available (-r), to cope with very slow host, or high jitter caused by hosting VMs. Fixes: df62f2ec3df6 ("selftests/mptcp: add diag interface tests") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/258 Reported-and-tested-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/diag.sh | 44 +++++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 2674ba20d524..ff821025d309 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -71,6 +71,36 @@ chk_msk_remote_key_nr() __chk_nr "grep -c remote_key" $* } +# $1: ns, $2: port +wait_local_port_listen() +{ + local listener_ns="${1}" + local port="${2}" + + local port_hex i + + port_hex="$(printf "%04X" "${port}")" + for i in $(seq 10); do + ip netns exec "${listener_ns}" cat /proc/net/tcp | \ + awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" && + break + sleep 0.1 + done +} + +wait_connected() +{ + local listener_ns="${1}" + local port="${2}" + + local port_hex i + + port_hex="$(printf "%04X" "${port}")" + for i in $(seq 10); do + ip netns exec ${listener_ns} grep -q " 0100007F:${port_hex} " /proc/net/tcp && break + sleep 0.1 + done +} trap cleanup EXIT ip netns add $ns @@ -81,15 +111,15 @@ echo "a" | \ ip netns exec $ns \ ./mptcp_connect -p 10000 -l -t ${timeout_poll} \ 0.0.0.0 >/dev/null & -sleep 0.1 +wait_local_port_listen $ns 10000 chk_msk_nr 0 "no msk on netns creation" echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10000 -j -t ${timeout_poll} \ + ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} \ 127.0.0.1 >/dev/null & -sleep 0.1 +wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" @@ -101,13 +131,13 @@ echo "a" | \ ip netns exec $ns \ ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \ 0.0.0.0 >/dev/null & -sleep 0.1 +wait_local_port_listen $ns 10001 echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10001 -j -t ${timeout_poll} \ + ./mptcp_connect -p 10001 -r 0 -t ${timeout_poll} \ 127.0.0.1 >/dev/null & -sleep 0.1 +wait_connected $ns 10001 chk_msk_fallback_nr 1 "check fallback" flush_pids @@ -119,7 +149,7 @@ for I in `seq 1 $NR_CLIENTS`; do ./mptcp_connect -p $((I+10001)) -l -w 10 \ -t ${timeout_poll} 0.0.0.0 >/dev/null & done -sleep 0.1 +wait_local_port_listen $ns $((NR_CLIENTS + 10001)) for I in `seq 1 $NR_CLIENTS`; do echo "b" | \ From 5b31dda736e31c58d1941c7349569c7452eafb6b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:39 -0800 Subject: [PATCH 257/773] selftests: mptcp: improve 'fair usage on close' stability The mentioned test has to wait for a subflow creation failure. The current code looks for TCP sockets in TW state and sometimes misses the relevant event. Switch to a more stable check, looking for the associated mib counter. Fixes: 46e967d187ed ("selftests: mptcp: add tests for subflow creation failure") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/257 Reported-and-tested-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index c0801df15f54..10b3bd805ac6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -961,7 +961,7 @@ wait_for_tw() local ns=$1 while [ $time -lt $timeout_ms ]; do - local cnt=$(ip netns exec $ns ss -t state time-wait |wc -l) + local cnt=$(ip netns exec $ns nstat -as TcpAttemptFails | grep TcpAttemptFails | awk '{print $2}') [ "$cnt" = 1 ] && return 1 time=$((time + 100)) From 98247bc16a27cf8ead4c47ce9f15888be85841fc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:40 -0800 Subject: [PATCH 258/773] mptcp: fix race in overlapping signal events After commit a88c9e496937 ("mptcp: do not block subflows creation on errors"), if a signal address races with a failing subflow creation, the subflow creation failure control path can trigger the selection of the next address to be announced while the current announced is still pending. The above will cause the unintended suppression of the ADD_ADDR announce. Fix the issue skipping the to-be-suppressed announce before it will mark an endpoint as already used. The relevant announce will be triggered again when the current one will complete. Fixes: a88c9e496937 ("mptcp: do not block subflows creation on errors") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 356f596e2032..82f82a513f5b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -546,6 +546,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.add_addr_signaled < add_addr_signal_max) { local = select_signal_address(pernet, msk); + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); From 837cf45df163a3780bc04b555700231e95b31dc9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:41 -0800 Subject: [PATCH 259/773] mptcp: fix race in incoming ADD_ADDR option processing If an MPTCP endpoint received multiple consecutive incoming ADD_ADDR options, mptcp_pm_add_addr_received() can overwrite the current remote address value after the PM lock is released in mptcp_pm_nl_add_addr_received() and before such address is echoed. Fix the issue caching the remote address value a little earlier and always using the cached value after releasing the PM lock. Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 82f82a513f5b..4b5d795383cd 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -660,6 +660,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; + bool reset_port = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); @@ -669,15 +670,19 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); - if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) + remote = msk->pm.remote; + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) goto add_addr_echo; + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) { + reset_port = true; + remote.port = sk->sk_dport; + } + /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ - remote = msk->pm.remote; - if (!remote.port) - remote.port = sk->sk_dport; nr = fill_local_addresses_vec(msk, addrs); msk->pm.add_addr_accepted++; @@ -690,8 +695,12 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); + /* be sure to echo exactly the received address */ + if (reset_port) + remote.port = 0; + add_addr_echo: - mptcp_pm_announce_addr(msk, &msk->pm.remote, true); + mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); } From f73c1194634506ab60af0debef04671fc431a435 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:42 -0800 Subject: [PATCH 260/773] mptcp: add mibs counter for ignored incoming options The MPTCP in kernel path manager has some constraints on incoming addresses announce processing, so that in edge scenarios it can end-up dropping (ignoring) some of such announces. The above is not very limiting in practice since such scenarios are very uncommon and MPTCP will recover due to ADD_ADDR retransmissions. This patch adds a few MIB counters to account for such drop events to allow easier introspection of the critical scenarios. Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/mib.c | 2 ++ net/mptcp/mib.h | 2 ++ net/mptcp/pm.c | 8 ++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3240b72271a7..7558802a1435 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -35,12 +35,14 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), + SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), + SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index ecd3d8b117e0..2966fcb6548b 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -28,12 +28,14 @@ enum linux_mptcp_mib_field { MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ + MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ + MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 696b2c4613a7..7bea318ac5f2 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -213,6 +213,8 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); @@ -253,8 +255,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); - mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_list_rx = *rm_list; + if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) + pm->rm_list_rx = *rm_list; + else + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } From 6ef84b1517e08f6c2fc105b798a9d21bf4caa6cb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:43 -0800 Subject: [PATCH 261/773] selftests: mptcp: more robust signal race test The in kernel MPTCP PM implementation can process a single incoming add address option at any given time. In the mentioned test the server can surpass such limit. Let the setup cope with that allowing a faster add_addr retransmission. Fixes: a88c9e496937 ("mptcp: do not block subflows creation on errors") Fixes: f7efc7771eac ("mptcp: drop argument port from mptcp_pm_announce_addr") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/254 Reported-and-tested-by: Matthieu Baerts Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 10b3bd805ac6..0d6a71e7bb59 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -752,11 +752,17 @@ chk_add_nr() local mis_ack_nr=${8:-0} local count local dump_stats + local timeout + + timeout=`ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout` printf "%-39s %s" " " "add" - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtAddAddr | awk '{print $2}'` + count=`ip netns exec $ns2 nstat -as MPTcpExtAddAddr | grep MPTcpExtAddAddr | awk '{print $2}'` [ -z "$count" ] && count=0 - if [ "$count" != "$add_nr" ]; then + + # if the test configured a short timeout tolerate greater then expected + # add addrs options, due to retransmissions + if [ "$count" != "$add_nr" ] && [ "$timeout" -gt 1 -o "$count" -lt "$add_nr" ]; then echo "[fail] got $count ADD_ADDR[s] expected $add_nr" ret=1 dump_stats=1 @@ -1158,7 +1164,10 @@ signal_address_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal - run_tests $ns1 $ns2 10.0.1.1 + + # the peer could possibly miss some addr notification, allow retransmission + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow chk_join_nr "signal addresses race test" 3 3 3 # the server will not signal the address terminating From e35f885b357d47e04380a2056d1b2cc3e6f4f24b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Feb 2022 13:35:44 -0800 Subject: [PATCH 262/773] selftests: mptcp: be more conservative with cookie MPJ limits Since commit 2843ff6f36db ("mptcp: remote addresses fullmesh"), an MPTCP client can attempt creating multiple MPJ subflow simultaneusly. In such scenario the server, when syncookies are enabled, could end-up accepting incoming MPJ syn even above the configured subflow limit, as the such limit can be enforced in a reliable way only after the subflow creation. In case of syncookie, only after the 3rd ack reception. As a consequence the related self-tests case sporadically fails, as it verify that the server always accept the expected number of MPJ syn. Address the issues relaxing the MPJ syn number constrain. Note that the check on the accepted number of MPJ 3rd ack still remains intact. Fixes: 2843ff6f36db ("mptcp: remote addresses fullmesh") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 0d6a71e7bb59..0c8a2a20b96c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -660,6 +660,7 @@ chk_join_nr() local ack_nr=$4 local count local dump_stats + local with_cookie printf "%02u %-36s %s" "$TEST_COUNT" "$msg" "syn" count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'` @@ -673,12 +674,20 @@ chk_join_nr() fi echo -n " - synack" + with_cookie=`ip netns exec $ns2 sysctl -n net.ipv4.tcp_syncookies` count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'` [ -z "$count" ] && count=0 if [ "$count" != "$syn_ack_nr" ]; then - echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" - ret=1 - dump_stats=1 + # simult connections exceeding the limit with cookie enabled could go up to + # synack validation as the conn limit can be enforced reliably only after + # the subflow creation + if [ "$with_cookie" = 2 ] && [ "$count" -gt "$syn_ack_nr" ] && [ "$count" -le "$syn_nr" ]; then + echo -n "[ ok ]" + else + echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" + ret=1 + dump_stats=1 + fi else echo -n "[ ok ]" fi From 5486f5bf790b5c664913076c3194b8f916a5c7ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Feb 2022 14:35:49 +0100 Subject: [PATCH 263/773] net: Force inlining of checksum functions in net/checksum.h All functions defined as static inline in net/checksum.h are meant to be inlined for performance reason. But since commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") the compiler is allowed to uninline functions when it wants. Fair enough in the general case, but for tiny performance critical checksum helpers that's counter-productive. The problem mainly arises when selecting CONFIG_CC_OPTIMISE_FOR_SIZE, Those helpers being 'static inline' in header files you suddenly find them duplicated many times in the resulting vmlinux. Here is a typical exemple when building powerpc pmac32_defconfig with CONFIG_CC_OPTIMISE_FOR_SIZE. csum_sub() appears 4 times: c04a23cc : c04a23cc: 7c 84 20 f8 not r4,r4 c04a23d0: 7c 63 20 14 addc r3,r3,r4 c04a23d4: 7c 63 01 94 addze r3,r3 c04a23d8: 4e 80 00 20 blr ... c04a2ce8: 4b ff f6 e5 bl c04a23cc ... c04a2d2c: 4b ff f6 a1 bl c04a23cc ... c04a2d54: 4b ff f6 79 bl c04a23cc ... c04a754c : c04a754c: 7c 84 20 f8 not r4,r4 c04a7550: 7c 63 20 14 addc r3,r3,r4 c04a7554: 7c 63 01 94 addze r3,r3 c04a7558: 4e 80 00 20 blr ... c04ac930: 4b ff ac 1d bl c04a754c ... c04ad264: 4b ff a2 e9 bl c04a754c ... c04e3b08 : c04e3b08: 7c 84 20 f8 not r4,r4 c04e3b0c: 7c 63 20 14 addc r3,r3,r4 c04e3b10: 7c 63 01 94 addze r3,r3 c04e3b14: 4e 80 00 20 blr ... c04e5788: 4b ff e3 81 bl c04e3b08 ... c04e65c8: 4b ff d5 41 bl c04e3b08 ... c0512d34 : c0512d34: 7c 84 20 f8 not r4,r4 c0512d38: 7c 63 20 14 addc r3,r3,r4 c0512d3c: 7c 63 01 94 addze r3,r3 c0512d40: 4e 80 00 20 blr ... c0512dfc: 4b ff ff 39 bl c0512d34 ... c05138bc: 4b ff f4 79 bl c0512d34 ... Restore the expected behaviour by using __always_inline for all functions defined in net/checksum.h vmlinux size is even reduced by 256 bytes with this patch: text data bss dec hex filename 6980022 2515362 194384 9689768 93daa8 vmlinux.before 6979862 2515266 194384 9689512 93d9a8 vmlinux.now Fixes: ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Andrew Morton Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- include/net/checksum.h | 47 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/include/net/checksum.h b/include/net/checksum.h index 5218041e5c8f..02d0c2d01014 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -22,7 +22,7 @@ #include #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER -static inline +static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { @@ -33,7 +33,7 @@ __wsum csum_and_copy_from_user (const void __user *src, void *dst, #endif #ifndef HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user +static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); @@ -45,7 +45,7 @@ static __inline__ __wsum csum_and_copy_to_user #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY -static inline __wsum +static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); @@ -54,7 +54,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len) #endif #ifndef HAVE_ARCH_CSUM_ADD -static inline __wsum csum_add(__wsum csum, __wsum addend) +static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; @@ -62,12 +62,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend) } #endif -static inline __wsum csum_sub(__wsum csum, __wsum addend) +static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; @@ -75,12 +75,12 @@ static inline __sum16 csum16_add(__sum16 csum, __be16 addend) return (__force __sum16)(res + (res < (__force u16)addend)); } -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } -static inline __wsum csum_shift(__wsum sum, int offset) +static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) @@ -88,42 +88,43 @@ static inline __wsum csum_shift(__wsum sum, int offset) return sum; } -static inline __wsum +static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } -static inline __wsum +static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } -static inline __wsum +static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } -static inline __wsum csum_unfold(__sum16 n) +static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } -static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) +static __always_inline +__wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) -static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) +static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); @@ -136,7 +137,7 @@ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) * m : old value of a 16bit field * m' : new value of a 16bit field */ -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) +static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } @@ -150,16 +151,16 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); -static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, - __be16 from, __be16 to, - bool pseudohdr) +static __always_inline +void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, + __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } -static inline __wsum remcsum_adjust(void *ptr, __wsum csum, - int start, int offset) +static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, + int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; @@ -175,12 +176,12 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } -static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } -static inline __wsum wsum_negate(__wsum val) +static __always_inline __wsum wsum_negate(__wsum val) { return (__force __wsum)-((__force u32)val); } From 3d00827a90db6f79abc7cdc553887f89a2e0a184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Svenning=20S=C3=B8rensen?= Date: Fri, 18 Feb 2022 11:27:01 +0000 Subject: [PATCH 264/773] net: dsa: microchip: fix bridging with more than two member ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b3612ccdf284 ("net: dsa: microchip: implement multi-bridge support") plugged a packet leak between ports that were members of different bridges. Unfortunately, this broke another use case, namely that of more than two ports that are members of the same bridge. After that commit, when a port is added to a bridge, hardware bridging between other member ports of that bridge will be cleared, preventing packet exchange between them. Fix by ensuring that the Port VLAN Membership bitmap includes any existing ports in the bridge, not just the port being added. Fixes: b3612ccdf284 ("net: dsa: microchip: implement multi-bridge support") Signed-off-by: Svenning Sørensen Tested-by: Oleksij Rempel Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz_common.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 55dbda04ea62..243f8ad6d06e 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -26,7 +26,7 @@ void ksz_update_port_member(struct ksz_device *dev, int port) struct dsa_switch *ds = dev->ds; u8 port_member = 0, cpu_port; const struct dsa_port *dp; - int i; + int i, j; if (!dsa_is_user_port(ds, port)) return; @@ -45,13 +45,33 @@ void ksz_update_port_member(struct ksz_device *dev, int port) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; + if (other_p->stp_state != BR_STATE_FORWARDING) + continue; - if (other_p->stp_state == BR_STATE_FORWARDING && - p->stp_state == BR_STATE_FORWARDING) { + if (p->stp_state == BR_STATE_FORWARDING) { val |= BIT(port); port_member |= BIT(i); } + /* Retain port [i]'s relationship to other ports than [port] */ + for (j = 0; j < ds->num_ports; j++) { + const struct dsa_port *third_dp; + struct ksz_port *third_p; + + if (j == i) + continue; + if (j == port) + continue; + if (!dsa_is_user_port(ds, j)) + continue; + third_p = &dev->ports[j]; + if (third_p->stp_state != BR_STATE_FORWARDING) + continue; + third_dp = dsa_to_port(ds, j); + if (dsa_port_bridge_same(other_dp, third_dp)) + val |= BIT(j); + } + dev->dev_ops->cfg_port_member(dev, i, val | cpu_port); } From 8940e6b669ca1196ce0a0549c819078096390f76 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 18 Feb 2022 14:13:02 +0200 Subject: [PATCH 265/773] net: dsa: avoid call to __dev_set_promiscuity() while rtnl_mutex isn't held If the DSA master doesn't support IFF_UNICAST_FLT, then the following call path is possible: dsa_slave_switchdev_event_work -> dsa_port_host_fdb_add -> dev_uc_add -> __dev_set_rx_mode -> __dev_set_promiscuity Since the blamed commit, dsa_slave_switchdev_event_work() no longer holds rtnl_lock(), which triggers the ASSERT_RTNL() from __dev_set_promiscuity(). Taking rtnl_lock() around dev_uc_add() is impossible, because all the code paths that call dsa_flush_workqueue() do so from contexts where the rtnl_mutex is already held - so this would lead to an instant deadlock. dev_uc_add() in itself doesn't require the rtnl_mutex for protection. There is this comment in __dev_set_rx_mode() which assumes so: /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ but it is from commit 4417da668c00 ("[NET]: dev: secondary unicast address support") dated June 2007, and in the meantime, commit f1f28aa3510d ("netdev: Add addr_list_lock to struct net_device."), dated July 2008, has added &dev->addr_list_lock to protect this instead of the global rtnl_mutex. Nonetheless, __dev_set_promiscuity() does assume rtnl_mutex protection, but it is the uncommon path of what we typically expect dev_uc_add() to do. So since only the uncommon path requires rtnl_lock(), just check ahead of time whether dev_uc_add() would result into a call to __dev_set_promiscuity(), and handle that condition separately. DSA already configures the master interface to be promiscuous if the tagger requires this. We can extend this to also cover the case where the master doesn't handle dev_uc_add() (doesn't support IFF_UNICAST_FLT), and on the premise that we'd end up making it promiscuous during operation anyway, either if a DSA slave has a non-inherited MAC address, or if the bridge notifies local FDB entries for its own MAC address, the address of a station learned on a foreign port, etc. Fixes: 0faf890fc519 ("net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work") Reported-by: Oleksij Rempel Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/master.c | 7 ++++++- net/dsa/port.c | 20 ++++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/net/dsa/master.c b/net/dsa/master.c index 2199104ca7df..880f910b23a9 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -260,11 +260,16 @@ static void dsa_netdev_ops_set(struct net_device *dev, dev->dsa_ptr->netdev_ops = ops; } +/* Keep the master always promiscuous if the tagging protocol requires that + * (garbles MAC DA) or if it doesn't support unicast filtering, case in which + * it would revert to promiscuous mode as soon as we call dev_uc_add() on it + * anyway. + */ static void dsa_master_set_promiscuity(struct net_device *dev, int inc) { const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops; - if (!ops->promisc_on_master) + if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_master) return; ASSERT_RTNL(); diff --git a/net/dsa/port.c b/net/dsa/port.c index bd78192e0e47..eef4a98f2628 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -781,9 +781,15 @@ int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, struct dsa_port *cpu_dp = dp->cpu_dp; int err; - err = dev_uc_add(cpu_dp->master, addr); - if (err) - return err; + /* Avoid a call to __dev_set_promiscuity() on the master, which + * requires rtnl_lock(), since we can't guarantee that is held here, + * and we can't take it either. + */ + if (cpu_dp->master->priv_flags & IFF_UNICAST_FLT) { + err = dev_uc_add(cpu_dp->master, addr); + if (err) + return err; + } return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info); } @@ -800,9 +806,11 @@ int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, struct dsa_port *cpu_dp = dp->cpu_dp; int err; - err = dev_uc_del(cpu_dp->master, addr); - if (err) - return err; + if (cpu_dp->master->priv_flags & IFF_UNICAST_FLT) { + err = dev_uc_del(cpu_dp->master, addr); + if (err) + return err; + } return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info); } From b1a5983f56e371046dcf164f90bfaf704d2b89f6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Feb 2022 23:41:20 +0100 Subject: [PATCH 266/773] netfilter: nf_tables_offload: incorrect flow offload action array size immediate verdict expression needs to allocate one slot in the flow offload action array, however, immediate data expression does not need to do so. fwd and dup expression need to allocate one slot, this is missing. Add a new offload_action interface to report if this expression needs to allocate one slot in the flow offload action array. Fixes: be2861dc36d7 ("netfilter: nft_{fwd,dup}_netdev: add offload support") Reported-and-tested-by: Nick Gregory Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_offload.h | 2 -- net/netfilter/nf_tables_offload.c | 3 ++- net/netfilter/nft_dup_netdev.c | 6 ++++++ net/netfilter/nft_fwd_netdev.c | 6 ++++++ net/netfilter/nft_immediate.c | 12 +++++++++++- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eaf55da9a205..c4c0861deac1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -905,9 +905,9 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); + bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); - u32 offload_flags; const struct nft_expr_type *type; void *data; }; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index f9d95ff82df8..797147843958 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,8 +67,6 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) - void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9656c1646222..2d36952b1392 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -94,7 +94,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { - if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + if (expr->ops->offload_action && + expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index bbf3fcba3df4..5b5c607fbf83 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } +static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -75,6 +80,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, .offload = nft_dup_netdev_offload, + .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index fa9301ca6033..619e394a91de 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -79,6 +79,11 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } +static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + struct nft_fwd_neigh { u8 sreg_dev; u8 sreg_addr; @@ -222,6 +227,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, + .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 90c64d27ae53..d0f67d325bdf 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -213,6 +213,16 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_immediate_offload_action(const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return true; + + return false; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -224,7 +234,7 @@ static const struct nft_expr_ops nft_imm_ops = { .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, - .offload_flags = NFT_OFFLOAD_F_ACTION, + .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { From e23e40fd6de5c1c94793bc4147e8f34387d58576 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 5 Feb 2022 01:58:04 +0100 Subject: [PATCH 267/773] hwmon: (ntc_thermistor) Underscore Samsung thermistor The sysfs does not like that we name the thermistor something that contains a dash: ntc-thermistor thermistor: hwmon: 'ssg1404-001221' is not a valid name attribute, please fix Fix it up by switching to an underscore. Fixes: e13e979b2b3d ("hwmon: (ntc_thermistor) Add Samsung 1404-001221 NTC") Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220205005804.123245-1-linus.walleij@linaro.org Signed-off-by: Guenter Roeck --- drivers/hwmon/ntc_thermistor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/ntc_thermistor.c b/drivers/hwmon/ntc_thermistor.c index 414204f5704c..9c9e9f4ccb9e 100644 --- a/drivers/hwmon/ntc_thermistor.c +++ b/drivers/hwmon/ntc_thermistor.c @@ -59,7 +59,7 @@ static const struct platform_device_id ntc_thermistor_id[] = { [NTC_NCP15XH103] = { "ncp15xh103", TYPE_NCPXXXH103 }, [NTC_NCP18WB473] = { "ncp18wb473", TYPE_NCPXXWB473 }, [NTC_NCP21WB473] = { "ncp21wb473", TYPE_NCPXXWB473 }, - [NTC_SSG1404001221] = { "ssg1404-001221", TYPE_NCPXXWB473 }, + [NTC_SSG1404001221] = { "ssg1404_001221", TYPE_NCPXXWB473 }, [NTC_LAST] = { }, }; From c94afc46cae7ad41b2ad6a99368147879f4b0e56 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 17 Feb 2022 22:53:27 +0800 Subject: [PATCH 268/773] memblock: use kfree() to release kmalloced memblock regions memblock.{reserved,memory}.regions may be allocated using kmalloc() in memblock_double_array(). Use kfree() to release these kmalloced regions indicated by memblock_{reserved,memory}_in_slab. Signed-off-by: Miaohe Lin Fixes: 3010f876500f ("mm: discard memblock data later") Signed-off-by: Mike Rapoport --- mm/memblock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 1018e50566f3..b12a364f2766 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -366,14 +366,20 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - memblock_free_late(addr, size); + if (memblock_reserved_in_slab) + kfree(memblock.reserved.regions); + else + memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); - memblock_free_late(addr, size); + if (memblock_memory_in_slab) + kfree(memblock.memory.regions); + else + memblock_free_late(addr, size); } memblock_memory = NULL; From 84d3c83e6ea7d46cf3de3a54578af73eb24a64f2 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Sun, 20 Feb 2022 04:05:47 -0500 Subject: [PATCH 269/773] bnxt_en: Fix active FEC reporting to ethtool ethtool --show-fec does not show anything when the Active FEC setting in the chip is set to None. Fix it to properly return ETHTOOL_FEC_OFF in that case. Fixes: 8b2775890ad8 ("bnxt_en: Report FEC settings to ethtool.") Signed-off-by: Somnath Kotur Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 003330e8cd58..e195f4a669d8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1969,6 +1969,9 @@ static int bnxt_get_fecparam(struct net_device *dev, case PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_IEEE_ACTIVE: fec->active_fec |= ETHTOOL_FEC_LLRS; break; + case PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_NONE_ACTIVE: + fec->active_fec |= ETHTOOL_FEC_OFF; + break; } return 0; } From 6758f937669dba14c6aac7ca004edda42ec1b18d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 20 Feb 2022 04:05:48 -0500 Subject: [PATCH 270/773] bnxt_en: Fix offline ethtool selftest with RDMA enabled For offline (destructive) self tests, we need to stop the RDMA driver first. Otherwise, the RDMA driver will run into unrecoverable errors when destructive firmware tests are being performed. The irq_re_init parameter used in the half close and half open sequence when preparing the NIC for offline tests should be set to true because the RDMA driver will free all IRQs before the offline tests begin. Fixes: 55fd0cf320c3 ("bnxt_en: Add external loopback test to ethtool selftest.") Reviewed-by: Edwin Peer Reviewed-by: Ben Li Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 +++++----- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 12 +++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4f94136a011a..23bbb1c5812d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10330,12 +10330,12 @@ int bnxt_half_open_nic(struct bnxt *bp) goto half_open_err; } - rc = bnxt_alloc_mem(bp, false); + rc = bnxt_alloc_mem(bp, true); if (rc) { netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); goto half_open_err; } - rc = bnxt_init_nic(bp, false); + rc = bnxt_init_nic(bp, true); if (rc) { netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); goto half_open_err; @@ -10344,7 +10344,7 @@ int bnxt_half_open_nic(struct bnxt *bp) half_open_err: bnxt_free_skbs(bp); - bnxt_free_mem(bp, false); + bnxt_free_mem(bp, true); dev_close(bp->dev); return rc; } @@ -10354,9 +10354,9 @@ half_open_err: */ void bnxt_half_close_nic(struct bnxt *bp) { - bnxt_hwrm_resource_free(bp, false, false); + bnxt_hwrm_resource_free(bp, false, true); bnxt_free_skbs(bp); - bnxt_free_mem(bp, false); + bnxt_free_mem(bp, true); } void bnxt_reenable_sriov(struct bnxt *bp) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index e195f4a669d8..a85b18858b32 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -25,6 +25,7 @@ #include "bnxt_hsi.h" #include "bnxt.h" #include "bnxt_hwrm.h" +#include "bnxt_ulp.h" #include "bnxt_xdp.h" #include "bnxt_ptp.h" #include "bnxt_ethtool.h" @@ -3551,9 +3552,12 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, if (!offline) { bnxt_run_fw_tests(bp, test_mask, &test_results); } else { - rc = bnxt_close_nic(bp, false, false); - if (rc) + bnxt_ulp_stop(bp); + rc = bnxt_close_nic(bp, true, false); + if (rc) { + bnxt_ulp_start(bp, rc); return; + } bnxt_run_fw_tests(bp, test_mask, &test_results); buf[BNXT_MACLPBK_TEST_IDX] = 1; @@ -3563,6 +3567,7 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, if (rc) { bnxt_hwrm_mac_loopback(bp, false); etest->flags |= ETH_TEST_FL_FAILED; + bnxt_ulp_start(bp, rc); return; } if (bnxt_run_loopback(bp)) @@ -3588,7 +3593,8 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, } bnxt_hwrm_phy_loopback(bp, false, false); bnxt_half_close_nic(bp); - rc = bnxt_open_nic(bp, false, true); + rc = bnxt_open_nic(bp, true, true); + bnxt_ulp_start(bp, rc); } if (rc || bnxt_test_irq(bp)) { buf[BNXT_IRQ_TEST_IDX] = 1; From cfcab3b3b61584a02bb523ffa99564eafa761dfe Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 20 Feb 2022 04:05:49 -0500 Subject: [PATCH 271/773] bnxt_en: Fix occasional ethtool -t loopback test failures In the current code, we setup the port to PHY or MAC loopback mode and then transmit a test broadcast packet for the loopback test. This scheme fails sometime if the port is shared with management firmware that can also send packets. The driver may receive the management firmware's packet and the test will fail when the contents don't match the test packet. Change the test packet to use it's own MAC address as the destination and setup the port to only receive it's own MAC address. This should filter out other packets sent by management firmware. Fixes: 91725d89b97a ("bnxt_en: Add PHY loopback to ethtool self-test.") Reviewed-by: Pavan Chebbi Reviewed-by: Edwin Peer Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 +++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 23bbb1c5812d..785436f6dd24 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8639,6 +8639,9 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) vnic->uc_filter_count = 1; vnic->rx_mask = 0; + if (test_bit(BNXT_STATE_HALF_OPEN, &bp->state)) + goto skip_rx_mask; + if (bp->dev->flags & IFF_BROADCAST) vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_BCAST; @@ -8659,6 +8662,7 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) if (rc) goto err_out; +skip_rx_mask: rc = bnxt_hwrm_set_coal(bp); if (rc) netdev_warn(bp->dev, "HWRM set coalescing failure rc: %x\n", @@ -10335,8 +10339,10 @@ int bnxt_half_open_nic(struct bnxt *bp) netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); goto half_open_err; } + set_bit(BNXT_STATE_HALF_OPEN, &bp->state); rc = bnxt_init_nic(bp, true); if (rc) { + clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); goto half_open_err; } @@ -10357,6 +10363,7 @@ void bnxt_half_close_nic(struct bnxt *bp) bnxt_hwrm_resource_free(bp, false, true); bnxt_free_skbs(bp); bnxt_free_mem(bp, true); + clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); } void bnxt_reenable_sriov(struct bnxt *bp) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 440dfeb4948b..666fc1e7a7d2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1921,6 +1921,7 @@ struct bnxt { #define BNXT_STATE_RECOVER 12 #define BNXT_STATE_FW_NON_FATAL_COND 13 #define BNXT_STATE_FW_ACTIVATE_RESET 14 +#define BNXT_STATE_HALF_OPEN 15 /* For offline ethtool tests */ #define BNXT_NO_FW_ACCESS(bp) \ (test_bit(BNXT_STATE_FW_FATAL_COND, &(bp)->state) || \ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index a85b18858b32..8aaa2335f848 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3458,7 +3458,7 @@ static int bnxt_run_loopback(struct bnxt *bp) if (!skb) return -ENOMEM; data = skb_put(skb, pkt_size); - eth_broadcast_addr(data); + ether_addr_copy(&data[i], bp->dev->dev_addr); i += ETH_ALEN; ether_addr_copy(&data[i], bp->dev->dev_addr); i += ETH_ALEN; From 8cdb15924252e27af16c4a8fe0fc606ce5fd04dc Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Sun, 20 Feb 2022 04:05:50 -0500 Subject: [PATCH 272/773] bnxt_en: Fix incorrect multicast rx mask setting when not requested We should setup multicast only when net_device flags explicitly has IFF_MULTICAST set. Otherwise we will incorrectly turn it on even when not asked. Fix it by only passing the multicast table to the firmware if IFF_MULTICAST is set. Fixes: 7d2837dd7a32 ("bnxt_en: Setup multicast properly after resetting device.") Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 785436f6dd24..fc5b1d816bdb 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4747,8 +4747,10 @@ static int bnxt_hwrm_cfa_l2_set_rx_mask(struct bnxt *bp, u16 vnic_id) return rc; req->vnic_id = cpu_to_le32(vnic->fw_vnic_id); - req->num_mc_entries = cpu_to_le32(vnic->mc_list_count); - req->mc_tbl_addr = cpu_to_le64(vnic->mc_list_mapping); + if (vnic->rx_mask & CFA_L2_SET_RX_MASK_REQ_MASK_MCAST) { + req->num_mc_entries = cpu_to_le32(vnic->mc_list_count); + req->mc_tbl_addr = cpu_to_le64(vnic->mc_list_mapping); + } req->mask = cpu_to_le32(vnic->rx_mask); return hwrm_req_send_silent(bp, req); } @@ -8651,7 +8653,7 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) if (bp->dev->flags & IFF_ALLMULTI) { vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; vnic->mc_list_count = 0; - } else { + } else if (bp->dev->flags & IFF_MULTICAST) { u32 mask = 0; bnxt_mc_list_updated(bp, &mask); @@ -10779,7 +10781,7 @@ static void bnxt_set_rx_mode(struct net_device *dev) if (dev->flags & IFF_ALLMULTI) { mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; vnic->mc_list_count = 0; - } else { + } else if (dev->flags & IFF_MULTICAST) { mc_update = bnxt_mc_list_updated(bp, &mask); } @@ -10856,9 +10858,10 @@ skip_uc: !bnxt_promisc_ok(bp)) vnic->rx_mask &= ~CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); - if (rc && vnic->mc_list_count) { + if (rc && (vnic->rx_mask & CFA_L2_SET_RX_MASK_REQ_MASK_MCAST)) { netdev_info(bp->dev, "Failed setting MC filters rc: %d, turning on ALL_MCAST mode\n", rc); + vnic->rx_mask &= ~CFA_L2_SET_RX_MASK_REQ_MASK_MCAST; vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; vnic->mc_list_count = 0; rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); From 0e0e3c5358470cbad10bd7ca29f84a44d179d286 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 20 Feb 2022 04:05:51 -0500 Subject: [PATCH 273/773] bnxt_en: Restore the resets_reliable flag in bnxt_open() During ifdown, we call bnxt_inv_fw_health_reg() which will clear both the status_reliable and resets_reliable flags if these registers are mapped. This is correct because a FW reset during ifdown will clear these register mappings. If we detect that FW has gone through reset during the next ifup, we will remap these registers. But during normal ifup with no FW reset, we need to restore the resets_reliable flag otherwise we will not show the reset counter during devlink diagnose. Fixes: 8cc95ceb7087 ("bnxt_en: improve fw diagnose devlink health messages") Reviewed-by: Vikas Gupta Reviewed-by: Pavan Chebbi Reviewed-by: Somnath Kotur Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fc5b1d816bdb..b1c98d1408b8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7789,6 +7789,19 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp) return 0; } +static void bnxt_remap_fw_health_regs(struct bnxt *bp) +{ + if (!bp->fw_health) + return; + + if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY) { + bp->fw_health->status_reliable = true; + bp->fw_health->resets_reliable = true; + } else { + bnxt_try_map_fw_health_reg(bp); + } +} + static int bnxt_hwrm_error_recovery_qcfg(struct bnxt *bp) { struct bnxt_fw_health *fw_health = bp->fw_health; @@ -9856,8 +9869,8 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) resc_reinit = true; if (flags & FUNC_DRV_IF_CHANGE_RESP_FLAGS_HOT_FW_RESET_DONE) fw_reset = true; - else if (bp->fw_health && !bp->fw_health->status_reliable) - bnxt_try_map_fw_health_reg(bp); + else + bnxt_remap_fw_health_regs(bp); if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state) && !fw_reset) { netdev_err(bp->dev, "RESET_DONE not set during FW reset.\n"); From b891106da52b2c12dbaf73400f6d225b06a38d80 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 20 Feb 2022 04:05:52 -0500 Subject: [PATCH 274/773] bnxt_en: Increase firmware message response DMA wait time When polling for the firmware message response, we first poll for the response message header. Once the valid length is detected in the header, we poll for the valid bit at the end of the message which signals DMA completion. Normally, this poll time for DMA completion is extremely short (0 to a few usec). But on some devices under some rare conditions, it can be up to about 20 msec. Increase this delay to 50 msec and use udelay() for the first 10 usec for the common case, and usleep_range() beyond that. Also, change the error message to include the above delay time when printing the timeout value. Fixes: 3c8c20db769c ("bnxt_en: move HWRM API implementation into separate file") Reviewed-by: Vladimir Olovyannikov Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c | 12 +++++++++--- drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c index 566c9487ef55..b01d42928a53 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c @@ -644,17 +644,23 @@ static int __hwrm_send(struct bnxt *bp, struct bnxt_hwrm_ctx *ctx) /* Last byte of resp contains valid bit */ valid = ((u8 *)ctx->resp) + len - 1; - for (j = 0; j < HWRM_VALID_BIT_DELAY_USEC; j++) { + for (j = 0; j < HWRM_VALID_BIT_DELAY_USEC; ) { /* make sure we read from updated DMA memory */ dma_rmb(); if (*valid) break; - usleep_range(1, 5); + if (j < 10) { + udelay(1); + j++; + } else { + usleep_range(20, 30); + j += 20; + } } if (j >= HWRM_VALID_BIT_DELAY_USEC) { hwrm_err(bp, ctx, "Error (timeout: %u) msg {0x%x 0x%x} len:%d v:%d\n", - hwrm_total_timeout(i), req_type, + hwrm_total_timeout(i) + j, req_type, le16_to_cpu(ctx->req->seq_id), len, *valid); goto exit; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h index d52bd2d63aec..c98032e38188 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h @@ -90,7 +90,7 @@ static inline unsigned int hwrm_total_timeout(unsigned int n) } -#define HWRM_VALID_BIT_DELAY_USEC 150 +#define HWRM_VALID_BIT_DELAY_USEC 50000 static inline bool bnxt_cfa_hwrm_message(u16 req_type) { From 1278d17a1fb860e7eab4bc3ff4b026a87cbf5105 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 20 Feb 2022 04:05:53 -0500 Subject: [PATCH 275/773] bnxt_en: Fix devlink fw_activate To install a livepatch, first flash the package to NVM, and then activate the patch through the "HWRM_FW_LIVEPATCH" fw command. To uninstall a patch from NVM, flash the removal package and then activate it through the "HWRM_FW_LIVEPATCH" fw command. The "HWRM_FW_LIVEPATCH" fw command has to consider following scenarios: 1. no patch in NVM and no patch active. Do nothing. 2. patch in NVM, but not active. Activate the patch currently in NVM. 3. patch is not in NVM, but active. Deactivate the patch. 4. patch in NVM and the patch active. Do nothing. Fix the code to handle these scenarios during devlink "fw_activate". To install and activate a live patch: devlink dev flash pci/0000:c1:00.0 file thor_patch.pkg devlink -f dev reload pci/0000:c1:00.0 action fw_activate limit no_reset To remove and deactivate a live patch: devlink dev flash pci/0000:c1:00.0 file thor_patch_rem.pkg devlink -f dev reload pci/0000:c1:00.0 action fw_activate limit no_reset Fixes: 3c4153394e2c ("bnxt_en: implement firmware live patching") Reviewed-by: Vikas Gupta Reviewed-by: Somnath Kotur Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- .../net/ethernet/broadcom/bnxt/bnxt_devlink.c | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 4da31b1b84f9..f6e21fac0e69 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -367,6 +367,16 @@ bnxt_dl_livepatch_report_err(struct bnxt *bp, struct netlink_ext_ack *extack, } } +/* Live patch status in NVM */ +#define BNXT_LIVEPATCH_NOT_INSTALLED 0 +#define BNXT_LIVEPATCH_INSTALLED FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_INSTALL +#define BNXT_LIVEPATCH_REMOVED FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_ACTIVE +#define BNXT_LIVEPATCH_MASK (FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_INSTALL | \ + FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_ACTIVE) +#define BNXT_LIVEPATCH_ACTIVATED BNXT_LIVEPATCH_MASK + +#define BNXT_LIVEPATCH_STATE(flags) ((flags) & BNXT_LIVEPATCH_MASK) + static int bnxt_dl_livepatch_activate(struct bnxt *bp, struct netlink_ext_ack *extack) { @@ -374,8 +384,9 @@ bnxt_dl_livepatch_activate(struct bnxt *bp, struct netlink_ext_ack *extack) struct hwrm_fw_livepatch_query_input *query_req; struct hwrm_fw_livepatch_output *patch_resp; struct hwrm_fw_livepatch_input *patch_req; + u16 flags, live_patch_state; + bool activated = false; u32 installed = 0; - u16 flags; u8 target; int rc; @@ -394,7 +405,6 @@ bnxt_dl_livepatch_activate(struct bnxt *bp, struct netlink_ext_ack *extack) hwrm_req_drop(bp, query_req); return rc; } - patch_req->opcode = FW_LIVEPATCH_REQ_OPCODE_ACTIVATE; patch_req->loadtype = FW_LIVEPATCH_REQ_LOADTYPE_NVM_INSTALL; patch_resp = hwrm_req_hold(bp, patch_req); @@ -407,12 +417,20 @@ bnxt_dl_livepatch_activate(struct bnxt *bp, struct netlink_ext_ack *extack) } flags = le16_to_cpu(query_resp->status_flags); - if (~flags & FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_INSTALL) + live_patch_state = BNXT_LIVEPATCH_STATE(flags); + + if (live_patch_state == BNXT_LIVEPATCH_NOT_INSTALLED) continue; - if ((flags & FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_ACTIVE) && - !strncmp(query_resp->active_ver, query_resp->install_ver, - sizeof(query_resp->active_ver))) + + if (live_patch_state == BNXT_LIVEPATCH_ACTIVATED) { + activated = true; continue; + } + + if (live_patch_state == BNXT_LIVEPATCH_INSTALLED) + patch_req->opcode = FW_LIVEPATCH_REQ_OPCODE_ACTIVATE; + else if (live_patch_state == BNXT_LIVEPATCH_REMOVED) + patch_req->opcode = FW_LIVEPATCH_REQ_OPCODE_DEACTIVATE; patch_req->fw_target = target; rc = hwrm_req_send(bp, patch_req); @@ -424,8 +442,13 @@ bnxt_dl_livepatch_activate(struct bnxt *bp, struct netlink_ext_ack *extack) } if (!rc && !installed) { - NL_SET_ERR_MSG_MOD(extack, "No live patches found"); - rc = -ENOENT; + if (activated) { + NL_SET_ERR_MSG_MOD(extack, "Live patch already activated"); + rc = -EEXIST; + } else { + NL_SET_ERR_MSG_MOD(extack, "No live patches found"); + rc = -ENOENT; + } } hwrm_req_drop(bp, query_req); hwrm_req_drop(bp, patch_req); From cfb92440ee71adcc2105b0890bb01ac3cddb8507 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Feb 2022 13:07:20 -0800 Subject: [PATCH 276/773] Linux 5.17-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 51e142f760f7..289ce2be8032 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Superb Owl # *DOCUMENTATION* From ba1366f3d039e7c3ca1fc29ed00ce3ed2b8fd32f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Feb 2022 14:54:05 +0100 Subject: [PATCH 277/773] PCI: vmd: Prevent recursive locking on interrupt allocation Tejas reported the following recursive locking issue: swapper/0/1 is trying to acquire lock: ffff8881074fd0a0 (&md->mutex){+.+.}-{3:3}, at: msi_get_virq+0x30/0xc0 but task is already holding lock: ffff8881017cd6a0 (&md->mutex){+.+.}-{3:3}, at: __pci_enable_msi_range+0xf2/0x290 stack backtrace: __mutex_lock+0x9d/0x920 msi_get_virq+0x30/0xc0 pci_irq_vector+0x26/0x30 vmd_msi_init+0xcc/0x210 msi_domain_alloc+0xbf/0x150 msi_domain_alloc_irqs_descs_locked+0x3e/0xb0 __pci_enable_msi_range+0x155/0x290 pci_alloc_irq_vectors_affinity+0xba/0x100 pcie_port_device_register+0x307/0x550 pcie_portdrv_probe+0x3c/0xd0 pci_device_probe+0x95/0x110 This is caused by the VMD MSI code which does a lookup of the Linux interrupt number for an VMD managed MSI[X] vector. The lookup function tries to acquire the already held mutex. Avoid that by caching the Linux interrupt number at initialization time instead of looking it up over and over. Fixes: 82ff8e6b78fc ("PCI/MSI: Use msi_get_virq() in pci_get_vector()") Reported-by: "Surendrakumar Upadhyay, TejaskumarX" Signed-off-by: Thomas Gleixner Tested-by: "Surendrakumar Upadhyay, TejaskumarX" Cc: linux-pci@vger.kernel.org Link: https://lore.kernel.org/r/87a6euub2a.ffs@tglx --- drivers/pci/controller/vmd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index cc166c683638..eb05cceab964 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -99,11 +99,13 @@ struct vmd_irq { * @srcu: SRCU struct for local synchronization. * @count: number of child IRQs assigned to this vector; used to track * sharing. + * @virq: The underlying VMD Linux interrupt number */ struct vmd_irq_list { struct list_head irq_list; struct srcu_struct srcu; unsigned int count; + unsigned int virq; }; struct vmd_dev { @@ -253,7 +255,6 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, struct msi_desc *desc = arg->desc; struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus); struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL); - unsigned int index, vector; if (!vmdirq) return -ENOMEM; @@ -261,10 +262,8 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, INIT_LIST_HEAD(&vmdirq->node); vmdirq->irq = vmd_next_irq(vmd, desc); vmdirq->virq = virq; - index = index_from_irqs(vmd, vmdirq->irq); - vector = pci_irq_vector(vmd->dev, index); - irq_domain_set_info(domain, virq, vector, info->chip, vmdirq, + irq_domain_set_info(domain, virq, vmdirq->irq->virq, info->chip, vmdirq, handle_untracked_irq, vmd, NULL); return 0; } @@ -685,7 +684,8 @@ static int vmd_alloc_irqs(struct vmd_dev *vmd) return err; INIT_LIST_HEAD(&vmd->irqs[i].irq_list); - err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), + vmd->irqs[i].virq = pci_irq_vector(dev, i); + err = devm_request_irq(&dev->dev, vmd->irqs[i].virq, vmd_irq, IRQF_NO_THREAD, vmd->name, &vmd->irqs[i]); if (err) @@ -969,7 +969,7 @@ static int vmd_suspend(struct device *dev) int i; for (i = 0; i < vmd->msix_count; i++) - devm_free_irq(dev, pci_irq_vector(pdev, i), &vmd->irqs[i]); + devm_free_irq(dev, vmd->irqs[i].virq, &vmd->irqs[i]); return 0; } @@ -981,7 +981,7 @@ static int vmd_resume(struct device *dev) int err, i; for (i = 0; i < vmd->msix_count; i++) { - err = devm_request_irq(dev, pci_irq_vector(pdev, i), + err = devm_request_irq(dev, vmd->irqs[i].virq, vmd_irq, IRQF_NO_THREAD, vmd->name, &vmd->irqs[i]); if (err) From fa231bef3b34f1670b240409c11e59a3ce095e6d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 18 Feb 2022 23:57:20 +0200 Subject: [PATCH 278/773] soc: imx: gpcv2: Fix clock disabling imbalance in error path The imx_pgc_power_down() starts by enabling the domain clocks, and thus disables them in the error path. Commit 18c98573a4cf ("soc: imx: gpcv2: add domain option to keep domain clocks enabled") made the clock enable conditional, but forgot to add the same condition to the error path. This can result in a clock enable/disable imbalance. Fix it. Fixes: 18c98573a4cf ("soc: imx: gpcv2: add domain option to keep domain clocks enabled") Signed-off-by: Laurent Pinchart Reviewed-by: Lucas Stach Signed-off-by: Shawn Guo --- drivers/soc/imx/gpcv2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/imx/gpcv2.c b/drivers/soc/imx/gpcv2.c index 3e59d479d001..3cb123016b3e 100644 --- a/drivers/soc/imx/gpcv2.c +++ b/drivers/soc/imx/gpcv2.c @@ -382,7 +382,8 @@ static int imx_pgc_power_down(struct generic_pm_domain *genpd) return 0; out_clk_disable: - clk_bulk_disable_unprepare(domain->num_clks, domain->clks); + if (!domain->keep_clocks) + clk_bulk_disable_unprepare(domain->num_clks, domain->clks); return ret; } From fc3ef2e3297b3c0e2006b5d7b3d66965e3392036 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 20 Feb 2022 19:01:14 +0300 Subject: [PATCH 279/773] HID: hid-thrustmaster: fix OOB read in thrustmaster_interrupts Syzbot reported an slab-out-of-bounds Read in thrustmaster_probe() bug. The root case is in missing validation check of actual number of endpoints. Code should not blindly access usb_host_interface::endpoint array, since it may contain less endpoints than code expects. Fix it by adding missing validaion check and print an error if number of endpoints do not match expected number Fixes: c49c33637802 ("HID: support for initialization of some Thrustmaster wheels") Reported-and-tested-by: syzbot+35eebd505e97d315d01c@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Jiri Kosina --- drivers/hid/hid-thrustmaster.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index a4e20f9e598b..c3e6d69fdfbd 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -160,6 +160,12 @@ static void thrustmaster_interrupts(struct hid_device *hdev) return; } + if (usbif->cur_altsetting->desc.bNumEndpoints < 2) { + kfree(send_buf); + hid_err(hdev, "Wrong number of endpoints?\n"); + return; + } + ep = &usbif->cur_altsetting->endpoint[1]; b_ep = ep->desc.bEndpointAddress; From 198a7ebd5fa17b4d0be8cb70240ee1be885175c0 Mon Sep 17 00:00:00 2001 From: Dmytro Bagrii Date: Thu, 10 Feb 2022 18:41:37 +0200 Subject: [PATCH 280/773] Revert "USB: serial: ch341: add new Product ID for CH341A" This reverts commit 46ee4abb10a07bd8f8ce910ee6b4ae6a947d7f63. CH341 has Product ID 0x5512 in EPP/MEM mode which is used for I2C/SPI/GPIO interfaces. In asynchronous serial interface mode CH341 has PID 0x5523 which is already in the table. Mode is selected by corresponding jumper setting. Signed-off-by: Dmytro Bagrii Link: https://lore.kernel.org/r/20220210164137.4376-1-dimich.dmb@gmail.com Link: https://lore.kernel.org/r/YJ0OCS/sh+1ifD/q@hovoldconsulting.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index 58cba8ee0277..2798fca71261 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -81,7 +81,6 @@ #define CH341_QUIRK_SIMULATE_BREAK BIT(1) static const struct usb_device_id id_table[] = { - { USB_DEVICE(0x1a86, 0x5512) }, { USB_DEVICE(0x1a86, 0x5523) }, { USB_DEVICE(0x1a86, 0x7522) }, { USB_DEVICE(0x1a86, 0x7523) }, From 6ecb3f0b18b320320460a42e40d6fb603f6ded96 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Mon, 14 Feb 2022 10:14:01 +0800 Subject: [PATCH 281/773] USB: serial: option: add support for DW5829e Dell DW5829e same as DW5821e except CAT level. DW5821e supports CAT16 but DW5829e supports CAT9. There are 2 types product of DW5829e: normal and eSIM. So we will add 2 PID for DW5829e. And for each PID, it support MBIM or RMNET. Let's see test evidence as below: DW5829e MBIM mode: T: Bus=04 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 4 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 2 P: Vendor=413c ProdID=81e6 Rev=03.18 S: Manufacturer=Dell Inc. S: Product=DW5829e Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 7 Cfg#= 2 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim I: If#=0x1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option I: If#=0x6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) DW5829e RMNET mode: T: Bus=04 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 5 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=413c ProdID=81e6 Rev=03.18 S: Manufacturer=Dell Inc. S: Product=DW5829e Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#=0x1 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option DW5829e-eSIM MBIM mode: T: Bus=04 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 6 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 2 P: Vendor=413c ProdID=81e4 Rev=03.18 S: Manufacturer=Dell Inc. S: Product=DW5829e-eSIM Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 7 Cfg#= 2 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim I: If#=0x1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option I: If#=0x6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) DW5829e-eSIM RMNET mode: T: Bus=04 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 7 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=413c ProdID=81e4 Rev=03.18 S: Manufacturer=Dell Inc. S: Product=DW5829e-eSIM Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#=0x1 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#=0x5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option BTW, the interface 0x6 of MBIM mode is GNSS port, which not same as NMEA port. So it's banned from serial option driver. The remaining interfaces 0x2-0x5 are: MODEM, MODEM, NMEA, DIAG. Signed-off-by: Slark Xiao Link: https://lore.kernel.org/r/20220214021401.6264-1-slark_xiao@163.com [ johan: drop unnecessary reservation of interface 1 ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 962e9943fc20..58daf021fa2a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -198,6 +198,8 @@ static void option_instat_callback(struct urb *urb); #define DELL_PRODUCT_5821E 0x81d7 #define DELL_PRODUCT_5821E_ESIM 0x81e0 +#define DELL_PRODUCT_5829E_ESIM 0x81e4 +#define DELL_PRODUCT_5829E 0x81e6 #define KYOCERA_VENDOR_ID 0x0c88 #define KYOCERA_PRODUCT_KPC650 0x17da @@ -1063,6 +1065,10 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5821E_ESIM), .driver_info = RSVD(0) | RSVD(1) | RSVD(6) }, + { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5829E), + .driver_info = RSVD(0) | RSVD(6) }, + { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5829E_ESIM), + .driver_info = RSVD(0) | RSVD(6) }, { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_E100A) }, /* ADU-E100, ADU-310 */ { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_500A) }, { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_620UW) }, From cfc4442c642d568014474b6718ccf65dc7ca6099 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Fri, 18 Feb 2022 14:45:52 +0100 Subject: [PATCH 282/773] USB: serial: option: add Telit LE910R1 compositions Add support for the following Telit LE910R1 compositions: 0x701a: rndis, tty, tty, tty 0x701b: ecm, tty, tty, tty 0x9201: tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20220218134552.4051-1-dnlplm@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 58daf021fa2a..e7755d9cfc61 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1279,10 +1279,16 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x7011, 0xff), /* Telit LE910-S1 (ECM) */ .driver_info = NCTRL(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x701a, 0xff), /* Telit LE910R1 (RNDIS) */ + .driver_info = NCTRL(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x701b, 0xff), /* Telit LE910R1 (ECM) */ + .driver_info = NCTRL(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9010), /* Telit SBL FN980 flashing device */ .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9200), /* Telit LE910S1 flashing device */ .driver_info = NCTRL(0) | ZLP }, + { USB_DEVICE(TELIT_VENDOR_ID, 0x9201), /* Telit LE910R1 flashing device */ + .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0002, 0xff, 0xff, 0xff), .driver_info = RSVD(1) }, From d45476d9832409371537013ebdd8dc1a7781f97a Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 16 Feb 2022 20:57:00 +0100 Subject: [PATCH 283/773] x86/speculation: Rename RETPOLINE_AMD to RETPOLINE_LFENCE The RETPOLINE_AMD name is unfortunate since it isn't necessarily AMD only, in fact Hygon also uses it. Furthermore it will likely be sufficient for some Intel processors. Therefore rename the thing to RETPOLINE_LFENCE to better describe what it is. Add the spectre_v2=retpoline,lfence option as an alias to spectre_v2=retpoline,amd to preserve existing setups. However, the output of /sys/devices/system/cpu/vulnerabilities/spectre_v2 will be changed. [ bp: Fix typos, massage. ] Co-developed-by: Josh Poimboeuf Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/nospec-branch.h | 12 +++++----- arch/x86/kernel/alternative.c | 8 +++---- arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++--------- arch/x86/lib/retpoline.S | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 7 files changed, 32 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6db4e2932b3d..65d147974f8d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -204,7 +204,7 @@ /* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cc74dc584836..c91d97bee237 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -84,7 +84,7 @@ #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE #else jmp *%\reg #endif @@ -94,7 +94,7 @@ #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE #else call *%\reg #endif @@ -146,7 +146,7 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) @@ -176,7 +176,7 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_AMD) + X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -188,8 +188,8 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, - SPECTRE_V2_RETPOLINE_GENERIC, - SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_RETPOLINE, + SPECTRE_V2_LFENCE, SPECTRE_V2_IBRS_ENHANCED, }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5007c3ffe96f..b4470eabf151 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -389,7 +389,7 @@ static int emit_indirect(int op, int reg, u8 *bytes) * * CALL *%\reg * - * It also tries to inline spectre_v2=retpoline,amd when size permits. + * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { @@ -407,7 +407,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) return -1; op = insn->opcode.bytes[0]; @@ -438,9 +438,9 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) } /* - * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1c1f218a701d..e3bb8afc6768 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -664,7 +664,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, }; enum spectre_v2_user_cmd { @@ -824,8 +824,8 @@ set_mode: static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; @@ -837,7 +837,8 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -875,13 +876,19 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && !IS_ENABLED(CONFIG_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -916,9 +923,9 @@ static void __init spectre_v2_select_mitigation(void) if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + goto retpoline_lfence; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: if (IS_ENABLED(CONFIG_RETPOLINE)) @@ -935,17 +942,17 @@ static void __init spectre_v2_select_mitigation(void) retpoline_auto: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_amd: + retpoline_lfence: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); goto retpoline_generic; } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + mode = SPECTRE_V2_LFENCE; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } else { retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + mode = SPECTRE_V2_RETPOLINE; setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 89b3fb244e15..afbdda539b80 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE .endm diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2b1e266ff95c..0ecb140864b2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -394,7 +394,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) u8 *prog = *pprog; #ifdef CONFIG_RETPOLINE - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 6db4e2932b3d..ab4e53715d8a 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -204,7 +204,7 @@ /* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCEs for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ From 1e19da8522c81bf46b335f84137165741e0d82b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2022 20:57:01 +0100 Subject: [PATCH 284/773] x86/speculation: Add eIBRS + Retpoline options Thanks to the chaps at VUsec it is now clear that eIBRS is not sufficient, therefore allow enabling of retpolines along with eIBRS. Add spectre_v2=eibrs, spectre_v2=eibrs,lfence and spectre_v2=eibrs,retpoline options to explicitly pick your preferred means of mitigation. Since there's new mitigations there's also user visible changes in /sys/devices/system/cpu/vulnerabilities/spectre_v2 to reflect these new mitigations. [ bp: Massage commit message, trim error messages, do more precise eIBRS mode checking. ] Co-developed-by: Josh Poimboeuf Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Patrick Colp Reviewed-by: Thomas Gleixner --- arch/x86/include/asm/nospec-branch.h | 4 +- arch/x86/kernel/cpu/bugs.c | 147 +++++++++++++++++++-------- 2 files changed, 106 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c91d97bee237..acbaeaf83b61 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -190,7 +190,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_NONE, SPECTRE_V2_RETPOLINE, SPECTRE_V2_LFENCE, - SPECTRE_V2_IBRS_ENHANCED, + SPECTRE_V2_EIBRS, + SPECTRE_V2_EIBRS_RETPOLINE, + SPECTRE_V2_EIBRS_LFENCE, }; /* The indirect branch speculation control variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e3bb8afc6768..79c52dd6c597 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -665,6 +665,9 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, }; enum spectre_v2_user_cmd { @@ -737,6 +740,13 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return (mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE); +} + static void __init spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) { @@ -804,7 +814,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return; /* @@ -826,7 +836,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", }; static const struct { @@ -840,6 +852,9 @@ static const struct { { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, }; @@ -877,15 +892,29 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) if ((cmd == SPECTRE_V2_CMD_RETPOLINE || cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE) && + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -894,6 +923,25 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing, switching to generic retpoline\n"); + return SPECTRE_V2_RETPOLINE; + } + return SPECTRE_V2_LFENCE; + } + + return SPECTRE_V2_RETPOLINE; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -914,49 +962,60 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + mode = spectre_v2_select_retpoline(); break; + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_lfence; - break; - case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; - break; - case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; - break; - } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; - -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_lfence: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } mode = SPECTRE_V2_LFENCE; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: + break; + + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: mode = SPECTRE_V2_RETPOLINE; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; + + case SPECTRE_V2_CMD_RETPOLINE: + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; + break; + } + + if (spectre_v2_in_eibrs_mode(mode)) { + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + fallthrough; + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } -specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -982,7 +1041,7 @@ specv2_set_mode: * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } @@ -1691,7 +1750,7 @@ static ssize_t tsx_async_abort_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { From 5ad3eb1132453b9795ce5fd4572b1c18b292cca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2022 20:57:02 +0100 Subject: [PATCH 285/773] Documentation/hw-vuln: Update spectre doc Update the doc with the new fun. [ bp: Massage commit message. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- Documentation/admin-guide/hw-vuln/spectre.rst | 44 +++++++++++++------ .../admin-guide/kernel-parameters.txt | 8 +++- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index a2b22d5640ec..79051f4dac45 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -131,6 +131,19 @@ steer its indirect branch speculations to gadget code, and measure the speculative execution's side effects left in level 1 cache to infer the victim's data. +Yet another variant 2 attack vector is for the attacker to poison the +Branch History Buffer (BHB) to speculatively steer an indirect branch +to a specific Branch Target Buffer (BTB) entry, even if the entry isn't +associated with the source address of the indirect branch. Specifically, +the BHB might be shared across privilege levels even in the presence of +Enhanced IBRS. + +Currently the only known real-world BHB attack vector is via +unprivileged eBPF. Therefore, it's highly recommended to not enable +unprivileged eBPF, especially when eIBRS is used (without retpolines). +For a full mitigation against BHB attacks, it's recommended to use +retpolines (or eIBRS combined with retpolines). + Attack scenarios ---------------- @@ -364,13 +377,15 @@ The possible values in this file are: - Kernel status: - ==================================== ================================= - 'Not affected' The processor is not vulnerable - 'Vulnerable' Vulnerable, no mitigation - 'Mitigation: Full generic retpoline' Software-focused mitigation - 'Mitigation: Full AMD retpoline' AMD-specific software mitigation - 'Mitigation: Enhanced IBRS' Hardware-focused mitigation - ==================================== ================================= + ======================================== ================================= + 'Not affected' The processor is not vulnerable + 'Mitigation: None' Vulnerable, no mitigation + 'Mitigation: Retpolines' Use Retpoline thunks + 'Mitigation: LFENCE' Use LFENCE instructions + 'Mitigation: Enhanced IBRS' Hardware-focused mitigation + 'Mitigation: Enhanced IBRS + Retpolines' Hardware-focused + Retpolines + 'Mitigation: Enhanced IBRS + LFENCE' Hardware-focused + LFENCE + ======================================== ================================= - Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is used to protect against Spectre variant 2 attacks when calling firmware (x86 only). @@ -583,12 +598,13 @@ kernel command line. Specific mitigations can also be selected manually: - retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline auto pick between generic,lfence + retpoline,generic Retpolines + retpoline,lfence LFENCE; indirect branch + retpoline,amd alias for retpoline,lfence + eibrs enhanced IBRS + eibrs,retpoline enhanced IBRS + Retpolines + eibrs,lfence enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. @@ -599,7 +615,7 @@ kernel command line. spectre_v2=off. Spectre variant 1 mitigations cannot be disabled. -For spectre_v2_user see :doc:`/admin-guide/kernel-parameters`. +For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt Mitigation selection guide -------------------------- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f5a27f067db9..7123524a86b8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5361,8 +5361,12 @@ Specific mitigations can also be selected manually: retpoline - replace indirect branches - retpoline,generic - google's original retpoline - retpoline,amd - AMD-specific minimal thunk + retpoline,generic - Retpolines + retpoline,lfence - LFENCE; indirect branch + retpoline,amd - alias for retpoline,lfence + eibrs - enhanced IBRS + eibrs,retpoline - enhanced IBRS + Retpolines + eibrs,lfence - enhanced IBRS + LFENCE Not specifying this option is equivalent to spectre_v2=auto. From 44a3918c8245ab10c6c9719dd12e7a8d291980d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: [PATCH 286/773] x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 11 +++++++++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 79c52dd6c597..0a4267c63d3b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -650,6 +651,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -994,6 +1005,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1780,6 +1794,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1805,12 +1833,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..1f56806d8eb9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1793,6 +1793,11 @@ struct bpf_core_ctx { int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2012,6 +2017,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..730ab56d9e92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ From 3f33364836aacc28cd430d22cf22379e3b5ecd77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 14 Feb 2022 11:18:08 +0200 Subject: [PATCH 287/773] drm/i915: Widen the QGV point mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit adlp+ adds some extra bits to the QGV point mask. The code attempts to handle that but forgot to actually make sure we can store those bits in the bw state. Fix it. Cc: stable@vger.kernel.org Cc: Stanislav Lisovskiy Fixes: 192fbfb76744 ("drm/i915: Implement PSF GV point support") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220214091811.13725-4-ville.syrjala@linux.intel.com Reviewed-by: Stanislav Lisovskiy (cherry picked from commit c0299cc9840b3805205173cc77782f317b78ea0e) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_bw.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_bw.h b/drivers/gpu/drm/i915/display/intel_bw.h index 46c6eecbd917..0ceaed1c9656 100644 --- a/drivers/gpu/drm/i915/display/intel_bw.h +++ b/drivers/gpu/drm/i915/display/intel_bw.h @@ -30,19 +30,19 @@ struct intel_bw_state { */ u8 pipe_sagv_reject; + /* bitmask of active pipes */ + u8 active_pipes; + /* * Current QGV points mask, which restricts * some particular SAGV states, not to confuse * with pipe_sagv_mask. */ - u8 qgv_points_mask; + u16 qgv_points_mask; unsigned int data_rate[I915_MAX_PIPES]; u8 num_active_planes[I915_MAX_PIPES]; - /* bitmask of active pipes */ - u8 active_pipes; - int min_cdclk; }; From a40ee54e9a0958406469d46def03eec62aea0b69 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Thu, 17 Feb 2022 17:22:37 +0200 Subject: [PATCH 288/773] drm/i915: Disconnect PHYs left connected by BIOS on disabled ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BIOS may leave a TypeC PHY in a connected state even though the corresponding port is disabled. This will prevent any hotplug events from being signalled (after the monitor deasserts and then reasserts its HPD) until the PHY is disconnected and so the driver will not detect a connected sink. Rebooting with the PHY in the connected state also results in a system hang. Fix the above by disconnecting TypeC PHYs on disabled ports. Before commit 64851a32c463e5 the PHY connected state was read out even for disabled ports and later the PHY got disconnected as a side effect of a tc_port_lock/unlock() sequence (during connector probing), hence recovering the port's hotplug functionality. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5014 Fixes: 64851a32c463 ("drm/i915/tc: Add a mode for the TypeC PHY's disconnected state") Cc: # v5.16+ Cc: José Roberto de Souza Signed-off-by: Imre Deak Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220217152237.670220-1-imre.deak@intel.com (cherry picked from commit ed0ccf349ffd9c80e7376d4d8c608643de990e86) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_tc.c | 28 ++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_tc.c b/drivers/gpu/drm/i915/display/intel_tc.c index dbd7d0d83a14..7784c30fe893 100644 --- a/drivers/gpu/drm/i915/display/intel_tc.c +++ b/drivers/gpu/drm/i915/display/intel_tc.c @@ -691,6 +691,8 @@ void intel_tc_port_sanitize(struct intel_digital_port *dig_port) { struct drm_i915_private *i915 = to_i915(dig_port->base.base.dev); struct intel_encoder *encoder = &dig_port->base; + intel_wakeref_t tc_cold_wref; + enum intel_display_power_domain domain; int active_links = 0; mutex_lock(&dig_port->tc_lock); @@ -702,12 +704,11 @@ void intel_tc_port_sanitize(struct intel_digital_port *dig_port) drm_WARN_ON(&i915->drm, dig_port->tc_mode != TC_PORT_DISCONNECTED); drm_WARN_ON(&i915->drm, dig_port->tc_lock_wakeref); + + tc_cold_wref = tc_cold_block(dig_port, &domain); + + dig_port->tc_mode = intel_tc_port_get_current_mode(dig_port); if (active_links) { - enum intel_display_power_domain domain; - intel_wakeref_t tc_cold_wref = tc_cold_block(dig_port, &domain); - - dig_port->tc_mode = intel_tc_port_get_current_mode(dig_port); - if (!icl_tc_phy_is_connected(dig_port)) drm_dbg_kms(&i915->drm, "Port %s: PHY disconnected with %d active link(s)\n", @@ -716,10 +717,23 @@ void intel_tc_port_sanitize(struct intel_digital_port *dig_port) dig_port->tc_lock_wakeref = tc_cold_block(dig_port, &dig_port->tc_lock_power_domain); - - tc_cold_unblock(dig_port, domain, tc_cold_wref); + } else { + /* + * TBT-alt is the default mode in any case the PHY ownership is not + * held (regardless of the sink's connected live state), so + * we'll just switch to disconnected mode from it here without + * a note. + */ + if (dig_port->tc_mode != TC_PORT_TBT_ALT) + drm_dbg_kms(&i915->drm, + "Port %s: PHY left in %s mode on disabled port, disconnecting it\n", + dig_port->tc_port_name, + tc_port_mode_name(dig_port->tc_mode)); + icl_tc_phy_disconnect(dig_port); } + tc_cold_unblock(dig_port, domain, tc_cold_wref); + drm_dbg_kms(&i915->drm, "Port %s: sanitize mode (%s)\n", dig_port->tc_port_name, tc_port_mode_name(dig_port->tc_mode)); From afc189df6bcc6be65961deb54e15ec60e7f85337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 18 Feb 2022 08:40:34 +0200 Subject: [PATCH 289/773] drm/i915: Correctly populate use_sagv_wm for all pipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing between SAGV vs. no SAGV on tgl+ we have to update the use_sagv_wm flag for all the crtcs or else an active pipe not already in the state will end up using the wrong watermarks. That is especially bad when we end up with the tighter non-SAGV watermarks with SAGV enabled. Usually ends up in underruns. Cc: stable@vger.kernel.org Reviewed-by: Stanislav Lisovskiy Fixes: 7241c57d3140 ("drm/i915: Add TGL+ SAGV support") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220218064039.12834-2-ville.syrjala@linux.intel.com (cherry picked from commit 8dd8ffb824ca7b897ce9f2082ffa7e64831c22dc) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_pm.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 32bc155f5dc0..fae4f7818d28 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -4029,6 +4029,17 @@ static int intel_compute_sagv_mask(struct intel_atomic_state *state) return ret; } + if (intel_can_enable_sagv(dev_priv, new_bw_state) != + intel_can_enable_sagv(dev_priv, old_bw_state)) { + ret = intel_atomic_serialize_global_state(&new_bw_state->base); + if (ret) + return ret; + } else if (new_bw_state->pipe_sagv_reject != old_bw_state->pipe_sagv_reject) { + ret = intel_atomic_lock_global_state(&new_bw_state->base); + if (ret) + return ret; + } + for_each_new_intel_crtc_in_state(state, crtc, new_crtc_state, i) { struct skl_pipe_wm *pipe_wm = &new_crtc_state->wm.skl.optimal; @@ -4044,17 +4055,6 @@ static int intel_compute_sagv_mask(struct intel_atomic_state *state) intel_can_enable_sagv(dev_priv, new_bw_state); } - if (intel_can_enable_sagv(dev_priv, new_bw_state) != - intel_can_enable_sagv(dev_priv, old_bw_state)) { - ret = intel_atomic_serialize_global_state(&new_bw_state->base); - if (ret) - return ret; - } else if (new_bw_state->pipe_sagv_reject != old_bw_state->pipe_sagv_reject) { - ret = intel_atomic_lock_global_state(&new_bw_state->base); - if (ret) - return ret; - } - return 0; } From ec663bca9128f13eada25cd0446e7fcb5fcdc088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 18 Feb 2022 08:40:35 +0200 Subject: [PATCH 290/773] drm/i915: Fix bw atomic check when switching between SAGV vs. no SAGV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the only thing that is changing is SAGV vs. no SAGV but the number of active planes and the total data rates end up unchanged we currently bail out of intel_bw_atomic_check() early and forget to actually compute the new WGV point mask and thus won't actually enable/disable SAGV as requested. This ends up poorly if we end up running with SAGV enabled when we shouldn't. Usually ends up in underruns. To fix this let's go through the QGV point mask computation if either the data rates/number of planes, or the state of SAGV is changing. v2: Check more carefully if things are changing to avoid the extra calculations/debugs from introducing unwanted overhead Cc: stable@vger.kernel.org Reviewed-by: Stanislav Lisovskiy #v1 Fixes: 20f505f22531 ("drm/i915: Restrict qgv points which don't have enough bandwidth.") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220218064039.12834-3-ville.syrjala@linux.intel.com (cherry picked from commit 6b728595ffa51c087343c716bccbfc260f120e72) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_bw.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_bw.c b/drivers/gpu/drm/i915/display/intel_bw.c index 2da4aacc956b..8ac196e814d5 100644 --- a/drivers/gpu/drm/i915/display/intel_bw.c +++ b/drivers/gpu/drm/i915/display/intel_bw.c @@ -825,6 +825,7 @@ int intel_bw_atomic_check(struct intel_atomic_state *state) unsigned int max_bw_point = 0, max_bw = 0; unsigned int num_qgv_points = dev_priv->max_bw[0].num_qgv_points; unsigned int num_psf_gv_points = dev_priv->max_bw[0].num_psf_gv_points; + bool changed = false; u32 mask = 0; /* FIXME earlier gens need some checks too */ @@ -868,6 +869,8 @@ int intel_bw_atomic_check(struct intel_atomic_state *state) new_bw_state->data_rate[crtc->pipe] = new_data_rate; new_bw_state->num_active_planes[crtc->pipe] = new_active_planes; + changed = true; + drm_dbg_kms(&dev_priv->drm, "pipe %c data rate %u num active planes %u\n", pipe_name(crtc->pipe), @@ -875,7 +878,19 @@ int intel_bw_atomic_check(struct intel_atomic_state *state) new_bw_state->num_active_planes[crtc->pipe]); } - if (!new_bw_state) + old_bw_state = intel_atomic_get_old_bw_state(state); + new_bw_state = intel_atomic_get_new_bw_state(state); + + if (new_bw_state && + intel_can_enable_sagv(dev_priv, old_bw_state) != + intel_can_enable_sagv(dev_priv, new_bw_state)) + changed = true; + + /* + * If none of our inputs (data rates, number of active + * planes, SAGV yes/no) changed then nothing to do here. + */ + if (!changed) return 0; ret = intel_atomic_lock_global_state(&new_bw_state->base); @@ -961,7 +976,6 @@ int intel_bw_atomic_check(struct intel_atomic_state *state) */ new_bw_state->qgv_points_mask = ~allowed_points & mask; - old_bw_state = intel_atomic_get_old_bw_state(state); /* * If the actual mask had changed we need to make sure that * the commits are serialized(in case this is a nomodeset, nonblocking) From 28adef861233c6fce47372ebd2070b55eaa8e899 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Tue, 15 Feb 2022 08:35:45 -0800 Subject: [PATCH 291/773] drm/i915/dg2: Print PHY name properly on calibration error We need to use phy_name() to convert the PHY value into a human-readable character in the error message. Fixes: a6a128116e55 ("drm/i915/dg2: Wait for SNPS PHY calibration during display init") Signed-off-by: Matt Roper Reviewed-by: Swathi Dhanavanthri Link: https://patchwork.freedesktop.org/patch/msgid/20220215163545.2175730-1-matthew.d.roper@intel.com (cherry picked from commit 84073e568eec7b586b2f6fd5fb2fb08f59edec54) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_snps_phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_snps_phy.c b/drivers/gpu/drm/i915/display/intel_snps_phy.c index 09f405e4d363..92ff654f54f5 100644 --- a/drivers/gpu/drm/i915/display/intel_snps_phy.c +++ b/drivers/gpu/drm/i915/display/intel_snps_phy.c @@ -34,7 +34,7 @@ void intel_snps_phy_wait_for_calibration(struct drm_i915_private *dev_priv) if (intel_de_wait_for_clear(dev_priv, ICL_PHY_MISC(phy), DG2_PHY_DP_TX_ACK_MASK, 25)) DRM_ERROR("SNPS PHY %c failed to calibrate after 25ms.\n", - phy); + phy_name(phy)); } } From 93dd04ab0b2b32ae6e70284afc764c577156658e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 18 Feb 2022 14:13:58 +0100 Subject: [PATCH 292/773] slab: remove __alloc_size attribute from __kmalloc_track_caller Commit c37495d6254c ("slab: add __alloc_size attributes for better bounds checking") added __alloc_size attributes to a bunch of kmalloc function prototypes. Unfortunately the change to __kmalloc_track_caller seems to cause clang to generate broken code and the first time this is called when booting, the box will crash. While the compiler problems are being reworked and attempted to be solved [1], let's just drop the attribute to solve the issue now. Once it is resolved it can be added back. [1] https://github.com/ClangBuiltLinux/linux/issues/1599 Fixes: c37495d6254c ("slab: add __alloc_size attributes for better bounds checking") Cc: stable Cc: Kees Cook Cc: Daniel Micay Cc: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Nathan Chancellor Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Cc: llvm@lists.linux.dev Signed-off-by: Greg Kroah-Hartman Acked-by: Nick Desaulniers Acked-by: David Rientjes Acked-by: Kees Cook Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20220218131358.3032912-1-gregkh@linuxfoundation.org --- include/linux/slab.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 37bde99b74af..5b6193fd8bd9 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -660,8 +660,7 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) - __alloc_size(1); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) From 221944736f66f38e9bdbce52c616d10df7f15c54 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 16 Feb 2022 12:43:30 -0800 Subject: [PATCH 293/773] tools/cgroup/slabinfo: update to work with struct slab After the introduction of the dedicated struct slab to describe slab pages by commit d122019bf061 ("mm: Split slab into its own type") and the following removal of the corresponding struct page's fields by commit 07f910f9b729 ("mm: Remove slab from struct page") the memcg_slabinfo tool broke. An attempt to run it produces a trace like this: Traceback (most recent call last): File "/usr/bin/drgn", line 33, in sys.exit(load_entry_point('drgn==0.0.16', 'console_scripts', 'drgn')()) File "/usr/lib64/python3.9/site-packages/drgn/internal/cli.py", line 133, in main runpy.run_path(args.script[0], init_globals=init_globals, run_name="__main__") File "/usr/lib64/python3.9/runpy.py", line 268, in run_path return _run_module_code(code, init_globals, run_name, File "/usr/lib64/python3.9/runpy.py", line 97, in _run_module_code _run_code(code, mod_globals, init_globals, File "/usr/lib64/python3.9/runpy.py", line 87, in _run_code exec(code, run_globals) File "memcg_slabinfo.py", line 226, in main() File "memcg_slabinfo.py", line 199, in main cache = page.slab_cache AttributeError: 'struct page' has no member 'slab_cache' The problem can be fixed by explicitly casting struct page * to struct slab * for slab pages. The tools works as expected with this fix, e.g.: cred_jar 776 776 192 21 1 : tunables 0 0 0 : slabdata 547 547 0 kmalloc-cg-32 6 6 32 128 1 : tunables 0 0 0 : slabdata 9 9 0 files_cache 3 3 832 39 8 : tunables 0 0 0 : slabdata 8 8 0 kmalloc-cg-512 1 1 512 32 4 : tunables 0 0 0 : slabdata 10 10 0 task_struct 10 10 6720 4 8 : tunables 0 0 0 : slabdata 63 63 0 mm_struct 3 3 1664 19 8 : tunables 0 0 0 : slabdata 9 9 0 kmalloc-cg-16 1 1 16 256 1 : tunables 0 0 0 : slabdata 8 8 0 pde_opener 1 1 40 102 1 : tunables 0 0 0 : slabdata 8 8 0 anon_vma_chain 375 375 64 64 1 : tunables 0 0 0 : slabdata 81 81 0 radix_tree_node 3 3 584 28 4 : tunables 0 0 0 : slabdata 419 419 0 dentry 98 98 312 26 2 : tunables 0 0 0 : slabdata 1420 1420 0 btrfs_inode 3 3 2368 13 8 : tunables 0 0 0 : slabdata 730 730 0 signal_cache 3 3 1600 20 8 : tunables 0 0 0 : slabdata 17 17 0 sighand_cache 3 3 2240 14 8 : tunables 0 0 0 : slabdata 20 20 0 filp 90 90 512 32 4 : tunables 0 0 0 : slabdata 95 95 0 anon_vma 214 214 200 20 1 : tunables 0 0 0 : slabdata 162 162 0 kmalloc-cg-1k 1 1 1024 32 8 : tunables 0 0 0 : slabdata 22 22 0 pid 10 10 256 32 2 : tunables 0 0 0 : slabdata 14 14 0 kmalloc-cg-64 2 2 64 64 1 : tunables 0 0 0 : slabdata 8 8 0 kmalloc-cg-96 3 3 96 42 1 : tunables 0 0 0 : slabdata 8 8 0 sock_inode_cache 5 5 1408 23 8 : tunables 0 0 0 : slabdata 29 29 0 UNIX 7 7 1920 17 8 : tunables 0 0 0 : slabdata 21 21 0 inode_cache 36 36 1152 28 8 : tunables 0 0 0 : slabdata 680 680 0 proc_inode_cache 26 26 1224 26 8 : tunables 0 0 0 : slabdata 64 64 0 kmalloc-cg-2k 2 2 2048 16 8 : tunables 0 0 0 : slabdata 9 9 0 v2: change naming and count_partial()/count_free()/for_each_slab() signatures to work with slabs, suggested by Matthew Wilcox Fixes: 07f910f9b729 ("mm: Remove slab from struct page") Reported-by: Vasily Averin Signed-off-by: Roman Gushchin Tested-by: Vasily Averin Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/linux-patches/Yg2cKKnIboNu7j+p@carbon.DHCP.thefacebook.com/ --- tools/cgroup/memcg_slabinfo.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index 1600b17dbb8a..1d3a90d93fe2 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -11,7 +11,7 @@ from drgn.helpers.linux import list_for_each_entry, list_empty from drgn.helpers.linux import for_each_page from drgn.helpers.linux.cpumask import for_each_online_cpu from drgn.helpers.linux.percpu import per_cpu_ptr -from drgn import container_of, FaultError, Object +from drgn import container_of, FaultError, Object, cast DESC = """ @@ -69,15 +69,15 @@ def oo_objects(s): def count_partial(n, fn): - nr_pages = 0 - for page in list_for_each_entry('struct page', n.partial.address_of_(), - 'lru'): - nr_pages += fn(page) - return nr_pages + nr_objs = 0 + for slab in list_for_each_entry('struct slab', n.partial.address_of_(), + 'slab_list'): + nr_objs += fn(slab) + return nr_objs -def count_free(page): - return page.objects - page.inuse +def count_free(slab): + return slab.objects - slab.inuse def slub_get_slabinfo(s, cfg): @@ -145,14 +145,14 @@ def detect_kernel_config(): return cfg -def for_each_slab_page(prog): +def for_each_slab(prog): PGSlab = 1 << prog.constant('PG_slab') PGHead = 1 << prog.constant('PG_head') for page in for_each_page(prog): try: if page.flags.value_() & PGSlab: - yield page + yield cast('struct slab *', page) except FaultError: pass @@ -190,13 +190,13 @@ def main(): 'list'): obj_cgroups.add(ptr.value_()) - # look over all slab pages, belonging to non-root memcgs - # and look for objects belonging to the given memory cgroup - for page in for_each_slab_page(prog): - objcg_vec_raw = page.memcg_data.value_() + # look over all slab folios and look for objects belonging + # to the given memory cgroup + for slab in for_each_slab(prog): + objcg_vec_raw = slab.memcg_data.value_() if objcg_vec_raw == 0: continue - cache = page.slab_cache + cache = slab.slab_cache if not cache: continue addr = cache.value_() From cc20cced0598d9a5ff91ae4ab147b3b5e99ee819 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Fri, 18 Feb 2022 22:35:24 +0800 Subject: [PATCH 294/773] gso: do not skip outer ip header in case of ipip and net_failover We encounter a tcp drop issue in our cloud environment. Packet GROed in host forwards to a VM virtio_net nic with net_failover enabled. VM acts as a IPVS LB with ipip encapsulation. The full path like: host gro -> vm virtio_net rx -> net_failover rx -> ipvs fullnat -> ipip encap -> net_failover tx -> virtio_net tx When net_failover transmits a ipip pkt (gso_type = 0x0103, which means SKB_GSO_TCPV4, SKB_GSO_DODGY and SKB_GSO_IPXIP4), there is no gso did because it supports TSO and GSO_IPXIP4. But network_header points to inner ip header. Call Trace: tcp4_gso_segment ------> return NULL inet_gso_segment ------> inner iph, network_header points to ipip_gso_segment inet_gso_segment ------> outer iph skb_mac_gso_segment Afterwards virtio_net transmits the pkt, only inner ip header is modified. And the outer one just keeps unchanged. The pkt will be dropped in remote host. Call Trace: inet_gso_segment ------> inner iph, outer iph is skipped skb_mac_gso_segment __skb_gso_segment validate_xmit_skb validate_xmit_skb_list sch_direct_xmit __qdisc_run __dev_queue_xmit ------> virtio_net dev_hard_start_xmit __dev_queue_xmit ------> net_failover ip_finish_output2 ip_output iptunnel_xmit ip_tunnel_xmit ipip_tunnel_xmit ------> ipip dev_hard_start_xmit __dev_queue_xmit ip_finish_output2 ip_output ip_forward ip_rcv __netif_receive_skb_one_core netif_receive_skb_internal napi_gro_receive receive_buf virtnet_poll net_rx_action The root cause of this issue is specific with the rare combination of SKB_GSO_DODGY and a tunnel device that adds an SKB_GSO_ tunnel option. SKB_GSO_DODGY is set from external virtio_net. We need to reset network header when callbacks.gso_segment() returns NULL. This patch also includes ipv6_gso_segment(), considering SIT, etc. Fixes: cb32f511a70b ("ipip: add GSO/TSO support") Signed-off-by: Tao Liu Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 5 ++++- net/ipv6/ip6_offload.c | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9c465bac1eb0..72fde2888ad2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1376,8 +1376,11 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, } ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_segment)) + if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; + } if (IS_ERR_OR_NULL(segs)) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index b29e9ba5e113..5f577e21459b 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -114,6 +114,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) From 228339662b398a59b3560cd571deb8b25b253c7e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Feb 2022 05:49:30 -0700 Subject: [PATCH 295/773] io_uring: don't convert to jiffies for waiting on timeouts If an application calls io_uring_enter(2) with a timespec passed in, convert that timespec to ktime_t rather than jiffies. The latter does not provide the granularity the application may expect, and may in fact provided different granularity on different systems, depending on what the HZ value is configured at. Turn the timespec into an absolute ktime_t, and use that with schedule_hrtimeout() instead. Link: https://github.com/axboe/liburing/issues/531 Cc: stable@vger.kernel.org Reported-by: Bob Chen Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b928928617a4..928446fe1319 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7694,7 +7694,7 @@ static int io_run_task_work_sig(void) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - signed long *timeout) + ktime_t timeout) { int ret; @@ -7706,8 +7706,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (test_bit(0, &ctx->check_cq_overflow)) return 1; - *timeout = schedule_timeout(*timeout); - return !*timeout ? -ETIME : 1; + if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) + return -ETIME; + return 1; } /* @@ -7720,7 +7721,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - signed long timeout = MAX_SCHEDULE_TIMEOUT; + ktime_t timeout = KTIME_MAX; int ret; do { @@ -7736,7 +7737,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (get_timespec64(&ts, uts)) return -EFAULT; - timeout = timespec64_to_jiffies(&ts); + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } if (sig) { @@ -7768,7 +7769,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + ret = io_cqring_wait_schedule(ctx, &iowq, timeout); finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); From b6ad6261d27708567b309fdb3102b12c42a070cc Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 21 Feb 2022 13:45:57 +0200 Subject: [PATCH 296/773] net: mdio-ipq4019: add delay after clock enable Experimentation shows that PHY detect might fail when the code attempts MDIO bus read immediately after clock enable. Add delay to stabilize the clock before bus access. PHY detect failure started to show after commit 7590fc6f80ac ("net: mdio: Demote probed message to debug print") that removed coincidental delay between clock enable and bus access. 10ms is meant to match the time it take to send the probed message over UART at 115200 bps. This might be a far overshoot. Fixes: 23a890d493e3 ("net: mdio: Add the reset function for IPQ MDIO driver") Signed-off-by: Baruch Siach Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-ipq4019.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/mdio/mdio-ipq4019.c b/drivers/net/mdio/mdio-ipq4019.c index 5f4cd24a0241..4eba5a91075c 100644 --- a/drivers/net/mdio/mdio-ipq4019.c +++ b/drivers/net/mdio/mdio-ipq4019.c @@ -200,7 +200,11 @@ static int ipq_mdio_reset(struct mii_bus *bus) if (ret) return ret; - return clk_prepare_enable(priv->mdio_clk); + ret = clk_prepare_enable(priv->mdio_clk); + if (ret == 0) + mdelay(10); + + return ret; } static int ipq4019_mdio_probe(struct platform_device *pdev) From ae09639e3b2a0291b37b122c94dd4f773cd4e513 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Wed, 16 Feb 2022 22:53:02 +0000 Subject: [PATCH 297/773] platform/x86: int3472: Add terminator to gpiod_lookup_table Without the terminator, if a con_id is passed to gpio_find() that does not exist in the lookup table the function will not stop looping correctly, and eventually cause an oops. Fixes: 19d8d6e36b4b ("platform/x86: int3472: Pass tps68470_regulator_platform_data to the tps68470-regulator MFD-cell") Signed-off-by: Daniel Scally Link: https://lore.kernel.org/r/20220216225304.53911-5-djrscally@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/int3472/tps68470_board_data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/int3472/tps68470_board_data.c b/drivers/platform/x86/intel/int3472/tps68470_board_data.c index f93d437fd192..525f09a3b5ff 100644 --- a/drivers/platform/x86/intel/int3472/tps68470_board_data.c +++ b/drivers/platform/x86/intel/int3472/tps68470_board_data.c @@ -100,7 +100,8 @@ static struct gpiod_lookup_table surface_go_tps68470_gpios = { .dev_id = "i2c-INT347A:00", .table = { GPIO_LOOKUP("tps68470-gpio", 9, "reset", GPIO_ACTIVE_LOW), - GPIO_LOOKUP("tps68470-gpio", 7, "powerdown", GPIO_ACTIVE_LOW) + GPIO_LOOKUP("tps68470-gpio", 7, "powerdown", GPIO_ACTIVE_LOW), + { } } }; From c086df4902573e2f06c6a2a83452c13a8bc603f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:52:52 -0500 Subject: [PATCH 298/773] fuse: move FUSE_SUPER_MAGIC definition to magic.h ...to help userland apps that need to identify FUSE mounts. Signed-off-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 +-- include/uapi/linux/magic.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ee846ce371d8..9ee36aa73251 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); -#define FUSE_SUPER_MAGIC 0x65735546 - #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 0425cd79af9a..f724129c0425 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -36,6 +36,7 @@ #define EFIVARFS_MAGIC 0xde5e81e4 #define HOSTFS_SUPER_MAGIC 0x00c0ffee #define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#define FUSE_SUPER_MAGIC 0x65735546 #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ From 6069da443bf65f513bb507bb21e2f87cfb1ad0b6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 18 Feb 2022 12:45:32 +0100 Subject: [PATCH 299/773] netfilter: nf_tables: unregister flowtable hooks on netns exit Unregister flowtable hooks before they are releases via nf_tables_flowtable_destroy() otherwise hook core reports UAF. BUG: KASAN: use-after-free in nf_hook_entries_grow+0x5a7/0x700 net/netfilter/core.c:142 net/netfilter/core.c:142 Read of size 4 at addr ffff8880736f7438 by task syz-executor579/3666 CPU: 0 PID: 3666 Comm: syz-executor579 Not tainted 5.16.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] __dump_stack lib/dump_stack.c:88 [inline] lib/dump_stack.c:106 dump_stack_lvl+0x1dc/0x2d8 lib/dump_stack.c:106 lib/dump_stack.c:106 print_address_description+0x65/0x380 mm/kasan/report.c:247 mm/kasan/report.c:247 __kasan_report mm/kasan/report.c:433 [inline] __kasan_report mm/kasan/report.c:433 [inline] mm/kasan/report.c:450 kasan_report+0x19a/0x1f0 mm/kasan/report.c:450 mm/kasan/report.c:450 nf_hook_entries_grow+0x5a7/0x700 net/netfilter/core.c:142 net/netfilter/core.c:142 __nf_register_net_hook+0x27e/0x8d0 net/netfilter/core.c:429 net/netfilter/core.c:429 nf_register_net_hook+0xaa/0x180 net/netfilter/core.c:571 net/netfilter/core.c:571 nft_register_flowtable_net_hooks+0x3c5/0x730 net/netfilter/nf_tables_api.c:7232 net/netfilter/nf_tables_api.c:7232 nf_tables_newflowtable+0x2022/0x2cf0 net/netfilter/nf_tables_api.c:7430 net/netfilter/nf_tables_api.c:7430 nfnetlink_rcv_batch net/netfilter/nfnetlink.c:513 [inline] nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:634 [inline] nfnetlink_rcv_batch net/netfilter/nfnetlink.c:513 [inline] net/netfilter/nfnetlink.c:652 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:634 [inline] net/netfilter/nfnetlink.c:652 nfnetlink_rcv+0x10e6/0x2550 net/netfilter/nfnetlink.c:652 net/netfilter/nfnetlink.c:652 __nft_release_hook() calls nft_unregister_flowtable_net_hooks() which only unregisters the hooks, then after RCU grace period, it is guaranteed that no packets add new entries to the flowtable (no flow offload rules and flowtable hooks are reachable from packet path), so it is safe to call nf_flow_table_free() which cleans up the remaining entries from the flowtable (both software and hardware) and it unbinds the flow_block. Fixes: ff4bf2f42a40 ("netfilter: nf_tables: add nft_unregister_flowtable_hook()") Reported-by: syzbot+e918523f77e62790d6d9@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5fa16990da95..3081c4399f10 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9636,10 +9636,13 @@ EXPORT_SYMBOL_GPL(__nft_release_basechain); static void __nft_release_hook(struct net *net, struct nft_table *table) { + struct nft_flowtable *flowtable; struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) nf_tables_unregister_hook(net, table, chain); + list_for_each_entry(flowtable, &table->flowtables, list) + nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list); } static void __nft_release_hooks(struct net *net) From 1a58f84ea5df7f026bf92a0009f931bf547fe965 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Feb 2022 13:17:05 +0100 Subject: [PATCH 300/773] netfilter: nft_limit: fix stateful object memory leak We need to provide a destroy callback to release the extra fields. Fixes: 3b9e2ea6c11b ("netfilter: nft_limit: move stateful fields out of expression data") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c4f308460dd1..a726b623963d 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -340,11 +340,20 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb, return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } +static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_limit_priv_pkts *priv = nft_obj_data(obj); + + nft_limit_destroy(ctx, &priv->limit); +} + static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_pkts_ops = { .type = &nft_limit_obj_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .init = nft_limit_obj_pkts_init, + .destroy = nft_limit_obj_pkts_destroy, .eval = nft_limit_obj_pkts_eval, .dump = nft_limit_obj_pkts_dump, }; @@ -378,11 +387,20 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb, return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } +static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_limit_priv *priv = nft_obj_data(obj); + + nft_limit_destroy(ctx, priv); +} + static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_bytes_ops = { .type = &nft_limit_obj_type, .size = sizeof(struct nft_limit_priv), .init = nft_limit_obj_bytes_init, + .destroy = nft_limit_obj_bytes_destroy, .eval = nft_limit_obj_bytes_eval, .dump = nft_limit_obj_bytes_dump, }; From d920eaa4c4559f59be7b4c2d26fa0a2e1aaa3da9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Feb 2022 15:37:38 +0000 Subject: [PATCH 301/773] ARM: Fix kgdb breakpoint for Thumb2 The kgdb code needs to register an undef hook for the Thumb UDF instruction that will fault in order to be functional on Thumb2 platforms. Reported-by: Johannes Stezenbach Tested-by: Johannes Stezenbach Fixes: 5cbad0ebf45c ("kgdb: support for ARCH=arm") Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/kgdb.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index 7bd30c0a4280..22f937e6f3ff 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -154,22 +154,38 @@ static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr) return 0; } -static struct undef_hook kgdb_brkpt_hook = { +static struct undef_hook kgdb_brkpt_arm_hook = { .instr_mask = 0xffffffff, .instr_val = KGDB_BREAKINST, - .cpsr_mask = MODE_MASK, + .cpsr_mask = PSR_T_BIT | MODE_MASK, .cpsr_val = SVC_MODE, .fn = kgdb_brk_fn }; -static struct undef_hook kgdb_compiled_brkpt_hook = { +static struct undef_hook kgdb_brkpt_thumb_hook = { + .instr_mask = 0xffff, + .instr_val = KGDB_BREAKINST & 0xffff, + .cpsr_mask = PSR_T_BIT | MODE_MASK, + .cpsr_val = PSR_T_BIT | SVC_MODE, + .fn = kgdb_brk_fn +}; + +static struct undef_hook kgdb_compiled_brkpt_arm_hook = { .instr_mask = 0xffffffff, .instr_val = KGDB_COMPILED_BREAK, - .cpsr_mask = MODE_MASK, + .cpsr_mask = PSR_T_BIT | MODE_MASK, .cpsr_val = SVC_MODE, .fn = kgdb_compiled_brk_fn }; +static struct undef_hook kgdb_compiled_brkpt_thumb_hook = { + .instr_mask = 0xffff, + .instr_val = KGDB_COMPILED_BREAK & 0xffff, + .cpsr_mask = PSR_T_BIT | MODE_MASK, + .cpsr_val = PSR_T_BIT | SVC_MODE, + .fn = kgdb_compiled_brk_fn +}; + static int __kgdb_notify(struct die_args *args, unsigned long cmd) { struct pt_regs *regs = args->regs; @@ -210,8 +226,10 @@ int kgdb_arch_init(void) if (ret != 0) return ret; - register_undef_hook(&kgdb_brkpt_hook); - register_undef_hook(&kgdb_compiled_brkpt_hook); + register_undef_hook(&kgdb_brkpt_arm_hook); + register_undef_hook(&kgdb_brkpt_thumb_hook); + register_undef_hook(&kgdb_compiled_brkpt_arm_hook); + register_undef_hook(&kgdb_compiled_brkpt_thumb_hook); return 0; } @@ -224,8 +242,10 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_undef_hook(&kgdb_brkpt_hook); - unregister_undef_hook(&kgdb_compiled_brkpt_hook); + unregister_undef_hook(&kgdb_brkpt_arm_hook); + unregister_undef_hook(&kgdb_brkpt_thumb_hook); + unregister_undef_hook(&kgdb_compiled_brkpt_arm_hook); + unregister_undef_hook(&kgdb_compiled_brkpt_thumb_hook); unregister_die_notifier(&kgdb_notifier); } From 11c57c3ba94da74c3446924260e34e0b1950b5d7 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 17 Jan 2022 05:09:40 +0100 Subject: [PATCH 302/773] ARM: 9178/1: fix unmet dependency on BITREVERSE for HAVE_ARCH_BITREVERSE Resending this to properly add it to the patch tracker - thanks for letting me know, Arnd :) When ARM is enabled, and BITREVERSE is disabled, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for HAVE_ARCH_BITREVERSE Depends on [n]: BITREVERSE [=n] Selected by [y]: - ARM [=y] && (CPU_32v7M [=n] || CPU_32v7 [=y]) && !CPU_32v6 [=n] This is because ARM selects HAVE_ARCH_BITREVERSE without selecting BITREVERSE, despite HAVE_ARCH_BITREVERSE depending on BITREVERSE. This unmet dependency bug was found by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. Signed-off-by: Julian Braha Signed-off-by: Russell King (Oracle) --- lib/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig b/lib/Kconfig index 5e7165e6a346..fa4b10322efc 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -45,7 +45,6 @@ config BITREVERSE config HAVE_ARCH_BITREVERSE bool default n - depends on BITREVERSE help This option enables the use of hardware bit-reversal instructions on architectures which support such operations. From 9d2231c5d74e13b2a0546fee6737ee4446017903 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 21 Feb 2022 11:03:13 +0100 Subject: [PATCH 303/773] lib/iov_iter: initialize "flags" in new pipe_buffer The functions copy_page_to_iter_pipe() and push_pipe() can both allocate a new pipe_buffer, but the "flags" member initializer is missing. Fixes: 241699cd72a8 ("new iov_iter flavour: pipe-backed") To: Alexander Viro To: linux-fsdevel@vger.kernel.org To: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Max Kellermann Signed-off-by: Al Viro --- lib/iov_iter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b0e0acdf96c1..6dd5330f7a99 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -414,6 +414,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by return 0; buf->ops = &page_cache_pipe_buf_ops; + buf->flags = 0; get_page(page); buf->page = page; buf->offset = offset; @@ -577,6 +578,7 @@ static size_t push_pipe(struct iov_iter *i, size_t size, break; buf->ops = &default_pipe_buf_ops; + buf->flags = 0; buf->page = page; buf->offset = 0; buf->len = min_t(ssize_t, left, PAGE_SIZE); From f6c052afe6f802d87c74153b7a57c43b2e9faf07 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Sun, 20 Feb 2022 15:14:31 +0000 Subject: [PATCH 304/773] nvmem: core: Fix a conflict between MTD and NVMEM on wp-gpios property Wp-gpios property can be used on NVMEM nodes and the same property can be also used on MTD NAND nodes. In case of the wp-gpios property is defined at NAND level node, the GPIO management is done at NAND driver level. Write protect is disabled when the driver is probed or resumed and is enabled when the driver is released or suspended. When no partitions are defined in the NAND DT node, then the NAND DT node will be passed to NVMEM framework. If wp-gpios property is defined in this node, the GPIO resource is taken twice and the NAND controller driver fails to probe. It would be possible to set config->wp_gpio at MTD level before calling nvmem_register function but NVMEM framework will toggle this GPIO on each write when this GPIO should only be controlled at NAND level driver to ensure that the Write Protect has not been enabled. A way to fix this conflict is to add a new boolean flag in nvmem_config named ignore_wp. In case ignore_wp is set, the GPIO resource will be managed by the provider. Fixes: 2a127da461a9 ("nvmem: add support for the write-protect pin") Cc: stable@vger.kernel.org Signed-off-by: Christophe Kerello Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220220151432.16605-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 2 +- include/linux/nvmem-provider.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 23a38dcf0fc4..9fd1602b539d 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -771,7 +771,7 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) if (config->wp_gpio) nvmem->wp_gpio = config->wp_gpio; - else + else if (!config->ignore_wp) nvmem->wp_gpio = gpiod_get_optional(config->dev, "wp", GPIOD_OUT_HIGH); if (IS_ERR(nvmem->wp_gpio)) { diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 98efb7b5660d..c9a3ac9efeaa 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -70,7 +70,8 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. - * @wp-gpio: Write protect pin + * @wp-gpio: Write protect pin + * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is @@ -92,6 +93,7 @@ struct nvmem_config { enum nvmem_type type; bool read_only; bool root_only; + bool ignore_wp; struct device_node *of_node; bool no_of_node; nvmem_reg_read_t reg_read; From 6c7621890995d089a56a06d11580d185ede7c2f8 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Sun, 20 Feb 2022 15:14:32 +0000 Subject: [PATCH 305/773] mtd: core: Fix a conflict between MTD and NVMEM on wp-gpios property Wp-gpios property can be used on NVMEM nodes and the same property can be also used on MTD NAND nodes. In case of the wp-gpios property is defined at NAND level node, the GPIO management is done at NAND driver level. Write protect is disabled when the driver is probed or resumed and is enabled when the driver is released or suspended. When no partitions are defined in the NAND DT node, then the NAND DT node will be passed to NVMEM framework. If wp-gpios property is defined in this node, the GPIO resource is taken twice and the NAND controller driver fails to probe. A new Boolean flag named ignore_wp has been added in nvmem_config. In case ignore_wp is set, it means that the GPIO is handled by the provider. Lets set this flag in MTD layer to avoid the conflict on wp_gpios property. Fixes: 2a127da461a9 ("nvmem: add support for the write-protect pin") Cc: stable@vger.kernel.org Acked-by: Miquel Raynal Signed-off-by: Christophe Kerello Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220220151432.16605-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/mtdcore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 70f492dce158..eef87b28d6c8 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -546,6 +546,7 @@ static int mtd_nvmem_add(struct mtd_info *mtd) config.stride = 1; config.read_only = true; config.root_only = true; + config.ignore_wp = true; config.no_of_node = !of_device_is_compatible(node, "nvmem-cells"); config.priv = mtd; @@ -833,6 +834,7 @@ static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd, config.owner = THIS_MODULE; config.type = NVMEM_TYPE_OTP; config.root_only = true; + config.ignore_wp = true; config.reg_read = reg_read; config.size = size; config.of_node = np; From 737b0ef3be6b319d6c1fd64193d1603311969326 Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:17 -0800 Subject: [PATCH 306/773] tty: n_gsm: fix encoding of control signal octet bit DV n_gsm is based on the 3GPP 07.010 and its newer version is the 3GPP 27.010. See https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=1516 The changes from 07.010 to 27.010 are non-functional. Therefore, I refer to the newer 27.010 here. Chapter 5.4.6.3.7 describes the encoding of the control signal octet used by the MSC (modem status command). The same encoding is also used in convergence layer type 2 as described in chapter 5.5.2. Table 7 and 24 both require the DV (data valid) bit to be set 1 for outgoing control signal octets sent by the DTE (data terminal equipment), i.e. for the initiator side. Currently, the DV bit is only set if CD (carrier detect) is on, regardless of the side. This patch fixes this behavior by setting the DV bit on the initiator side unconditionally. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 0b1808e3a912..e199315a158e 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -439,7 +439,7 @@ static u8 gsm_encode_modem(const struct gsm_dlci *dlci) modembits |= MDM_RTR; if (dlci->modem_tx & TIOCM_RI) modembits |= MDM_IC; - if (dlci->modem_tx & TIOCM_CD) + if (dlci->modem_tx & TIOCM_CD || dlci->gsm->initiator) modembits |= MDM_DV; return modembits; } From 57435c42400ec147a527b2313188b649e81e449e Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:18 -0800 Subject: [PATCH 307/773] tty: n_gsm: fix encoding of command/response bit n_gsm is based on the 3GPP 07.010 and its newer version is the 3GPP 27.010. See https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=1516 The changes from 07.010 to 27.010 are non-functional. Therefore, I refer to the newer 27.010 here. Chapter 5.2.1.2 describes the encoding of the C/R (command/response) bit. Table 1 shows that the actual encoding of the C/R bit is inverted if the associated frame is sent by the responder. The referenced commit fixed here further broke the internal meaning of this bit in the outgoing path by always setting the C/R bit regardless of the frame type. This patch fixes both by setting the C/R bit always consistently for command (1) and response (0) frames and inverting it later for the responder where necessary. The meaning of this bit in the debug output is being preserved and shows the bit as if it was encoded by the initiator. This reflects only the frame type rather than the encoded combination of communication side and frame type. Fixes: cc0f42122a7e ("tty: n_gsm: Modify CR,PF bit when config requester") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-2-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index e199315a158e..db41b9e5d35e 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -448,7 +448,7 @@ static u8 gsm_encode_modem(const struct gsm_dlci *dlci) * gsm_print_packet - display a frame for debug * @hdr: header to print before decode * @addr: address EA from the frame - * @cr: C/R bit from the frame + * @cr: C/R bit seen as initiator * @control: control including PF bit * @data: following data bytes * @dlen: length of data @@ -548,7 +548,7 @@ static int gsm_stuff_frame(const u8 *input, u8 *output, int len) * gsm_send - send a control frame * @gsm: our GSM mux * @addr: address for control frame - * @cr: command/response bit + * @cr: command/response bit seen as initiator * @control: control byte including PF bit * * Format up and transmit a control frame. These do not go via the @@ -563,11 +563,15 @@ static void gsm_send(struct gsm_mux *gsm, int addr, int cr, int control) int len; u8 cbuf[10]; u8 ibuf[3]; + int ocr; + + /* toggle C/R coding if not initiator */ + ocr = cr ^ (gsm->initiator ? 0 : 1); switch (gsm->encoding) { case 0: cbuf[0] = GSM0_SOF; - cbuf[1] = (addr << 2) | (cr << 1) | EA; + cbuf[1] = (addr << 2) | (ocr << 1) | EA; cbuf[2] = control; cbuf[3] = EA; /* Length of data = 0 */ cbuf[4] = 0xFF - gsm_fcs_add_block(INIT_FCS, cbuf + 1, 3); @@ -577,7 +581,7 @@ static void gsm_send(struct gsm_mux *gsm, int addr, int cr, int control) case 1: case 2: /* Control frame + packing (but not frame stuffing) in mode 1 */ - ibuf[0] = (addr << 2) | (cr << 1) | EA; + ibuf[0] = (addr << 2) | (ocr << 1) | EA; ibuf[1] = control; ibuf[2] = 0xFF - gsm_fcs_add_block(INIT_FCS, ibuf, 2); /* Stuffing may double the size worst case */ @@ -611,7 +615,7 @@ static void gsm_send(struct gsm_mux *gsm, int addr, int cr, int control) static inline void gsm_response(struct gsm_mux *gsm, int addr, int control) { - gsm_send(gsm, addr, 1, control); + gsm_send(gsm, addr, 0, control); } /** @@ -1800,10 +1804,10 @@ static void gsm_queue(struct gsm_mux *gsm) goto invalid; cr = gsm->address & 1; /* C/R bit */ + cr ^= gsm->initiator ? 0 : 1; /* Flip so 1 always means command */ gsm_print_packet("<--", address, cr, gsm->control, gsm->buf, gsm->len); - cr ^= 1 - gsm->initiator; /* Flip so 1 always means command */ dlci = gsm->dlci[address]; switch (gsm->control) { From e3b7468f082d106459e86e8dc6fb9bdd65553433 Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:19 -0800 Subject: [PATCH 308/773] tty: n_gsm: fix proper link termination after failed open Trying to open a DLCI by sending a SABM frame may fail with a timeout. The link is closed on the initiator side without informing the responder about this event. The responder assumes the link is open after sending a UA frame to answer the SABM frame. The link gets stuck in a half open state. This patch fixes this by initiating the proper link termination procedure after link setup timeout instead of silently closing it down. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-3-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index db41b9e5d35e..53fa1cec49bd 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -1518,7 +1518,7 @@ static void gsm_dlci_t1(struct timer_list *t) dlci->mode = DLCI_MODE_ADM; gsm_dlci_open(dlci); } else { - gsm_dlci_close(dlci); + gsm_dlci_begin_close(dlci); /* prevent half open link */ } break; From 96b169f05cdcc844b400695184d77e42071d14f2 Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:20 -0800 Subject: [PATCH 309/773] tty: n_gsm: fix NULL pointer access due to DLCI release The here fixed commit made the tty hangup asynchronous to avoid a circular locking warning. I could not reproduce this warning. Furthermore, due to the asynchronous hangup the function call now gets queued up while the underlying tty is being freed. Depending on the timing this results in a NULL pointer access in the global work queue scheduler. To be precise in process_one_work(). Therefore, the previous commit made the issue worse which it tried to fix. This patch fixes this by falling back to the old behavior which uses a blocking tty hangup call before freeing up the associated tty. Fixes: 7030082a7415 ("tty: n_gsm: avoid recursive locking with async port hangup") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-4-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 53fa1cec49bd..79397a40a8f8 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -1752,7 +1752,12 @@ static void gsm_dlci_release(struct gsm_dlci *dlci) gsm_destroy_network(dlci); mutex_unlock(&dlci->mutex); - tty_hangup(tty); + /* We cannot use tty_hangup() because in tty_kref_put() the tty + * driver assumes that the hangup queue is free and reuses it to + * queue release_one_tty() -> NULL pointer panic in + * process_one_work(). + */ + tty_vhangup(tty); tty_port_tty_set(&dlci->port, NULL); tty_kref_put(tty); From c19d93542a6081577e6da9bf5e887979c72e80c1 Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:21 -0800 Subject: [PATCH 310/773] tty: n_gsm: fix wrong tty control line for flow control tty flow control is handled via gsmtty_throttle() and gsmtty_unthrottle(). Both functions propagate the outgoing hardware flow control state to the remote side via MSC (modem status command) frames. The local state is taken from the RTS (ready to send) flag of the tty. However, RTS gets mapped to DTR (data terminal ready), which is wrong. This patch corrects this by mapping RTS to RTS. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-5-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 79397a40a8f8..b4ddac0124b9 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -3243,9 +3243,9 @@ static void gsmtty_throttle(struct tty_struct *tty) if (dlci->state == DLCI_CLOSED) return; if (C_CRTSCTS(tty)) - dlci->modem_tx &= ~TIOCM_DTR; + dlci->modem_tx &= ~TIOCM_RTS; dlci->throttled = true; - /* Send an MSC with DTR cleared */ + /* Send an MSC with RTS cleared */ gsmtty_modem_update(dlci, 0); } @@ -3255,9 +3255,9 @@ static void gsmtty_unthrottle(struct tty_struct *tty) if (dlci->state == DLCI_CLOSED) return; if (C_CRTSCTS(tty)) - dlci->modem_tx |= TIOCM_DTR; + dlci->modem_tx |= TIOCM_RTS; dlci->throttled = false; - /* Send an MSC with DTR set */ + /* Send an MSC with RTS set */ gsmtty_modem_update(dlci, 0); } From 687f9ad43c52501f46164758e908a5dd181a87fc Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:22 -0800 Subject: [PATCH 311/773] tty: n_gsm: fix wrong modem processing in convergence layer type 2 The function gsm_process_modem() exists to handle modem status bits of incoming frames. This includes incoming MSC (modem status command) frames and convergence layer type 2 data frames. The function, however, was only designed to handle MSC frames as it expects the command length. Within gsm_dlci_data() it is wrongly assumed that this is the same as the data frame length. This is only true if the data frame contains only 1 byte of payload. This patch names the length parameter of gsm_process_modem() in a generic manner to reflect its association. It also corrects all calls to the function to handle the variable number of modem status octets correctly in both cases. Fixes: 7263287af93d ("tty: n_gsm: Fixed logic to decode break signal from modem status") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-6-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index b4ddac0124b9..8ff5c0c61233 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -1021,25 +1021,25 @@ static void gsm_control_reply(struct gsm_mux *gsm, int cmd, const u8 *data, * @tty: virtual tty bound to the DLCI * @dlci: DLCI to affect * @modem: modem bits (full EA) - * @clen: command length + * @slen: number of signal octets * * Used when a modem control message or line state inline in adaption * layer 2 is processed. Sort out the local modem state and throttles */ static void gsm_process_modem(struct tty_struct *tty, struct gsm_dlci *dlci, - u32 modem, int clen) + u32 modem, int slen) { int mlines = 0; u8 brk = 0; int fc; - /* The modem status command can either contain one octet (v.24 signals) - or two octets (v.24 signals + break signals). The length field will - either be 2 or 3 respectively. This is specified in section - 5.4.6.3.7 of the 27.010 mux spec. */ + /* The modem status command can either contain one octet (V.24 signals) + * or two octets (V.24 signals + break signals). This is specified in + * section 5.4.6.3.7 of the 07.10 mux spec. + */ - if (clen == 2) + if (slen == 1) modem = modem & 0x7f; else { brk = modem & 0x7f; @@ -1096,6 +1096,7 @@ static void gsm_control_modem(struct gsm_mux *gsm, const u8 *data, int clen) unsigned int brk = 0; struct gsm_dlci *dlci; int len = clen; + int slen; const u8 *dp = data; struct tty_struct *tty; @@ -1115,6 +1116,7 @@ static void gsm_control_modem(struct gsm_mux *gsm, const u8 *data, int clen) return; dlci = gsm->dlci[addr]; + slen = len; while (gsm_read_ea(&modem, *dp++) == 0) { len--; if (len == 0) @@ -1131,7 +1133,7 @@ static void gsm_control_modem(struct gsm_mux *gsm, const u8 *data, int clen) modem |= (brk & 0x7f); } tty = tty_port_tty_get(&dlci->port); - gsm_process_modem(tty, dlci, modem, clen); + gsm_process_modem(tty, dlci, modem, slen); if (tty) { tty_wakeup(tty); tty_kref_put(tty); @@ -1597,6 +1599,7 @@ static void gsm_dlci_data(struct gsm_dlci *dlci, const u8 *data, int clen) struct tty_struct *tty; unsigned int modem = 0; int len = clen; + int slen = 0; if (debug & 16) pr_debug("%d bytes for tty\n", len); @@ -1609,12 +1612,14 @@ static void gsm_dlci_data(struct gsm_dlci *dlci, const u8 *data, int clen) case 2: /* Asynchronous serial with line state in each frame */ while (gsm_read_ea(&modem, *data++) == 0) { len--; + slen++; if (len == 0) return; } + slen++; tty = tty_port_tty_get(port); if (tty) { - gsm_process_modem(tty, dlci, modem, clen); + gsm_process_modem(tty, dlci, modem, slen); tty_kref_put(tty); } fallthrough; From a2ab75b8e76e455af7867e3835fd9cdf386b508f Mon Sep 17 00:00:00 2001 From: "daniel.starke@siemens.com" Date: Thu, 17 Feb 2022 23:31:23 -0800 Subject: [PATCH 312/773] tty: n_gsm: fix deadlock in gsmtty_open() In the current implementation the user may open a virtual tty which then could fail to establish the underlying DLCI. The function gsmtty_open() gets stuck in tty_port_block_til_ready() while waiting for a carrier rise. This happens if the remote side fails to acknowledge the link establishment request in time or completely. At some point gsm_dlci_close() is called to abort the link establishment attempt. The function tries to inform the associated virtual tty by performing a hangup. But the blocking loop within tty_port_block_til_ready() is not informed about this event. The patch proposed here fixes this by resetting the initialization state of the virtual tty to ensure the loop exits and triggering it to make tty_port_block_til_ready() return. Fixes: e1eaea46bb40 ("tty: n_gsm line discipline") Cc: stable@vger.kernel.org Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20220218073123.2121-7-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 8ff5c0c61233..fa92f727fdf8 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -1457,6 +1457,9 @@ static void gsm_dlci_close(struct gsm_dlci *dlci) if (dlci->addr != 0) { tty_port_tty_hangup(&dlci->port, false); kfifo_reset(&dlci->fifo); + /* Ensure that gsmtty_open() can return. */ + tty_port_set_initialized(&dlci->port, 0); + wake_up_interruptible(&dlci->port.open_wait); } else dlci->gsm->dead = true; /* Unregister gsmtty driver,report gsmtty dev remove uevent for user */ From eebb0f4e894f1e9577a56b337693d1051dd6ebfd Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Wed, 16 Feb 2022 16:08:02 +0000 Subject: [PATCH 313/773] sc16is7xx: Fix for incorrect data being transmitted UART drivers are meant to use the port spinlock within certain methods, to protect against reentrancy. The sc16is7xx driver does very little locking, presumably because when added it triggers "scheduling while atomic" errors. This is due to the use of mutexes within the regmap abstraction layer, and the mutex implementation's habit of sleeping the current thread while waiting for access. Unfortunately this lack of interlocking can lead to corruption of outbound data, which occurs when the buffer used for I2C transmission is used simultaneously by two threads - a work queue thread running sc16is7xx_tx_proc, and an IRQ thread in sc16is7xx_port_irq, both of which can call sc16is7xx_handle_tx. An earlier patch added efr_lock, a mutex that controls access to the EFR register. This mutex is already claimed in the IRQ handler, and all that is required is to claim the same mutex in sc16is7xx_tx_proc. See: https://github.com/raspberrypi/linux/issues/4885 Fixes: 6393ff1c4435 ("sc16is7xx: Use threaded IRQ") Cc: stable Signed-off-by: Phil Elwell Link: https://lore.kernel.org/r/20220216160802.1026013-1-phil@raspberrypi.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 64e7e6c8145f..38d1c0748533 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -734,12 +734,15 @@ static irqreturn_t sc16is7xx_irq(int irq, void *dev_id) static void sc16is7xx_tx_proc(struct kthread_work *ws) { struct uart_port *port = &(to_sc16is7xx_one(ws, tx_work)->port); + struct sc16is7xx_port *s = dev_get_drvdata(port->dev); if ((port->rs485.flags & SER_RS485_ENABLED) && (port->rs485.delay_rts_before_send > 0)) msleep(port->rs485.delay_rts_before_send); + mutex_lock(&s->efr_lock); sc16is7xx_handle_tx(port); + mutex_unlock(&s->efr_lock); } static void sc16is7xx_reconf_rs485(struct uart_port *port) From 1432108d00e42ffa383240bcac8d58f89ae19104 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Sat, 12 Feb 2022 16:40:00 +0100 Subject: [PATCH 314/773] drm/amd/display: Protect update_bw_bounding_box FPU code. For DCN3/3.01/3.02 at least these use the fpu. v2: squash in build fix for when DCN is not enabled (Leo) Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c | 2 ++ drivers/gpu/drm/amd/display/dc/core/dc.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c index f977f29907df..10c7be40dfb0 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn30/dcn30_clk_mgr.c @@ -473,8 +473,10 @@ static void dcn3_get_memclk_states_from_smu(struct clk_mgr *clk_mgr_base) clk_mgr_base->bw_params->dc_mode_softmax_memclk = dcn30_smu_get_dc_mode_max_dpm_freq(clk_mgr, PPCLK_UCLK); /* Refresh bounding box */ + DC_FP_START(); clk_mgr_base->ctx->dc->res_pool->funcs->update_bw_bounding_box( clk_mgr->base.ctx->dc, clk_mgr_base->bw_params); + DC_FP_END(); } static bool dcn3_is_smu_present(struct clk_mgr *clk_mgr_base) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index d18e9f3ea998..ba1aa994db4b 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -985,10 +985,13 @@ static bool dc_construct(struct dc *dc, goto fail; #ifdef CONFIG_DRM_AMD_DC_DCN dc->clk_mgr->force_smu_not_present = init_params->force_smu_not_present; -#endif - if (dc->res_pool->funcs->update_bw_bounding_box) + if (dc->res_pool->funcs->update_bw_bounding_box) { + DC_FP_START(); dc->res_pool->funcs->update_bw_bounding_box(dc, dc->clk_mgr->bw_params); + DC_FP_END(); + } +#endif /* Creation of current_state must occur after dc->dml * is initialized in dc_create_resource_pool because From f626dd0ff05043e5a7154770cc7cda66acee33a3 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 20 Jan 2022 16:15:52 +0800 Subject: [PATCH 315/773] drm/amdgpu: disable MMHUB PG for Picasso MMHUB PG needs to be disabled for Picasso for stability reasons. Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc15.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 0fc1747e4a70..d0e37f7f196c 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1114,8 +1114,11 @@ static int soc15_common_early_init(void *handle) AMD_CG_SUPPORT_SDMA_LS | AMD_CG_SUPPORT_VCN_MGCG; + /* + * MMHUB PG needs to be disabled for Picasso for + * stability reasons. + */ adev->pg_flags = AMD_PG_SUPPORT_SDMA | - AMD_PG_SUPPORT_MMHUB | AMD_PG_SUPPORT_VCN; } else { adev->cg_flags = AMD_CG_SUPPORT_GFX_MGCG | From e3f3824874da78db5775a5cb9c0970cd1c6978bc Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 20 Jan 2022 19:16:19 +0800 Subject: [PATCH 316/773] drm/amd/pm: fix some OEM SKU specific stability issues Add a quirk in sienna_cichlid_ppt.c to fix some OEM SKU specific stability issues. Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 2320bd750876..5488a0edb942 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -421,6 +421,36 @@ static int sienna_cichlid_store_powerplay_table(struct smu_context *smu) return 0; } +static int sienna_cichlid_patch_pptable_quirk(struct smu_context *smu) +{ + struct amdgpu_device *adev = smu->adev; + uint32_t *board_reserved; + uint16_t *freq_table_gfx; + uint32_t i; + + /* Fix some OEM SKU specific stability issues */ + GET_PPTABLE_MEMBER(BoardReserved, &board_reserved); + if ((adev->pdev->device == 0x73DF) && + (adev->pdev->revision == 0XC3) && + (adev->pdev->subsystem_device == 0x16C2) && + (adev->pdev->subsystem_vendor == 0x1043)) + board_reserved[0] = 1387; + + GET_PPTABLE_MEMBER(FreqTableGfx, &freq_table_gfx); + if ((adev->pdev->device == 0x73DF) && + (adev->pdev->revision == 0XC3) && + ((adev->pdev->subsystem_device == 0x16C2) || + (adev->pdev->subsystem_device == 0x133C)) && + (adev->pdev->subsystem_vendor == 0x1043)) { + for (i = 0; i < NUM_GFXCLK_DPM_LEVELS; i++) { + if (freq_table_gfx[i] > 2500) + freq_table_gfx[i] = 2500; + } + } + + return 0; +} + static int sienna_cichlid_setup_pptable(struct smu_context *smu) { int ret = 0; @@ -441,7 +471,7 @@ static int sienna_cichlid_setup_pptable(struct smu_context *smu) if (ret) return ret; - return ret; + return sienna_cichlid_patch_pptable_quirk(smu); } static int sienna_cichlid_tables_init(struct smu_context *smu) From 4d22336f903930eb94588b939c310743a3640276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 15 Feb 2022 19:53:37 +0100 Subject: [PATCH 317/773] drm/amd/display: For vblank_disable_immediate, check PSR is really used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if PSR is allowed for a present GPU, there might be no eDP link which supports PSR. Fixes: 708978487304 ("drm/amdgpu/display: Only set vblank_disable_immediate when PSR is not enabled") Reviewed-by: Harry Wentland Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7c1c623ba799..075429bea427 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4256,6 +4256,9 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) } #endif + /* Disable vblank IRQs aggressively for power-saving. */ + adev_to_drm(adev)->vblank_disable_immediate = true; + /* loops over all connectors on the board */ for (i = 0; i < link_cnt; i++) { struct dc_link *link = NULL; @@ -4301,19 +4304,17 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) update_connector_ext_caps(aconnector); if (psr_feature_enabled) amdgpu_dm_set_psr_caps(link); + + /* TODO: Fix vblank control helpers to delay PSR entry to allow this when + * PSR is also supported. + */ + if (link->psr_settings.psr_feature_enabled) + adev_to_drm(adev)->vblank_disable_immediate = false; } } - /* - * Disable vblank IRQs aggressively for power-saving. - * - * TODO: Fix vblank control helpers to delay PSR entry to allow this when PSR - * is also supported. - */ - adev_to_drm(adev)->vblank_disable_immediate = !psr_feature_enabled; - /* Software is initialized. Now we can register interrupt handlers. */ switch (adev->asic_type) { #if defined(CONFIG_DRM_AMD_DC_SI) From 5f6b0f2d037c8864f20ff15311c695f65eb09db5 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 19 Feb 2022 23:04:29 +0300 Subject: [PATCH 318/773] ata: pata_hpt37x: fix PCI clock detection The f_CNT register (at the PCI config. address 0x78) is 16-bit, not 8-bit! The bug was there from the very start... :-( Signed-off-by: Sergey Shtylyov Fixes: 669a5db411d8 ("[libata] Add a bunch of PATA drivers.") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal --- drivers/ata/pata_hpt37x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c index 7abc7e04f656..1baaca7b72ed 100644 --- a/drivers/ata/pata_hpt37x.c +++ b/drivers/ata/pata_hpt37x.c @@ -950,14 +950,14 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id) if ((freq >> 12) != 0xABCDE) { int i; - u8 sr; + u16 sr; u32 total = 0; dev_warn(&dev->dev, "BIOS has not set timing clocks\n"); /* This is the process the HPT371 BIOS is reported to use */ for (i = 0; i < 128; i++) { - pci_read_config_byte(dev, 0x78, &sr); + pci_read_config_word(dev, 0x78, &sr); total += sr & 0x1FF; udelay(15); } From dad3bdeef45f81a6e90204bcc85360bb76eccec7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 21 Feb 2022 13:31:49 +0100 Subject: [PATCH 319/773] netfilter: nf_tables: fix memory leak during stateful obj update stateful objects can be updated from the control plane. The transaction logic allocates a temporary object for this purpose. The ->init function was called for this object, so plain kfree() leaks resources. We must call ->destroy function of the object. nft_obj_destroy does this, but it also decrements the module refcount, but the update path doesn't increment it. To avoid special-casing the update object release, do module_get for the update case too and release it via nft_obj_destroy(). Fixes: d62d0ba97b58 ("netfilter: nf_tables: Introduce stateful object update operation") Cc: Fernando Fernandez Mancera Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3081c4399f10..9cd1d7a62804 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6551,12 +6551,15 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, { struct nft_object *newobj; struct nft_trans *trans; - int err; + int err = -ENOMEM; + + if (!try_module_get(type->owner)) + return -ENOENT; trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) - return -ENOMEM; + goto err_trans; newobj = nft_obj_init(ctx, type, attr); if (IS_ERR(newobj)) { @@ -6573,6 +6576,8 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, err_free_trans: kfree(trans); +err_trans: + module_put(type->owner); return err; } @@ -8185,7 +8190,7 @@ static void nft_obj_commit_update(struct nft_trans *trans) if (obj->ops->update) obj->ops->update(obj, newobj); - kfree(newobj); + nft_obj_destroy(&trans->ctx, newobj); } static void nft_commit_release(struct nft_trans *trans) @@ -8976,7 +8981,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { - kfree(nft_trans_obj_newobj(trans)); + nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { trans->ctx.table->use--; From d8f7a5484f2188e9af2d9e4e587587d724501b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5rten=20Lindahl?= Date: Wed, 16 Feb 2022 10:41:28 +0100 Subject: [PATCH 320/773] driver core: Free DMA range map when device is released MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unbinding/binding a driver with DMA mapped memory, the DMA map is not freed before the driver is reloaded. This leads to a memory leak when the DMA map is overwritten when reprobing the driver. This can be reproduced with a platform driver having a dma-range: dummy { ... #address-cells = <0x2>; #size-cells = <0x2>; ranges; dma-ranges = <...>; ... }; and then unbinding/binding it: ~# echo soc:dummy >/sys/bus/platform/drivers//unbind DMA map object 0xffffff800b0ae540 still being held by &pdev->dev ~# echo soc:dummy >/sys/bus/platform/drivers//bind ~# echo scan > /sys/kernel/debug/kmemleak ~# cat /sys/kernel/debug/kmemleak unreferenced object 0xffffff800b0ae540 (size 64): comm "sh", pid 833, jiffies 4295174550 (age 2535.352s) hex dump (first 32 bytes): 00 00 00 80 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 80 00 00 00 00 00 00 00 80 00 00 00 00 ................ backtrace: [] create_object.isra.0+0x108/0x344 [] kmemleak_alloc+0x8c/0xd0 [] __kmalloc+0x440/0x6f0 [] of_dma_get_range+0x124/0x220 [] of_dma_configure_id+0x40/0x2d0 [] platform_dma_configure+0x5c/0xa4 [] really_probe+0x8c/0x514 [] __driver_probe_device+0x9c/0x19c [] device_driver_attach+0x54/0xbc [] bind_store+0xc4/0x120 [] drv_attr_store+0x30/0x44 [] sysfs_kf_write+0x50/0x60 [] kernfs_fop_write_iter+0x124/0x1b4 [] new_sync_write+0xdc/0x160 [] vfs_write+0x23c/0x2a0 [] ksys_write+0x64/0xec To prevent this we should free the dma_range_map when the device is released. Fixes: e0d072782c73 ("dma-mapping: introduce DMA range map, supplanting dma_pfn_offset") Cc: stable Suggested-by: Rob Herring Reviewed-by: Rob Herring Signed-off-by: Mårten Lindahl Link: https://lore.kernel.org/r/20220216094128.4025861-1-marten.lindahl@axis.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 9eaaff2f556c..f47cab21430f 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -629,6 +629,9 @@ re_probe: drv->remove(dev); devres_release_all(dev); + arch_teardown_dma_ops(dev); + kfree(dev->dma_range_map); + dev->dma_range_map = NULL; driver_sysfs_remove(dev); dev->driver = NULL; dev_set_drvdata(dev, NULL); @@ -1209,6 +1212,8 @@ static void __device_release_driver(struct device *dev, struct device *parent) devres_release_all(dev); arch_teardown_dma_ops(dev); + kfree(dev->dma_range_map); + dev->dma_range_map = NULL; dev->driver = NULL; dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) From 1e6ae0e46e32749b130f1823da30cea9aa2a59a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Feb 2022 09:50:29 -0800 Subject: [PATCH 321/773] mips: setup: fix setnocoherentio() boolean setting Correct a typo/pasto: setnocoherentio() should set dma_default_coherent to false, not true. Fixes: 14ac09a65e19 ("MIPS: refactor the runtime coherent vs noncoherent DMA indicators") Signed-off-by: Randy Dunlap Cc: Christoph Hellwig Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index f979adfd4fc2..ef73ba1e0ec1 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -803,7 +803,7 @@ early_param("coherentio", setcoherentio); static int __init setnocoherentio(char *str) { - dma_default_coherent = true; + dma_default_coherent = false; pr_info("Software DMA cache coherency (command line)\n"); return 0; } From f0fdfc04fd974cea23351b830fcac0822ea19a51 Mon Sep 17 00:00:00 2001 From: Adam Ward Date: Tue, 22 Feb 2022 00:27:42 +0000 Subject: [PATCH 322/773] regulator: da9121: Fix DA914x current values Update DA9141/2 ranges to correct errors Signed-off-by: Adam Ward Link: https://lore.kernel.org/r/cd5732c5061ce49dcfbcebb306d12ba1664b4ea6.1645489455.git.Adam.Ward.opensource@diasemi.com Signed-off-by: Mark Brown --- drivers/regulator/da9121-regulator.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/regulator/da9121-regulator.c b/drivers/regulator/da9121-regulator.c index 6f21223a488e..39d77726970c 100644 --- a/drivers/regulator/da9121-regulator.c +++ b/drivers/regulator/da9121-regulator.c @@ -87,16 +87,16 @@ static struct da9121_range da9121_3A_1phase_current = { }; static struct da9121_range da914x_40A_4phase_current = { - .val_min = 14000000, - .val_max = 80000000, - .val_stp = 2000000, + .val_min = 26000000, + .val_max = 78000000, + .val_stp = 4000000, .reg_min = 1, .reg_max = 14, }; static struct da9121_range da914x_20A_2phase_current = { - .val_min = 7000000, - .val_max = 40000000, + .val_min = 13000000, + .val_max = 39000000, .val_stp = 2000000, .reg_min = 1, .reg_max = 14, From c8c57fbc1c5067b913077e948c7d957af6834ba3 Mon Sep 17 00:00:00 2001 From: Adam Ward Date: Tue, 22 Feb 2022 00:27:43 +0000 Subject: [PATCH 323/773] regulator: da9121: Fix DA914x voltage value Update DA9141/2 max voltage to match spec change Signed-off-by: Adam Ward Link: https://lore.kernel.org/r/9d1ec5b6db70d27f56d05b8a0139fc0840f03e20.1645489455.git.Adam.Ward.opensource@diasemi.com Signed-off-by: Mark Brown --- drivers/regulator/da9121-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/da9121-regulator.c b/drivers/regulator/da9121-regulator.c index 39d77726970c..f16649dec17e 100644 --- a/drivers/regulator/da9121-regulator.c +++ b/drivers/regulator/da9121-regulator.c @@ -561,7 +561,7 @@ static const struct regulator_desc da9217_reg = { }; #define DA914X_MIN_MV 500 -#define DA914X_MAX_MV 1000 +#define DA914X_MAX_MV 1300 #define DA914X_STEP_MV 10 #define DA914X_MIN_SEL (DA914X_MIN_MV / DA914X_STEP_MV) #define DA914X_N_VOLTAGES (((DA914X_MAX_MV - DA914X_MIN_MV) / DA914X_STEP_MV) \ From 9c7cf33c53ce833b58de9e5c192b4736dbd09cb1 Mon Sep 17 00:00:00 2001 From: Adam Ward Date: Tue, 22 Feb 2022 00:27:44 +0000 Subject: [PATCH 324/773] regulator: da9121: Remove surplus DA9141 parameters Remove ramp_delay/enable_time values - subject to OTP, incorrect Signed-off-by: Adam Ward Link: https://lore.kernel.org/r/a175201b4a7ea323c6a70d77f7f6d2124bfc0bed.1645489455.git.Adam.Ward.opensource@diasemi.com Signed-off-by: Mark Brown --- drivers/regulator/da9121-regulator.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/regulator/da9121-regulator.c b/drivers/regulator/da9121-regulator.c index f16649dec17e..eb9df485bd8a 100644 --- a/drivers/regulator/da9121-regulator.c +++ b/drivers/regulator/da9121-regulator.c @@ -585,10 +585,6 @@ static const struct regulator_desc da9141_reg = { .vsel_mask = DA9121_MASK_BUCK_BUCKx_5_CHx_A_VOUT, .enable_reg = DA9121_REG_BUCK_BUCK1_0, .enable_mask = DA9121_MASK_BUCK_BUCKx_0_CHx_EN, - /* Default value of BUCK_BUCK1_0.CH1_SRC_DVC_UP */ - .ramp_delay = 20000, - /* tBUCK_EN */ - .enable_time = 20, }; static const struct regulator_desc da9142_reg = { From bb49c6fa8b845591b317b0d7afea4ae60ec7f3aa Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 11 Feb 2022 10:01:36 +0100 Subject: [PATCH 325/773] block: clear iocb->private in blkdev_bio_end_io_async() iocb_bio_iopoll() expects iocb->private to be cleared before releasing the bio. We already do this in blkdev_bio_end_io(), but we forgot in the recently added blkdev_bio_end_io_async(). Fixes: 54a88eb838d3 ("block: add single bio async direct IO helper") Cc: asml.silence@gmail.com Signed-off-by: Stefano Garzarella Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220211090136.44471-1-sgarzare@redhat.com Signed-off-by: Jens Axboe --- block/fops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/fops.c b/block/fops.c index 4f59e0f5bf30..a18e7fbd97b8 100644 --- a/block/fops.c +++ b/block/fops.c @@ -289,6 +289,8 @@ static void blkdev_bio_end_io_async(struct bio *bio) struct kiocb *iocb = dio->iocb; ssize_t ret; + WRITE_ONCE(iocb->private, NULL); + if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; From 93b71801a8274cd9511557faf04365a5de487197 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 09:06:54 -0500 Subject: [PATCH 326/773] KVM: PPC: reserve capability 210 for KVM_CAP_PPC_AIL_MODE_3 Add KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. Reviewed-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ include/uapi/linux/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 1 + 3 files changed, 16 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a4267104db50..9954568c7eab 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6997,6 +6997,20 @@ indicated by the fd to the VM this is called on. This is intended to support intra-host migration of VMs between userspace VMMs, upgrading the VMM process without interrupting the guest. +7.30 KVM_CAP_PPC_AIL_MODE_3 +------------------------------- + +:Capability: KVM_CAP_PPC_AIL_MODE_3 +:Architectures: ppc +:Type: vm + +This capability indicates that the kernel supports the mode 3 setting for the +"Address Translation Mode on Interrupt" aka "Alternate Interrupt Location" +resource that is controlled with the H_SET_MODE hypercall. + +This capability allows a guest kernel to use a better-performance mode for +handling interrupts and system calls. + 8. Other capabilities. ====================== diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING From 1b5f517cca36292076d9e38fa6e33a257703e62e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 21 Feb 2022 08:32:14 -0800 Subject: [PATCH 327/773] hwmon: Handle failure to register sensor with thermal zone correctly If an attempt is made to a sensor with a thermal zone and it fails, the call to devm_thermal_zone_of_sensor_register() may return -ENODEV. This may result in crashes similar to the following. Unable to handle kernel NULL pointer dereference at virtual address 00000000000003cd ... Internal error: Oops: 96000021 [#1] PREEMPT SMP ... pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : mutex_lock+0x18/0x60 lr : thermal_zone_device_update+0x40/0x2e0 sp : ffff800014c4fc60 x29: ffff800014c4fc60 x28: ffff365ee3f6e000 x27: ffffdde218426790 x26: ffff365ee3f6e000 x25: 0000000000000000 x24: ffff365ee3f6e000 x23: ffffdde218426870 x22: ffff365ee3f6e000 x21: 00000000000003cd x20: ffff365ee8bf3308 x19: ffffffffffffffed x18: 0000000000000000 x17: ffffdde21842689c x16: ffffdde1cb7a0b7c x15: 0000000000000040 x14: ffffdde21a4889a0 x13: 0000000000000228 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000001120000 x7 : 0000000000000001 x6 : 0000000000000000 x5 : 0068000878e20f07 x4 : 0000000000000000 x3 : 00000000000003cd x2 : ffff365ee3f6e000 x1 : 0000000000000000 x0 : 00000000000003cd Call trace: mutex_lock+0x18/0x60 hwmon_notify_event+0xfc/0x110 0xffffdde1cb7a0a90 0xffffdde1cb7a0b7c irq_thread_fn+0x2c/0xa0 irq_thread+0x134/0x240 kthread+0x178/0x190 ret_from_fork+0x10/0x20 Code: d503201f d503201f d2800001 aa0103e4 (c8e47c02) Jon Hunter reports that the exact call sequence is: hwmon_notify_event() --> hwmon_thermal_notify() --> thermal_zone_device_update() --> update_temperature() --> mutex_lock() The hwmon core needs to handle all errors returned from calls to devm_thermal_zone_of_sensor_register(). If the call fails with -ENODEV, report that the sensor was not attached to a thermal zone but continue to register the hwmon device. Reported-by: Jon Hunter Cc: Dmitry Osipenko Fixes: 1597b374af222 ("hwmon: Add notification support") Reviewed-by: Dmitry Osipenko Tested-by: Jon Hunter Signed-off-by: Guenter Roeck --- drivers/hwmon/hwmon.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 3501a3ead4ba..3ae961986fc3 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -214,12 +214,14 @@ static int hwmon_thermal_add_sensor(struct device *dev, int index) tzd = devm_thermal_zone_of_sensor_register(dev, index, tdata, &hwmon_thermal_ops); - /* - * If CONFIG_THERMAL_OF is disabled, this returns -ENODEV, - * so ignore that error but forward any other error. - */ - if (IS_ERR(tzd) && (PTR_ERR(tzd) != -ENODEV)) - return PTR_ERR(tzd); + if (IS_ERR(tzd)) { + if (PTR_ERR(tzd) != -ENODEV) + return PTR_ERR(tzd); + dev_info(dev, "temp%d_input not attached to any thermal zone\n", + index + 1); + devm_kfree(dev, tdata); + return 0; + } err = devm_add_action(dev, hwmon_thermal_remove_sensor, &tdata->node); if (err) From 35f165f08950a876f1b95a61d79c93678fba2fd6 Mon Sep 17 00:00:00 2001 From: Vikash Chandola Date: Tue, 22 Feb 2022 13:12:53 +0000 Subject: [PATCH 328/773] hwmon: (pmbus) Clear pmbus fault/warning bits after read Almost all fault/warning bits in pmbus status registers remain set even after fault/warning condition are removed. As per pmbus specification these faults must be cleared by user. Modify hwmon behavior to clear fault/warning bit after fetching data if fault/warning bit was set. This allows to get fresh data in next read. Signed-off-by: Vikash Chandola Link: https://lore.kernel.org/r/20220222131253.2426834-1-vikash.chandola@linux.intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/pmbus_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index 776ee2237be2..ac2fbee1ba9c 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -911,6 +911,11 @@ static int pmbus_get_boolean(struct i2c_client *client, struct pmbus_boolean *b, pmbus_update_sensor_data(client, s2); regval = status & mask; + if (regval) { + ret = pmbus_write_byte_data(client, page, reg, regval); + if (ret) + goto unlock; + } if (s1 && s2) { s64 v1, v2; From 80912cef18f16f8fe59d1fb9548d4364342be360 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 22 Feb 2022 08:17:51 -0800 Subject: [PATCH 329/773] io_uring: disallow modification of rsrc_data during quiesce io_rsrc_ref_quiesce will unlock the uring while it waits for references to the io_rsrc_data to be killed. There are other places to the data that might add references to data via calls to io_rsrc_node_switch. There is a race condition where this reference can be added after the completion has been signalled. At this point the io_rsrc_ref_quiesce call will wake up and relock the uring, assuming the data is unused and can be freed - although it is actually being used. To fix this check in io_rsrc_ref_quiesce if a resource has been revived. Reported-by: syzbot+ca8bf833622a1662745b@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220222161751.995746-1-dylany@fb.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 928446fe1319..4715980e9015 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7926,7 +7926,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - break; + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } } atomic_inc(&data->refs); From 84ec758fb2daa236026506868c8796b0500c047d Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 15 Feb 2022 15:10:30 +0800 Subject: [PATCH 330/773] configfs: fix a race in configfs_{,un}register_subsystem() When configfs_register_subsystem() or configfs_unregister_subsystem() is executing link_group() or unlink_group(), it is possible that two processes add or delete list concurrently. Some unfortunate interleavings of them can cause kernel panic. One of cases is: A --> B --> C --> D A <-- B <-- C <-- D delete list_head *B | delete list_head *C --------------------------------|----------------------------------- configfs_unregister_subsystem | configfs_unregister_subsystem unlink_group | unlink_group unlink_obj | unlink_obj list_del_init | list_del_init __list_del_entry | __list_del_entry __list_del | __list_del // next == C | next->prev = prev | | next->prev = prev prev->next = next | | // prev == B | prev->next = next Fix this by adding mutex when calling link_group() or unlink_group(), but parent configfs_subsystem is NULL when config_item is root. So I create a mutex configfs_subsystem_mutex. Fixes: 7063fbf22611 ("[PATCH] configfs: User-driven configuration filesystem") Signed-off-by: ChenXiaoSong Signed-off-by: Laibin Qiu Signed-off-by: Christoph Hellwig --- fs/configfs/dir.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d3cd2a94d1e8..d1f9d2632202 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -34,6 +34,14 @@ */ DEFINE_SPINLOCK(configfs_dirent_lock); +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_mutex is held. + * But parent configfs_subsystem is NULL when config_item is root. + * Use this mutex when config_item is root. + */ +static DEFINE_MUTEX(configfs_subsystem_mutex); + static void configfs_d_iput(struct dentry * dentry, struct inode * inode) { @@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) group->cg_item.ci_name = group->cg_item.ci_namebuf; sd = root->d_fsdata; + mutex_lock(&configfs_subsystem_mutex); link_group(to_config_group(sd->s_element), group); + mutex_unlock(&configfs_subsystem_mutex); inode_lock_nested(d_inode(root), I_MUTEX_PARENT); @@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) inode_unlock(d_inode(root)); if (err) { + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } put_fragment(frag); @@ -1931,7 +1943,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) dput(dentry); + mutex_lock(&configfs_subsystem_mutex); unlink_group(group); + mutex_unlock(&configfs_subsystem_mutex); configfs_release_fs(); } From 6d3971dab239e7db1691690a02ce6becf30689cb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 21 Feb 2022 16:16:39 +0100 Subject: [PATCH 331/773] cgroup: clarify cgroup_css_set_fork() With recent fixes for the permission checking when moving a task into a cgroup using a file descriptor to a cgroup's cgroup.procs file and calling write() it seems a good idea to clarify CLONE_INTO_CGROUP permission checking with a comment. Cc: Tejun Heo Cc: Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..77702e089d6a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6161,6 +6161,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. + * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, , ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. + */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); From 467a726b754f474936980da793b4ff2ec3e382a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Thu, 17 Feb 2022 17:11:28 +0100 Subject: [PATCH 332/773] cgroup-v1: Correct privileges check in release_agent writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea is to check: a) the owning user_ns of cgroup_ns, b) capabilities in init_user_ns. The commit 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") got this wrong in the write handler of release_agent since it checked user_ns of the opener (may be different from the owning user_ns of cgroup_ns). Secondly, to avoid possibly confused deputy, the capability of the opener must be checked. Fixes: 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/stable/20220216121142.GB30035@blackbody.suse.cz/ Signed-off-by: Michal Koutný Reviewed-by: Masami Ichikawa(CIP) Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0e877dbcfeea..afc6c0e9c966 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -546,6 +546,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; + struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); @@ -553,8 +554,9 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, * Release agent gets called with all capabilities, * require capabilities to set release agent. */ - if ((of->file->f_cred->user_ns != &init_user_ns) || - !capable(CAP_SYS_ADMIN)) + ctx = of->priv; + if ((ctx->ns->user_ns != &init_user_ns) || + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); From c70cd039f1d779126347a896a58876782dcc5284 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 16 Feb 2022 11:17:53 +0800 Subject: [PATCH 333/773] cpuset: Fix kernel-doc Fix the following W=1 kernel warnings: kernel/cgroup/cpuset.c:3718: warning: expecting prototype for cpuset_memory_pressure_bump(). Prototype was for __cpuset_memory_pressure_bump() instead. kernel/cgroup/cpuset.c:3568: warning: expecting prototype for cpuset_node_allowed(). Prototype was for __cpuset_node_allowed() instead. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 97c53f3cc917..5de18448016c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3524,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) return cs; } -/** - * cpuset_node_allowed - Can we allocate on a memory node? +/* + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3696,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void) int cpuset_memory_pressure_enabled __read_mostly; -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. * * Keep a running average of the rate of synchronous (direct) * page reclaim efforts initiated by tasks in each cpuset. @@ -3712,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly; * "memory_pressure". Value displayed is an integer * representing the recent rate of entry into the synchronous * (direct) page reclaim by any task attached to the cpuset. - **/ + */ void __cpuset_memory_pressure_bump(void) { From 69560e366fc4d5fca7bebb0e44edbfafc8bcaf05 Mon Sep 17 00:00:00 2001 From: Alexey Bayduraev Date: Fri, 18 Feb 2022 18:23:41 +0300 Subject: [PATCH 334/773] perf data: Fix double free in perf_session__delete() When perf_data__create_dir() fails, it calls close_dir(), but perf_session__delete() also calls close_dir() and since dir.version and dir.nr were initialized by perf_data__create_dir(), a double free occurs. This patch moves the initialization of dir.version and dir.nr after successful initialization of dir.files, that prevents double freeing. This behavior is already implemented in perf_data__open_dir(). Fixes: 145520631130bd64 ("perf data: Add perf_data__(create_dir|close_dir) functions") Signed-off-by: Alexey Bayduraev Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Antonov Cc: Alexander Shishkin Cc: Alexei Budankov Cc: Andi Kleen Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220218152341.5197-2-alexey.v.bayduraev@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index f5d260b1df4d..15a4547d608e 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -44,10 +44,6 @@ int perf_data__create_dir(struct perf_data *data, int nr) if (!files) return -ENOMEM; - data->dir.version = PERF_DIR_VERSION; - data->dir.files = files; - data->dir.nr = nr; - for (i = 0; i < nr; i++) { struct perf_data_file *file = &files[i]; @@ -62,6 +58,9 @@ int perf_data__create_dir(struct perf_data *data, int nr) file->fd = ret; } + data->dir.version = PERF_DIR_VERSION; + data->dir.files = files; + data->dir.nr = nr; return 0; out_err: From 5b061a322b05a5e023d9a0df1ae1f8bb562ed87b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 7 Aug 2020 08:45:47 -0300 Subject: [PATCH 335/773] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes in: 3915035282573c5e ("KVM: x86: SVM: move avic definitions from AMD's spec to svm.h") Addressing these tools/perf build warnings: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' That makes the beautification scripts to pick some new entries: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after --- before 2022-02-22 17:35:36.996271430 -0300 +++ after 2022-02-22 17:35:46.258503347 -0300 @@ -287,6 +287,7 @@ [0xc0010114 - x86_AMD_V_KVM_MSRs_offset] = "VM_CR", [0xc0010115 - x86_AMD_V_KVM_MSRs_offset] = "VM_IGNNE", [0xc0010117 - x86_AMD_V_KVM_MSRs_offset] = "VM_HSAVE_PA", + [0xc001011b - x86_AMD_V_KVM_MSRs_offset] = "AMD64_SVM_AVIC_DOORBELL", [0xc001011e - x86_AMD_V_KVM_MSRs_offset] = "AMD64_VM_PAGE_FLUSH", [0xc001011f - x86_AMD_V_KVM_MSRs_offset] = "AMD64_VIRT_SPEC_CTRL", [0xc0010130 - x86_AMD_V_KVM_MSRs_offset] = "AMD64_SEV_ES_GHCB", $ And this gets rebuilt: CC /tmp/build/perf/trace/beauty/tracepoints/x86_msr.o LD /tmp/build/perf/trace/beauty/tracepoints/perf-in.o LD /tmp/build/perf/trace/beauty/perf-in.o CC /tmp/build/perf/util/amd-sample-raw.o LD /tmp/build/perf/util/perf-in.o LD /tmp/build/perf/perf-in.o LINK /tmp/build/perf/perf Now one can trace systemwide asking to see backtraces to where those MSRs are being read/written with: # perf trace -e msr:*_msr/max-stack=32/ --filter="msr>=AMD64_SVM_AVIC_DOORBELL && msr<=AMD64_SEV_ES_GHCB" ^C# If we use -v (verbose mode) we can see what it does behind the scenes: # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr>=AMD64_SVM_AVIC_DOORBELL && msr<=AMD64_SEV_ES_GHCB" Using CPUID AuthenticAMD-25-21-0 0xc001011b 0xc0010130 New filter for msr:read_msr: (msr>=0xc001011b && msr<=0xc0010130) && (common_pid != 1019953 && common_pid != 3629) 0xc001011b 0xc0010130 New filter for msr:write_msr: (msr>=0xc001011b && msr<=0xc0010130) && (common_pid != 1019953 && common_pid != 3629) mmap size 528384B ^C# Example with a frequent msr: # perf trace -v -e msr:*_msr/max-stack=32/ --filter="msr==IA32_SPEC_CTRL" --max-events 2 Using CPUID AuthenticAMD-25-21-0 0x48 New filter for msr:read_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841) 0x48 New filter for msr:write_msr: (msr==0x48) && (common_pid != 2612129 && common_pid != 3841) mmap size 528384B Looking at the vmlinux_path (8 entries long) symsrc__init: build id mismatch for vmlinux. Using /proc/kcore for kernel data Using /proc/kallsyms for symbols 0.000 Timer/2525383 msr:write_msr(msr: IA32_SPEC_CTRL, val: 6) do_trace_write_msr ([kernel.kallsyms]) do_trace_write_msr ([kernel.kallsyms]) __switch_to_xtra ([kernel.kallsyms]) __switch_to ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule ([kernel.kallsyms]) futex_wait_queue_me ([kernel.kallsyms]) futex_wait ([kernel.kallsyms]) do_futex ([kernel.kallsyms]) __x64_sys_futex ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) entry_SYSCALL_64_after_hwframe ([kernel.kallsyms]) __futex_abstimed_wait_common64 (/usr/lib64/libpthread-2.33.so) 0.030 :0/0 msr:write_msr(msr: IA32_SPEC_CTRL, val: 2) do_trace_write_msr ([kernel.kallsyms]) do_trace_write_msr ([kernel.kallsyms]) __switch_to_xtra ([kernel.kallsyms]) __switch_to ([kernel.kallsyms]) __schedule ([kernel.kallsyms]) schedule_idle ([kernel.kallsyms]) do_idle ([kernel.kallsyms]) cpu_startup_entry ([kernel.kallsyms]) secondary_startup_64_no_verify ([kernel.kallsyms]) # Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Maxim Levitsky Cc: Namhyung Kim Cc: Paolo Bonzini Link: http://lore.kernel.org/lkml/YhVKxaft+z8rpOfy@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 3faf0f97edb1..a4a39c3e0f19 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -476,6 +476,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b #define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 From 34f3eda8c8ffd4d0b2145ac11c91cc365cd1ada3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 14 Feb 2022 09:23:49 +0100 Subject: [PATCH 336/773] MAINTAINERS: sifive: drop Yash Shah Emails to Yash Shah bounce with "The email account that you tried to reach does not exist.", so drop him from all maintainer entries. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220214082349.162973-1-krzysztof.kozlowski@canonical.com --- Documentation/devicetree/bindings/gpio/sifive,gpio.yaml | 1 - Documentation/devicetree/bindings/pwm/pwm-sifive.yaml | 1 - .../devicetree/bindings/riscv/sifive-l2-cache.yaml | 1 - MAINTAINERS | 6 ------ 4 files changed, 9 deletions(-) diff --git a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml index e04349567eeb..427c5873f96a 100644 --- a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml @@ -7,7 +7,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: SiFive GPIO controller maintainers: - - Yash Shah - Paul Walmsley properties: diff --git a/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml b/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml index 84e66913d042..db41cd7bf150 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm-sifive.yaml @@ -8,7 +8,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: SiFive PWM controller maintainers: - - Yash Shah - Sagar Kadam - Paul Walmsley diff --git a/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml b/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml index 2b1f91603897..e2d330bd4608 100644 --- a/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml +++ b/Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml @@ -9,7 +9,6 @@ title: SiFive L2 Cache Controller maintainers: - Sagar Kadam - - Yash Shah - Paul Walmsley description: diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..9d89755f911e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7006,12 +7006,6 @@ L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/sb_edac.c -EDAC-SIFIVE -M: Yash Shah -L: linux-edac@vger.kernel.org -S: Supported -F: drivers/edac/sifive_edac.c - EDAC-SKYLAKE M: Tony Luck L: linux-edac@vger.kernel.org From 0c0822bcb73f154d96ee648644ec5a8628e3b864 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 21 Feb 2022 11:07:01 +0100 Subject: [PATCH 337/773] dt-bindings: update Roger Quadros email Emails to Roger Quadros TI account bounce with: 550 Invalid recipient (#5.1.1) Signed-off-by: Krzysztof Kozlowski Acked-by: Roger Quadros Acked-By: Vinod Koul Acked-by: Lee Jones Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220221100701.48593-1-krzysztof.kozlowski@canonical.com --- .../devicetree/bindings/mfd/ti,j721e-system-controller.yaml | 2 +- Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml | 2 +- Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml | 2 +- Documentation/devicetree/bindings/usb/ti,keystone-dwc3.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/ti,j721e-system-controller.yaml b/Documentation/devicetree/bindings/mfd/ti,j721e-system-controller.yaml index 272832e9f8f2..fa86691ebf16 100644 --- a/Documentation/devicetree/bindings/mfd/ti,j721e-system-controller.yaml +++ b/Documentation/devicetree/bindings/mfd/ti,j721e-system-controller.yaml @@ -20,7 +20,7 @@ description: | maintainers: - Kishon Vijay Abraham I - - Roger Quadros properties: compatible: diff --git a/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml b/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml index cbbf5e8b1197..f78d3246fbdc 100644 --- a/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml +++ b/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml @@ -8,7 +8,7 @@ title: OMAP USB2 PHY maintainers: - Kishon Vijay Abraham I - - Roger Quadros + - Roger Quadros properties: compatible: diff --git a/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml b/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml index a634774c537c..eedde385d299 100644 --- a/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml +++ b/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml @@ -7,7 +7,7 @@ $schema: "http://devicetree.org/meta-schemas/core.yaml#" title: Bindings for the TI wrapper module for the Cadence USBSS-DRD controller maintainers: - - Roger Quadros + - Roger Quadros properties: compatible: diff --git a/Documentation/devicetree/bindings/usb/ti,keystone-dwc3.yaml b/Documentation/devicetree/bindings/usb/ti,keystone-dwc3.yaml index f6e91a5fd8fe..4f7a212fddd3 100644 --- a/Documentation/devicetree/bindings/usb/ti,keystone-dwc3.yaml +++ b/Documentation/devicetree/bindings/usb/ti,keystone-dwc3.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: TI Keystone Soc USB Controller maintainers: - - Roger Quadros + - Roger Quadros properties: compatible: From ce2fc710c9d2b25afc710f49bb2065b4439a62bc Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 21 Feb 2022 15:06:49 +0100 Subject: [PATCH 338/773] selinux: fix misuse of mutex_is_locked() mutex_is_locked() tests whether the mutex is locked *by any task*, while here we want to test if it is held *by the current task*. To avoid false/missed WARNINGs, use lockdep_assert_is_held() and lockdep_assert_is_not_held() instead, which do the right thing (though they are a no-op if CONFIG_LOCKDEP=n). Cc: stable@vger.kernel.org Fixes: 2554a48f4437 ("selinux: measure state and policy capabilities") Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- security/selinux/ima.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/selinux/ima.c b/security/selinux/ima.c index 727c4e43219d..ff7aea6b3774 100644 --- a/security/selinux/ima.c +++ b/security/selinux/ima.c @@ -77,7 +77,7 @@ void selinux_ima_measure_state_locked(struct selinux_state *state) size_t policy_len; int rc = 0; - WARN_ON(!mutex_is_locked(&state->policy_mutex)); + lockdep_assert_held(&state->policy_mutex); state_str = selinux_ima_collect_state(state); if (!state_str) { @@ -117,7 +117,7 @@ void selinux_ima_measure_state_locked(struct selinux_state *state) */ void selinux_ima_measure_state(struct selinux_state *state) { - WARN_ON(mutex_is_locked(&state->policy_mutex)); + lockdep_assert_not_held(&state->policy_mutex); mutex_lock(&state->policy_mutex); selinux_ima_measure_state_locked(state); From 13e741b834538a225512912608f0182079fc64e2 Mon Sep 17 00:00:00 2001 From: German Gomez Date: Mon, 21 Feb 2022 17:17:06 +0000 Subject: [PATCH 339/773] perf script: Fix error when printing 'weight' field In SPE traces the 'weight' field can't be printed in 'perf script' because the 'dummy:u' event doesn't have the WEIGHT attribute set. Use evsel__do_check_stype(..) to check this field, as it's done with other fields such as "phys_addr". Before: $ perf record -e arm_spe_0// -- sleep 1 $ perf script -F event,ip,weight Samples for 'dummy:u' event do not have WEIGHT attribute set. Cannot print 'weight' field. After: $ perf script -F event,ip,weight l1d-access: 12 ffffaf629d4cb320 tlb-access: 12 ffffaf629d4cb320 memory: 12 ffffaf629d4cb320 Fixes: b0fde9c6e291e528 ("perf arm-spe: Add SPE total latency as PERF_SAMPLE_WEIGHT") Signed-off-by: German Gomez Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220221171707.62960-1-german.gomez@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index abae8184e171..fa478ddcd18a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -463,7 +463,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL; if (PRINT_FIELD(WEIGHT) && - evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT_TYPE, "WEIGHT", PERF_OUTPUT_WEIGHT)) + evsel__do_check_stype(evsel, PERF_SAMPLE_WEIGHT_TYPE, "WEIGHT", PERF_OUTPUT_WEIGHT, allow_user_set)) return -EINVAL; if (PRINT_FIELD(SYM) && From ef527f968ae05c6717c39f49c8709a7e2c19183a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 20 Feb 2022 07:40:52 -0800 Subject: [PATCH 340/773] net: __pskb_pull_tail() & pskb_carve_frag_list() drop_monitor friends Whenever one of these functions pull all data from an skb in a frag_list, use consume_skb() instead of kfree_skb() to avoid polluting drop monitoring. Fixes: 6fa01ccd8830 ("skbuff: Add pskb_extract() helper function") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20220220154052.1308469-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6a15ce3eb1d3..b8138c372535 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2276,7 +2276,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -6105,7 +6105,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb, /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { From 8d093e02e898b24c58788b0289e3202317a96d2a Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 19 Feb 2022 20:44:43 +0300 Subject: [PATCH 341/773] ata: pata_hpt37x: disable primary channel on HPT371 The HPT371 chip physically has only one channel, the secondary one, however the primary channel registers do exist! Thus we have to manually disable the non-existing channel if the BIOS hasn't done this already. Similarly to the pata_hpt3x2n driver, always disable the primary channel. Fixes: 669a5db411d8 ("[libata] Add a bunch of PATA drivers.") Cc: stable@vger.kernel.org Signed-off-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- drivers/ata/pata_hpt37x.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c index 1baaca7b72ed..6fa4a2faf49c 100644 --- a/drivers/ata/pata_hpt37x.c +++ b/drivers/ata/pata_hpt37x.c @@ -919,6 +919,20 @@ static int hpt37x_init_one(struct pci_dev *dev, const struct pci_device_id *id) irqmask &= ~0x10; pci_write_config_byte(dev, 0x5a, irqmask); + /* + * HPT371 chips physically have only one channel, the secondary one, + * but the primary channel registers do exist! Go figure... + * So, we manually disable the non-existing channel here + * (if the BIOS hasn't done this already). + */ + if (dev->device == PCI_DEVICE_ID_TTI_HPT371) { + u8 mcr1; + + pci_read_config_byte(dev, 0x50, &mcr1); + mcr1 &= ~0x04; + pci_write_config_byte(dev, 0x50, mcr1); + } + /* * default to pci clock. make sure MA15/16 are set to output * to prevent drives having problems with 40-pin cables. Needed From 342b6419193c6f697fd47d9c72fcff9cafc70687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= Date: Mon, 21 Feb 2022 21:35:38 +0100 Subject: [PATCH 342/773] net: dsa: fix panic when removing unoffloaded port from bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a bridged port is not offloaded to the hardware - either because the underlying driver does not implement the port_bridge_{join,leave} ops, or because the operation failed - then its dp->bridge pointer will be NULL when dsa_port_bridge_leave() is called. Avoid dereferncing NULL. This fixes the following splat when removing a port from a bridge: Unable to handle kernel access to user memory outside uaccess routines at virtual address 0000000000000000 Internal error: Oops: 96000004 [#1] PREEMPT_RT SMP CPU: 3 PID: 1119 Comm: brctl Tainted: G O 5.17.0-rc4-rt4 #1 Call trace: dsa_port_bridge_leave+0x8c/0x1e4 dsa_slave_changeupper+0x40/0x170 dsa_slave_netdevice_event+0x494/0x4d4 notifier_call_chain+0x80/0xe0 raw_notifier_call_chain+0x1c/0x24 call_netdevice_notifiers_info+0x5c/0xac __netdev_upper_dev_unlink+0xa4/0x200 netdev_upper_dev_unlink+0x38/0x60 del_nbp+0x1b0/0x300 br_del_if+0x38/0x114 add_del_if+0x60/0xa0 br_ioctl_stub+0x128/0x2dc br_ioctl_call+0x68/0xb0 dev_ifsioc+0x390/0x554 dev_ioctl+0x128/0x400 sock_do_ioctl+0xb4/0xf4 sock_ioctl+0x12c/0x4e0 __arm64_sys_ioctl+0xa8/0xf0 invoke_syscall+0x4c/0x110 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x28/0x84 el0_svc+0x1c/0x50 el0t_64_sync_handler+0xa8/0xb0 el0t_64_sync+0x17c/0x180 Code: f9402f00 f0002261 f9401302 913cc021 (a9401404) ---[ end trace 0000000000000000 ]--- Fixes: d3eed0e57d5d ("net: dsa: keep the bridge_dev and bridge_num as part of the same structure") Signed-off-by: Alvin Šipraga Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220221203539.310690-1-alvin@pqrs.dk Signed-off-by: Jakub Kicinski --- net/dsa/port.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/dsa/port.c b/net/dsa/port.c index eef4a98f2628..1a40c52f5a42 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -395,10 +395,17 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, - .bridge = *dp->bridge, }; int err; + /* If the port could not be offloaded to begin with, then + * there is nothing to do. + */ + if (!dp->bridge) + return; + + info.bridge = *dp->bridge; + /* Here the port is already unbridged. Reflect the current configuration * so that drivers can program their chips accordingly. */ From 277f2bb14361790a70e4b3c649e794b75a91a597 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 21 Feb 2022 15:05:45 -0600 Subject: [PATCH 343/773] ibmvnic: schedule failover only if vioctl fails If client is unable to initiate a failover reset via H_VIOCTL hcall, then it should schedule a failover reset as a last resort. Otherwise, there is no need to do a last resort. Fixes: 334c42414729 ("ibmvnic: improve failover sysfs entry") Reported-by: Cris Forno Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Dany Madden Link: https://lore.kernel.org/r/20220221210545.115283-1-drt@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 29617a86b299..dee05a353dbd 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5917,10 +5917,14 @@ static ssize_t failover_store(struct device *dev, struct device_attribute *attr, be64_to_cpu(session_token)); rc = plpar_hcall_norets(H_VIOCTL, adapter->vdev->unit_address, H_SESSION_ERR_DETECTED, session_token, 0, 0); - if (rc) + if (rc) { netdev_err(netdev, "H_VIOCTL initiated failover failed, rc %ld\n", rc); + goto last_resort; + } + + return count; last_resort: netdev_dbg(netdev, "Trying to send CRQ_CMD, the last resort\n"); From ae089831ff28a115908b8d796f667c2dadef1637 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Feb 2022 10:13:31 -0800 Subject: [PATCH 344/773] netfilter: nf_tables: prefer kfree_rcu(ptr, rcu) variant While kfree_rcu(ptr) _is_ supported, it has some limitations. Given that 99.99% of kfree_rcu() users [1] use the legacy two parameters variant, and @catchall objects do have an rcu head, simply use it. Choice of kfree_rcu(ptr) variant was probably not intentional. [1] including calls from net/netfilter/nf_tables_api.c Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Eric Dumazet Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9cd1d7a62804..c86748b3873b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4502,7 +4502,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); nft_set_elem_destroy(set, catchall->elem, true); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); } } @@ -5669,7 +5669,7 @@ static void nft_setelem_catchall_remove(const struct net *net, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem->priv) { list_del_rcu(&catchall->list); - kfree_rcu(catchall); + kfree_rcu(catchall, rcu); break; } } From f762ce78897d734a08f52e39a353359b7d417578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 21 Feb 2022 09:31:28 +0100 Subject: [PATCH 345/773] drm/radeon: fix variable type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we switch to dma_resv_wait_timeout() the returned type changes as well. Signed-off-by: Christian König Fixes: 89aae41d740f ("drm/radeon: use dma_resv_wait_timeout() instead of manually waiting") Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215600 Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20220221110503.2803-1-christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon_uvd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c index 377f9cdb5b53..84013faa4756 100644 --- a/drivers/gpu/drm/radeon/radeon_uvd.c +++ b/drivers/gpu/drm/radeon/radeon_uvd.c @@ -470,8 +470,8 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, struct radeon_bo *bo, int32_t *msg, msg_type, handle; unsigned img_size = 0; void *ptr; - - int i, r; + long r; + int i; if (offset & 0x3F) { DRM_ERROR("UVD messages must be 64 byte aligned!\n"); @@ -481,13 +481,13 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, struct radeon_bo *bo, r = dma_resv_wait_timeout(bo->tbo.base.resv, false, false, MAX_SCHEDULE_TIMEOUT); if (r <= 0) { - DRM_ERROR("Failed waiting for UVD message (%d)!\n", r); + DRM_ERROR("Failed waiting for UVD message (%ld)!\n", r); return r ? r : -ETIME; } r = radeon_bo_kmap(bo, &ptr); if (r) { - DRM_ERROR("Failed mapping the UVD message (%d)!\n", r); + DRM_ERROR("Failed mapping the UVD message (%ld)!\n", r); return r; } From 8913e1aea4b32a866343b14e565c62cec54f3f78 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 20 Dec 2021 11:28:22 +0100 Subject: [PATCH 346/773] drm/tegra: dpaux: Populate AUX bus The DPAUX hardware block exposes an DP AUX interface that provides access to an AUX bus and the devices on that bus. Use the DP AUX bus infrastructure that was recently introduced to probe devices on this bus from DT. Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/Kconfig | 1 + drivers/gpu/drm/tegra/dpaux.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/gpu/drm/tegra/Kconfig b/drivers/gpu/drm/tegra/Kconfig index 8cf5aeb9db6c..201f5175ecfe 100644 --- a/drivers/gpu/drm/tegra/Kconfig +++ b/drivers/gpu/drm/tegra/Kconfig @@ -5,6 +5,7 @@ config DRM_TEGRA depends on COMMON_CLK depends on DRM depends on OF + select DRM_DP_AUX_BUS select DRM_KMS_HELPER select DRM_MIPI_DSI select DRM_PANEL diff --git a/drivers/gpu/drm/tegra/dpaux.c b/drivers/gpu/drm/tegra/dpaux.c index 1f96e416fa08..d7a731d287d2 100644 --- a/drivers/gpu/drm/tegra/dpaux.c +++ b/drivers/gpu/drm/tegra/dpaux.c @@ -19,6 +19,7 @@ #include #include +#include #include #include "dp.h" @@ -570,6 +571,12 @@ static int tegra_dpaux_probe(struct platform_device *pdev) list_add_tail(&dpaux->list, &dpaux_list); mutex_unlock(&dpaux_lock); + err = devm_of_dp_aux_populate_ep_devices(&dpaux->aux); + if (err < 0) { + dev_err(dpaux->dev, "failed to populate AUX bus: %d\n", err); + return err; + } + return 0; } From 8d3b01e0d4bb54368d73d0984466d72c2eeeac74 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 20 Dec 2021 11:32:39 +0100 Subject: [PATCH 347/773] ARM: tegra: Move panels to AUX bus Move the eDP panel on Venice 2 and Nyan boards into the corresponding AUX bus device tree node. This allows us to avoid a nasty circular dependency that would otherwise be created between the DPAUX and panel nodes via the DDC/I2C phandle. Fixes: eb481f9ac95c ("ARM: tegra: add Acer Chromebook 13 device tree") Fixes: 59fe02cb079f ("ARM: tegra: Add DTS for the nyan-blaze board") Fixes: 40e231c770a4 ("ARM: tegra: Enable eDP for Venice2") Signed-off-by: Thierry Reding --- arch/arm/boot/dts/tegra124-nyan-big.dts | 15 +++++++++------ arch/arm/boot/dts/tegra124-nyan-blaze.dts | 15 +++++++++------ arch/arm/boot/dts/tegra124-venice2.dts | 14 +++++++------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/arm/boot/dts/tegra124-nyan-big.dts b/arch/arm/boot/dts/tegra124-nyan-big.dts index 1d2aac2cb6d0..fdc1d64dfff9 100644 --- a/arch/arm/boot/dts/tegra124-nyan-big.dts +++ b/arch/arm/boot/dts/tegra124-nyan-big.dts @@ -13,12 +13,15 @@ "google,nyan-big-rev1", "google,nyan-big-rev0", "google,nyan-big", "google,nyan", "nvidia,tegra124"; - panel: panel { - compatible = "auo,b133xtn01"; - - power-supply = <&vdd_3v3_panel>; - backlight = <&backlight>; - ddc-i2c-bus = <&dpaux>; + host1x@50000000 { + dpaux@545c0000 { + aux-bus { + panel: panel { + compatible = "auo,b133xtn01"; + backlight = <&backlight>; + }; + }; + }; }; mmc@700b0400 { /* SD Card on this bus */ diff --git a/arch/arm/boot/dts/tegra124-nyan-blaze.dts b/arch/arm/boot/dts/tegra124-nyan-blaze.dts index 677babde6460..abdf4456826f 100644 --- a/arch/arm/boot/dts/tegra124-nyan-blaze.dts +++ b/arch/arm/boot/dts/tegra124-nyan-blaze.dts @@ -15,12 +15,15 @@ "google,nyan-blaze-rev0", "google,nyan-blaze", "google,nyan", "nvidia,tegra124"; - panel: panel { - compatible = "samsung,ltn140at29-301"; - - power-supply = <&vdd_3v3_panel>; - backlight = <&backlight>; - ddc-i2c-bus = <&dpaux>; + host1x@50000000 { + dpaux@545c0000 { + aux-bus { + panel: panel { + compatible = "samsung,ltn140at29-301"; + backlight = <&backlight>; + }; + }; + }; }; sound { diff --git a/arch/arm/boot/dts/tegra124-venice2.dts b/arch/arm/boot/dts/tegra124-venice2.dts index 232c90604df9..6a9592ceb5f2 100644 --- a/arch/arm/boot/dts/tegra124-venice2.dts +++ b/arch/arm/boot/dts/tegra124-venice2.dts @@ -48,6 +48,13 @@ dpaux@545c0000 { vdd-supply = <&vdd_3v3_panel>; status = "okay"; + + aux-bus { + panel: panel { + compatible = "lg,lp129qe"; + backlight = <&backlight>; + }; + }; }; }; @@ -1080,13 +1087,6 @@ }; }; - panel: panel { - compatible = "lg,lp129qe"; - power-supply = <&vdd_3v3_panel>; - backlight = <&backlight>; - ddc-i2c-bus = <&dpaux>; - }; - vdd_mux: regulator-mux { compatible = "regulator-fixed"; regulator-name = "+VDD_MUX"; From a58da53ffd70294ebea8ecd0eb45fd0d74add9f9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 22 Feb 2022 10:47:42 +0100 Subject: [PATCH 348/773] vhost/vsock: don't check owner in vhost_vsock_stop() while releasing vhost_vsock_stop() calls vhost_dev_check_owner() to check the device ownership. It expects current->mm to be valid. vhost_vsock_stop() is also called by vhost_vsock_dev_release() when the user has not done close(), so when we are in do_exit(). In this case current->mm is invalid and we're releasing the device, so we should clean it anyway. Let's check the owner only when vhost_vsock_stop() is called by an ioctl. When invoked from release we can not fail so we don't check return code of vhost_vsock_stop(). We need to stop vsock even if it's not the owner. Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") Cc: stable@vger.kernel.org Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+3140b17cb44a7b174008@syzkaller.appspotmail.com Signed-off-by: Stefano Garzarella Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index d6ca1c7ad513..37f0b4274113 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -629,16 +629,18 @@ err: return ret; } -static int vhost_vsock_stop(struct vhost_vsock *vsock) +static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner) { size_t i; - int ret; + int ret = 0; mutex_lock(&vsock->dev.mutex); - ret = vhost_dev_check_owner(&vsock->dev); - if (ret) - goto err; + if (check_owner) { + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + } for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { struct vhost_virtqueue *vq = &vsock->vqs[i]; @@ -753,7 +755,12 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file) * inefficient. Room for improvement here. */ vsock_for_each_connected_socket(vhost_vsock_reset_orphans); - vhost_vsock_stop(vsock); + /* Don't check the owner, because we are in the release path, so we + * need to stop the vsock device in any case. + * vhost_vsock_stop() can not fail in this case, so we don't need to + * check the return code. + */ + vhost_vsock_stop(vsock, false); vhost_vsock_flush(vsock); vhost_dev_stop(&vsock->dev); @@ -868,7 +875,7 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, if (start) return vhost_vsock_start(vsock); else - return vhost_vsock_stop(vsock); + return vhost_vsock_stop(vsock, true); case VHOST_GET_FEATURES: features = VHOST_VSOCK_FEATURES; if (copy_to_user(argp, &features, sizeof(features))) From de7b2efacf4e83954aed3f029d347dfc0b7a4f49 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 22 Feb 2022 16:42:51 +0300 Subject: [PATCH 349/773] udp_tunnel: Fix end of loop test in udp_tunnel_nic_unregister() This test is checking if we exited the list via break or not. However if it did not exit via a break then "node" does not point to a valid udp_tunnel_nic_shared_node struct. It will work because of the way the structs are laid out it's the equivalent of "if (info->shared->udp_tunnel_nic_info != dev)" which will always be true, but it's not the right way to test. Fixes: 74cc6d182d03 ("udp_tunnel: add the ability to share port tables") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ipv4/udp_tunnel_nic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index b91003538d87..bc3a043a5d5c 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -846,7 +846,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; - if (node->dev != dev) + if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); From a1f8fec4dac8bc7b172b2bdbd881e015261a6322 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 22 Feb 2022 16:43:12 +0300 Subject: [PATCH 350/773] tipc: Fix end of loop tests for list_for_each_entry() These tests are supposed to check if the loop exited via a break or not. However the tests are wrong because if we did not exit via a break then "p" is not a valid pointer. In that case, it's the equivalent of "if (*(u32 *)sr == *last_key) {". That's going to work most of the time, but there is a potential for those to be equal. Fixes: 1593123a6a49 ("tipc: add name table dump to new netlink api") Fixes: 1a1a143daf84 ("tipc: add publication dump to new netlink api") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/name_table.c | 2 +- net/tipc/socket.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 01396dd1c899..1d8ba233d047 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -967,7 +967,7 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; - if (p->key != *last_key) + if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3e63c83e641c..7545321c3440 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3749,7 +3749,7 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, if (p->key == *last_publ) break; } - if (p->key != *last_publ) { + if (list_entry_is_head(p, &tsk->publications, binding_sock)) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback From 404ba13a6588d72b3fb9e5c17b73e4725f18c047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= Date: Tue, 22 Feb 2022 17:14:08 +0100 Subject: [PATCH 351/773] MAINTAINERS: add myself as co-maintainer for Realtek DSA switch drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding myself (Alvin Šipraga) as another maintainer for the Realtek DSA switch drivers. I intend to help Linus out with reviewing and testing changes to these drivers, particularly the rtl8365mb driver which I authored and have hardware access to. Cc: Linus Walleij Signed-off-by: Alvin Šipraga Reviewed-by: Linus Walleij Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b57077b935ba..0171f3e949fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16370,6 +16370,7 @@ F: drivers/watchdog/realtek_otto_wdt.c REALTEK RTL83xx SMI DSA ROUTER CHIPS M: Linus Walleij +M: Alvin Šipraga S: Maintained F: Documentation/devicetree/bindings/net/dsa/realtek-smi.txt F: drivers/net/dsa/realtek-smi* From ecf4a24cf97838fb0b78d4ede0f91d80b058289c Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Wed, 23 Feb 2022 10:34:19 +0800 Subject: [PATCH 352/773] net: sched: avoid newline at end of message in NL_SET_ERR_MSG_MOD Fix following coccicheck warning: ./net/sched/act_api.c:277:7-49: WARNING avoid newline at end of message in NL_SET_ERR_MSG_MOD Signed-off-by: Wan Jiabing Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2811348f3acc..ca03e7284254 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -274,7 +274,7 @@ static int tcf_action_offload_add_ex(struct tc_action *action, err = tc_setup_action(&fl_action->action, actions); if (err) { NL_SET_ERR_MSG_MOD(extack, - "Failed to setup tc actions for offload\n"); + "Failed to setup tc actions for offload"); goto fl_err; } From 4f1e72850d452e5c3302faa82a01f179ff5f9482 Mon Sep 17 00:00:00 2001 From: Heyi Guo Date: Wed, 23 Feb 2022 11:14:34 +0800 Subject: [PATCH 353/773] drivers/net/ftgmac100: refactor ftgmac100_reset_task to enable direct function call This is to prepare for ftgmac100_adjust_link() to call reset function directly, instead of task schedule. Signed-off-by: Heyi Guo Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 691605c15265..1f3eb4431475 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1410,10 +1410,8 @@ static int ftgmac100_init_all(struct ftgmac100 *priv, bool ignore_alloc_err) return err; } -static void ftgmac100_reset_task(struct work_struct *work) +static void ftgmac100_reset(struct ftgmac100 *priv) { - struct ftgmac100 *priv = container_of(work, struct ftgmac100, - reset_task); struct net_device *netdev = priv->netdev; int err; @@ -1459,6 +1457,14 @@ static void ftgmac100_reset_task(struct work_struct *work) rtnl_unlock(); } +static void ftgmac100_reset_task(struct work_struct *work) +{ + struct ftgmac100 *priv = container_of(work, struct ftgmac100, + reset_task); + + ftgmac100_reset(priv); +} + static int ftgmac100_open(struct net_device *netdev) { struct ftgmac100 *priv = netdev_priv(netdev); From 3c773dba8182cdfea7b32caafe9290240ab8de5f Mon Sep 17 00:00:00 2001 From: Heyi Guo Date: Wed, 23 Feb 2022 11:14:35 +0800 Subject: [PATCH 354/773] drivers/net/ftgmac100: adjust code place for function call dependency This is to prepare for ftgmac100_adjust_link() to call ftgmac100_reset() directly. Only code places are changed. Signed-off-by: Heyi Guo Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 222 +++++++++++------------ 1 file changed, 111 insertions(+), 111 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 1f3eb4431475..c1deb6e5d26c 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -989,117 +989,6 @@ static int ftgmac100_alloc_rx_buffers(struct ftgmac100 *priv) return 0; } -static void ftgmac100_adjust_link(struct net_device *netdev) -{ - struct ftgmac100 *priv = netdev_priv(netdev); - struct phy_device *phydev = netdev->phydev; - bool tx_pause, rx_pause; - int new_speed; - - /* We store "no link" as speed 0 */ - if (!phydev->link) - new_speed = 0; - else - new_speed = phydev->speed; - - /* Grab pause settings from PHY if configured to do so */ - if (priv->aneg_pause) { - rx_pause = tx_pause = phydev->pause; - if (phydev->asym_pause) - tx_pause = !rx_pause; - } else { - rx_pause = priv->rx_pause; - tx_pause = priv->tx_pause; - } - - /* Link hasn't changed, do nothing */ - if (phydev->speed == priv->cur_speed && - phydev->duplex == priv->cur_duplex && - rx_pause == priv->rx_pause && - tx_pause == priv->tx_pause) - return; - - /* Print status if we have a link or we had one and just lost it, - * don't print otherwise. - */ - if (new_speed || priv->cur_speed) - phy_print_status(phydev); - - priv->cur_speed = new_speed; - priv->cur_duplex = phydev->duplex; - priv->rx_pause = rx_pause; - priv->tx_pause = tx_pause; - - /* Link is down, do nothing else */ - if (!new_speed) - return; - - /* Disable all interrupts */ - iowrite32(0, priv->base + FTGMAC100_OFFSET_IER); - - /* Reset the adapter asynchronously */ - schedule_work(&priv->reset_task); -} - -static int ftgmac100_mii_probe(struct net_device *netdev) -{ - struct ftgmac100 *priv = netdev_priv(netdev); - struct platform_device *pdev = to_platform_device(priv->dev); - struct device_node *np = pdev->dev.of_node; - struct phy_device *phydev; - phy_interface_t phy_intf; - int err; - - /* Default to RGMII. It's a gigabit part after all */ - err = of_get_phy_mode(np, &phy_intf); - if (err) - phy_intf = PHY_INTERFACE_MODE_RGMII; - - /* Aspeed only supports these. I don't know about other IP - * block vendors so I'm going to just let them through for - * now. Note that this is only a warning if for some obscure - * reason the DT really means to lie about it or it's a newer - * part we don't know about. - * - * On the Aspeed SoC there are additionally straps and SCU - * control bits that could tell us what the interface is - * (or allow us to configure it while the IP block is held - * in reset). For now I chose to keep this driver away from - * those SoC specific bits and assume the device-tree is - * right and the SCU has been configured properly by pinmux - * or the firmware. - */ - if (priv->is_aspeed && !(phy_interface_mode_is_rgmii(phy_intf))) { - netdev_warn(netdev, - "Unsupported PHY mode %s !\n", - phy_modes(phy_intf)); - } - - phydev = phy_find_first(priv->mii_bus); - if (!phydev) { - netdev_info(netdev, "%s: no PHY found\n", netdev->name); - return -ENODEV; - } - - phydev = phy_connect(netdev, phydev_name(phydev), - &ftgmac100_adjust_link, phy_intf); - - if (IS_ERR(phydev)) { - netdev_err(netdev, "%s: Could not attach to PHY\n", netdev->name); - return PTR_ERR(phydev); - } - - /* Indicate that we support PAUSE frames (see comment in - * Documentation/networking/phy.rst) - */ - phy_support_asym_pause(phydev); - - /* Display what we found */ - phy_attached_info(phydev); - - return 0; -} - static int ftgmac100_mdiobus_read(struct mii_bus *bus, int phy_addr, int regnum) { struct net_device *netdev = bus->priv; @@ -1465,6 +1354,117 @@ static void ftgmac100_reset_task(struct work_struct *work) ftgmac100_reset(priv); } +static void ftgmac100_adjust_link(struct net_device *netdev) +{ + struct ftgmac100 *priv = netdev_priv(netdev); + struct phy_device *phydev = netdev->phydev; + bool tx_pause, rx_pause; + int new_speed; + + /* We store "no link" as speed 0 */ + if (!phydev->link) + new_speed = 0; + else + new_speed = phydev->speed; + + /* Grab pause settings from PHY if configured to do so */ + if (priv->aneg_pause) { + rx_pause = tx_pause = phydev->pause; + if (phydev->asym_pause) + tx_pause = !rx_pause; + } else { + rx_pause = priv->rx_pause; + tx_pause = priv->tx_pause; + } + + /* Link hasn't changed, do nothing */ + if (phydev->speed == priv->cur_speed && + phydev->duplex == priv->cur_duplex && + rx_pause == priv->rx_pause && + tx_pause == priv->tx_pause) + return; + + /* Print status if we have a link or we had one and just lost it, + * don't print otherwise. + */ + if (new_speed || priv->cur_speed) + phy_print_status(phydev); + + priv->cur_speed = new_speed; + priv->cur_duplex = phydev->duplex; + priv->rx_pause = rx_pause; + priv->tx_pause = tx_pause; + + /* Link is down, do nothing else */ + if (!new_speed) + return; + + /* Disable all interrupts */ + iowrite32(0, priv->base + FTGMAC100_OFFSET_IER); + + /* Reset the adapter asynchronously */ + schedule_work(&priv->reset_task); +} + +static int ftgmac100_mii_probe(struct net_device *netdev) +{ + struct ftgmac100 *priv = netdev_priv(netdev); + struct platform_device *pdev = to_platform_device(priv->dev); + struct device_node *np = pdev->dev.of_node; + struct phy_device *phydev; + phy_interface_t phy_intf; + int err; + + /* Default to RGMII. It's a gigabit part after all */ + err = of_get_phy_mode(np, &phy_intf); + if (err) + phy_intf = PHY_INTERFACE_MODE_RGMII; + + /* Aspeed only supports these. I don't know about other IP + * block vendors so I'm going to just let them through for + * now. Note that this is only a warning if for some obscure + * reason the DT really means to lie about it or it's a newer + * part we don't know about. + * + * On the Aspeed SoC there are additionally straps and SCU + * control bits that could tell us what the interface is + * (or allow us to configure it while the IP block is held + * in reset). For now I chose to keep this driver away from + * those SoC specific bits and assume the device-tree is + * right and the SCU has been configured properly by pinmux + * or the firmware. + */ + if (priv->is_aspeed && !(phy_interface_mode_is_rgmii(phy_intf))) { + netdev_warn(netdev, + "Unsupported PHY mode %s !\n", + phy_modes(phy_intf)); + } + + phydev = phy_find_first(priv->mii_bus); + if (!phydev) { + netdev_info(netdev, "%s: no PHY found\n", netdev->name); + return -ENODEV; + } + + phydev = phy_connect(netdev, phydev_name(phydev), + &ftgmac100_adjust_link, phy_intf); + + if (IS_ERR(phydev)) { + netdev_err(netdev, "%s: Could not attach to PHY\n", netdev->name); + return PTR_ERR(phydev); + } + + /* Indicate that we support PAUSE frames (see comment in + * Documentation/networking/phy.rst) + */ + phy_support_asym_pause(phydev); + + /* Display what we found */ + phy_attached_info(phydev); + + return 0; +} + static int ftgmac100_open(struct net_device *netdev) { struct ftgmac100 *priv = netdev_priv(netdev); From 1baf2e50e48f10f0ea07d53e13381fd0da1546d2 Mon Sep 17 00:00:00 2001 From: Heyi Guo Date: Wed, 23 Feb 2022 11:14:36 +0800 Subject: [PATCH 355/773] drivers/net/ftgmac100: fix DHCP potential failure with systemd DHCP failures were observed with systemd 247.6. The issue could be reproduced by rebooting Aspeed 2600 and then running ifconfig ethX down/up. It is caused by below procedures in the driver: 1. ftgmac100_open() enables net interface and call phy_start() 2. When PHY is link up, it calls netif_carrier_on() and then adjust_link callback 3. ftgmac100_adjust_link() will schedule the reset task 4. ftgmac100_reset_task() will then reset the MAC in another schedule After step 2, systemd will be notified to send DHCP discover packet, while the packet might be corrupted by MAC reset operation in step 4. Call ftgmac100_reset() directly instead of scheduling task to fix the issue. Signed-off-by: Heyi Guo Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index c1deb6e5d26c..d5356db7539a 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1402,8 +1402,17 @@ static void ftgmac100_adjust_link(struct net_device *netdev) /* Disable all interrupts */ iowrite32(0, priv->base + FTGMAC100_OFFSET_IER); - /* Reset the adapter asynchronously */ - schedule_work(&priv->reset_task); + /* Release phy lock to allow ftgmac100_reset to aquire it, keeping lock + * order consistent to prevent dead lock. + */ + if (netdev->phydev) + mutex_unlock(&netdev->phydev->lock); + + ftgmac100_reset(priv); + + if (netdev->phydev) + mutex_lock(&netdev->phydev->lock); + } static int ftgmac100_mii_probe(struct net_device *netdev) From ecbd4912a693b862e25cba0a6990a8c95b00721e Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 3 Feb 2022 12:54:16 +0100 Subject: [PATCH 356/773] drm/edid: Always set RGB444 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to fill the drm_display_info structure each time an EDID is read, the code currently will call drm_add_display_info with the parsed EDID. drm_add_display_info will then call drm_reset_display_info to reset all the fields to 0, and then set them to the proper value depending on the EDID. In the color_formats case, we will thus report that we don't support any color format, and then fill it back with RGB444 plus the additional formats described in the EDID Feature Support byte. However, since that byte only contains format-related bits since the 1.4 specification, this doesn't happen if the EDID is following an earlier specification. In turn, it means that for one of these EDID, we end up with color_formats set to 0. The EDID 1.3 specification never really specifies what it means by RGB exactly, but since both HDMI and DVI will use RGB444, it's fairly safe to assume it's supposed to be RGB444. Let's move the addition of RGB444 to color_formats earlier in drm_add_display_info() so that it's always set for a digital display. Fixes: da05a5a71ad8 ("drm: parse color format support for digital displays") Cc: Ville Syrjälä Reported-by: Matthias Reichl Signed-off-by: Maxime Ripard Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220203115416.1137308-1-maxime@cerno.tech --- drivers/gpu/drm/drm_edid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 12893e7be89b..f5f5de362ff2 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -5345,6 +5345,7 @@ u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edi if (!(edid->input & DRM_EDID_INPUT_DIGITAL)) return quirks; + info->color_formats |= DRM_COLOR_FORMAT_RGB444; drm_parse_cea_ext(connector, edid); /* @@ -5393,7 +5394,6 @@ u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edi DRM_DEBUG("%s: Assigning EDID-1.4 digital sink color depth as %d bpc.\n", connector->name, info->bpc); - info->color_formats |= DRM_COLOR_FORMAT_RGB444; if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB444) info->color_formats |= DRM_COLOR_FORMAT_YCRCB444; if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB422) From 363f6368603743072e5f318c668c632bccb097a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:07:15 +0100 Subject: [PATCH 357/773] nvme: don't return an error from nvme_configure_metadata When a fabrics controller claims to support an invalidate metadata configuration we already warn and disable metadata support. No need to also return an error during revalidation. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Daniel Wagner Tested-by: Kanchan Joshi --- drivers/nvme/host/core.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 469f23186159..5fc040b279bf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1723,7 +1723,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return 0; } -static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) +static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -1739,7 +1739,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - return 0; + return; + if (ctrl->ops->flags & NVME_F_FABRICS) { /* * The NVMe over Fabrics specification only supports metadata as @@ -1747,7 +1748,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) * remap the separate metadata buffer from the block layer. */ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) - return -EINVAL; + return; ns->features |= NVME_NS_EXT_LBAS; @@ -1774,8 +1775,6 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) else ns->features |= NVME_NS_METADATA_SUPPORTED; } - - return 0; } static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, @@ -1916,9 +1915,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[lbaf].ds; nvme_set_queue_limits(ns->ctrl, ns->queue); - ret = nvme_configure_metadata(ns, id); - if (ret) - goto out_unfreeze; + nvme_configure_metadata(ns, id); nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(ns->disk, ns, id); From 602e57c9799c19f27e440639deed3ec45cfe1651 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 14:14:58 +0100 Subject: [PATCH 358/773] nvme: also mark passthrough-only namespaces ready in nvme_update_ns_info Commit e7d65803e2bb ("nvme-multipath: revalidate paths during rescan") introduced the NVME_NS_READY flag, which nvme_path_is_disabled() uses to check if a path can be used or not. We also need to set this flag for devices that fail the ZNS feature validation and which are available through passthrough devices only to that they can be used in multipathing setups. Fixes: e7d65803e2bb ("nvme-multipath: revalidate paths during rescan") Reported-by: Kanchan Joshi Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Daniel Wagner Tested-by: Kanchan Joshi --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5fc040b279bf..fd4720d37cc0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1931,7 +1931,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (blk_queue_is_zoned(ns->queue)) { ret = nvme_revalidate_zones(ns); if (ret && !nvme_first_scan(ns->disk)) - goto out; + return ret; } if (nvme_ns_head_multipath(ns->head)) { @@ -1946,16 +1946,16 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) return 0; out_unfreeze: - blk_mq_unfreeze_queue(ns->disk->queue); -out: /* * If probing fails due an unsupported feature, hide the block device, * but still allow other access. */ if (ret == -ENODEV) { ns->disk->flags |= GENHD_FL_HIDDEN; + set_bit(NVME_NS_READY, &ns->flags); ret = 0; } + blk_mq_unfreeze_queue(ns->disk->queue); return ret; } From c2700d2886a87f83f31e0a301de1d2350b52c79b Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Sat, 22 Jan 2022 22:27:44 +0530 Subject: [PATCH 359/773] nvme-tcp: send H2CData PDUs based on MAXH2CDATA As per NVMe/TCP specification (revision 1.0a, section 3.6.2.3) Maximum Host to Controller Data length (MAXH2CDATA): Specifies the maximum number of PDU-Data bytes per H2CData PDU in bytes. This value is a multiple of dwords and should be no less than 4,096. Current code sets H2CData PDU data_length to r2t_length, it does not check MAXH2CDATA value. Fix this by setting H2CData PDU data_length to min(req->h2cdata_left, queue->maxh2cdata). Also validate MAXH2CDATA value returned by target in ICResp PDU, if it is not a multiple of dword or if it is less than 4096 return -EINVAL from nvme_tcp_init_connection(). Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 63 +++++++++++++++++++++++++++++++--------- include/linux/nvme-tcp.h | 1 + 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 891a36d02e7c..65e00c64a588 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -44,6 +44,8 @@ struct nvme_tcp_request { u32 data_len; u32 pdu_len; u32 pdu_sent; + u32 h2cdata_left; + u32 h2cdata_offset; u16 ttag; __le16 status; struct list_head entry; @@ -95,6 +97,7 @@ struct nvme_tcp_queue { struct nvme_tcp_request *request; int queue_size; + u32 maxh2cdata; size_t cmnd_capsule_len; struct nvme_tcp_ctrl *ctrl; unsigned long flags; @@ -572,23 +575,26 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, return ret; } -static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, - struct nvme_tcp_r2t_pdu *pdu) +static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_data_pdu *data = req->pdu; struct nvme_tcp_queue *queue = req->queue; struct request *rq = blk_mq_rq_from_pdu(req); + u32 h2cdata_sent = req->pdu_len; u8 hdgst = nvme_tcp_hdgst_len(queue); u8 ddgst = nvme_tcp_ddgst_len(queue); req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - req->pdu_len = le32_to_cpu(pdu->r2t_length); + req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata); req->pdu_sent = 0; + req->h2cdata_left -= req->pdu_len; + req->h2cdata_offset += h2cdata_sent; memset(data, 0, sizeof(*data)); data->hdr.type = nvme_tcp_h2c_data; - data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (!req->h2cdata_left) + data->hdr.flags = NVME_TCP_F_DATA_LAST; if (queue->hdr_digest) data->hdr.flags |= NVME_TCP_F_HDGST; if (queue->data_digest) @@ -597,9 +603,9 @@ static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.pdo = data->hdr.hlen + hdgst; data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); - data->ttag = pdu->ttag; + data->ttag = req->ttag; data->command_id = nvme_cid(rq); - data->data_offset = pdu->r2t_offset; + data->data_offset = cpu_to_le32(req->h2cdata_offset); data->data_length = cpu_to_le32(req->pdu_len); } @@ -609,6 +615,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, struct nvme_tcp_request *req; struct request *rq; u32 r2t_length = le32_to_cpu(pdu->r2t_length); + u32 r2t_offset = le32_to_cpu(pdu->r2t_offset); rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { @@ -633,14 +640,19 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return -EPROTO; } - if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { + if (unlikely(r2t_offset < req->data_sent)) { dev_err(queue->ctrl->ctrl.device, "req %d unexpected r2t offset %u (expected %zu)\n", - rq->tag, le32_to_cpu(pdu->r2t_offset), req->data_sent); + rq->tag, r2t_offset, req->data_sent); return -EPROTO; } - nvme_tcp_setup_h2c_data_pdu(req, pdu); + req->pdu_len = 0; + req->h2cdata_left = r2t_length; + req->h2cdata_offset = r2t_offset; + req->ttag = pdu->ttag; + + nvme_tcp_setup_h2c_data_pdu(req); nvme_tcp_queue_request(req, false, true); return 0; @@ -928,6 +940,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int req_data_len = req->data_len; + u32 h2cdata_left = req->h2cdata_left; while (true) { struct page *page = nvme_tcp_req_cur_page(req); @@ -972,7 +985,10 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) req->state = NVME_TCP_SEND_DDGST; req->offset = 0; } else { - nvme_tcp_done_send_req(queue); + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); } return 1; } @@ -1030,9 +1046,14 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) if (queue->hdr_digest && !req->offset) nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); + if (!req->h2cdata_left) + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); + else + ret = sock_no_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE); if (unlikely(ret <= 0)) return ret; @@ -1052,6 +1073,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; size_t offset = req->offset; + u32 h2cdata_left = req->h2cdata_left; int ret; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { @@ -1069,7 +1091,10 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) return ret; if (offset + ret == NVME_TCP_DIGEST_LENGTH) { - nvme_tcp_done_send_req(queue); + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); return 1; } @@ -1261,6 +1286,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) struct msghdr msg = {}; struct kvec iov; bool ctrl_hdgst, ctrl_ddgst; + u32 maxh2cdata; int ret; icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); @@ -1344,6 +1370,14 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } + maxh2cdata = le32_to_cpu(icresp->maxdata); + if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) { + pr_err("queue %d: invalid maxh2cdata returned %u\n", + nvme_tcp_queue_id(queue), maxh2cdata); + goto free_icresp; + } + queue->maxh2cdata = maxh2cdata; + ret = 0; free_icresp: kfree(icresp); @@ -2329,6 +2363,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; + req->h2cdata_left = 0; req->data_len = blk_rq_nr_phys_segments(rq) ? blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index 959e0bd9a913..75470159a194 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -12,6 +12,7 @@ #define NVME_TCP_DISC_PORT 8009 #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 +#define NVME_TCP_MIN_MAXH2CDATA 4096 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, From 7093f15291e95f16dfb5a93307eda3272bfe1108 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Jan 2022 15:21:20 +0800 Subject: [PATCH 360/773] btrfs: defrag: don't try to merge regular extents with preallocated extents [BUG] With older kernels (before v5.16), btrfs will defrag preallocated extents. While with newer kernels (v5.16 and newer) btrfs will not defrag preallocated extents, but it will defrag the extent just before the preallocated extent, even it's just a single sector. This can be exposed by the following small script: mkfs.btrfs -f $dev > /dev/null mount $dev $mnt xfs_io -f -c "pwrite 0 4k" -c sync -c "falloc 4k 16K" $mnt/file xfs_io -c "fiemap -v" $mnt/file btrfs fi defrag $mnt/file sync xfs_io -c "fiemap -v" $mnt/file The output looks like this on older kernels: /mnt/btrfs/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 26624..26631 8 0x0 1: [8..39]: 26632..26663 32 0x801 /mnt/btrfs/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..39]: 26664..26703 40 0x1 Which defrags the single sector along with the preallocated extent, and replace them with an regular extent into a new location (caused by data COW). This wastes most of the data IO just for the preallocated range. On the other hand, v5.16 is slightly better: /mnt/btrfs/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 26624..26631 8 0x0 1: [8..39]: 26632..26663 32 0x801 /mnt/btrfs/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 26664..26671 8 0x0 1: [8..39]: 26632..26663 32 0x801 The preallocated range is not defragged, but the sector before it still gets defragged, which has no need for it. [CAUSE] One of the function reused by the old and new behavior is defrag_check_next_extent(), it will determine if we should defrag current extent by checking the next one. It only checks if the next extent is a hole or inlined, but it doesn't check if it's preallocated. On the other hand, out of the function, both old and new kernel will reject preallocated extents. Such inconsistent behavior causes above behavior. [FIX] - Also check if next extent is preallocated If so, don't defrag current extent. - Add comments for each branch why we reject the extent This will reduce the IO caused by defrag ioctl and autodefrag. CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 218724e4edd6..1c36e437b027 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1050,19 +1050,24 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, bool locked) { struct extent_map *next; - bool ret = true; + bool ret = false; /* this is the last extent */ if (em->start + em->len >= i_size_read(inode)) return false; next = defrag_lookup_extent(inode, em->start + em->len, locked); + /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - ret = false; - else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > SZ_128K && next->block_len > SZ_128K)) - ret = false; - + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* Physically adjacent and large enough */ + if ((em->block_start + em->block_len == next->block_start) && + (em->block_len > SZ_128K && next->block_len > SZ_128K)) + goto out; + ret = true; +out: free_extent_map(next); return ret; } From 979b25c300dbcbcb750e88715018e04e854de6c6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Jan 2022 15:21:21 +0800 Subject: [PATCH 361/773] btrfs: defrag: don't defrag extents which are already at max capacity [BUG] For compressed extents, defrag ioctl will always try to defrag any compressed extents, wasting not only IO but also CPU time to compress/decompress: mkfs.btrfs -f $DEV mount -o compress $DEV $MNT xfs_io -f -c "pwrite -S 0xab 0 128K" $MNT/foobar sync xfs_io -f -c "pwrite -S 0xcd 128K 128K" $MNT/foobar sync echo "=== before ===" xfs_io -c "fiemap -v" $MNT/foobar btrfs filesystem defrag $MNT/foobar sync echo "=== after ===" xfs_io -c "fiemap -v" $MNT/foobar Then it shows the 2 128K extents just get COW for no extra benefit, with extra IO/CPU spent: === before === /mnt/btrfs/file1: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..255]: 26624..26879 256 0x8 1: [256..511]: 26632..26887 256 0x9 === after === /mnt/btrfs/file1: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..255]: 26640..26895 256 0x8 1: [256..511]: 26648..26903 256 0x9 This affects not only v5.16 (after the defrag rework), but also v5.15 (before the defrag rework). [CAUSE] From the very beginning, btrfs defrag never checks if one extent is already at its max capacity (128K for compressed extents, 128M otherwise). And the default extent size threshold is 256K, which is already beyond the compressed extent max size. This means, by default btrfs defrag ioctl will mark all compressed extent which is not adjacent to a hole/preallocated range for defrag. [FIX] Introduce a helper to grab the maximum extent size, and then in defrag_collect_targets() and defrag_check_next_extent(), reject extents which are already at their max capacity. Reported-by: Filipe Manana CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1c36e437b027..8e1589dd1c70 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1046,6 +1046,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } +static u32 get_extent_max_capacity(const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return BTRFS_MAX_EXTENT_SIZE; +} + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, bool locked) { @@ -1062,6 +1069,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, goto out; if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(em)) + goto out; /* Physically adjacent and large enough */ if ((em->block_start + em->block_len == next->block_start) && (em->block_len > SZ_128K && next->block_len > SZ_128K)) @@ -1262,6 +1275,13 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (range_len >= extent_thresh) goto next; + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(em)) + goto next; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, locked); if (!next_mergeable) { From 550f133f6959db927127111b50e483da3a7ce662 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Jan 2022 15:21:22 +0800 Subject: [PATCH 362/773] btrfs: defrag: remove an ambiguous condition for rejection From the very beginning of btrfs defrag, there is a check to reject extents which meet both conditions: - Physically adjacent We may want to defrag physically adjacent extents to reduce the number of extents or the size of subvolume tree. - Larger than 128K This may be there for compressed extents, but unfortunately 128K is exactly the max capacity for compressed extents. And the check is > 128K, thus it never rejects compressed extents. Furthermore, the compressed extent capacity bug is fixed by previous patch, there is no reason for that check anymore. The original check has a very small ranges to reject (the target extent size is > 128K, and default extent threshold is 256K), and for compressed extent it doesn't work at all. So it's better just to remove the rejection, and allow us to defrag physically adjacent extents. CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8e1589dd1c70..212157473ad8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1075,10 +1075,6 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, */ if (next->len >= get_extent_max_capacity(em)) goto out; - /* Physically adjacent and large enough */ - if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > SZ_128K && next->block_len > SZ_128K)) - goto out; ret = true; out: free_extent_map(next); From d5633b0dee02d7d25e93463a03709f11c71500e2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 Feb 2022 14:46:12 +0800 Subject: [PATCH 363/773] btrfs: defrag: bring back the old file extent search behavior For defrag, we don't really want to use btrfs_get_extent() to iterate all extent maps of an inode. The reasons are: - btrfs_get_extent() can merge extent maps And the result em has the higher generation of the two, causing defrag to mark unnecessary part of such merged large extent map. This in fact can result extra IO for autodefrag in v5.16+ kernels. However this patch is not going to completely solve the problem, as one can still using read() to trigger extent map reading, and got them merged. The completely solution for the extent map merging generation problem will come as an standalone fix. - btrfs_get_extent() caches the extent map result Normally it's fine, but for defrag the target range may not get another read/write for a long long time. Such cache would only increase the memory usage. - btrfs_get_extent() doesn't skip older extent map Unlike the old find_new_extent() which uses btrfs_search_forward() to skip the older subtree, thus it will pick up unnecessary extent maps. This patch will fix the regression by introducing defrag_get_extent() to replace the btrfs_get_extent() call. This helper will: - Not cache the file extent we found It will search the file extent and manually convert it to em. - Use btrfs_search_forward() to skip entire ranges which is modified in the past This should reduce the IO for autodefrag. Reported-by: Filipe Manana Fixes: 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 161 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 212157473ad8..ffebd420829e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1012,8 +1012,155 @@ out: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. + */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - bool locked) + u64 newer_than, bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -1035,7 +1182,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* get the big lock and read metadata off disk */ if (!locked) lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) unlock_extent_cached(io_tree, start, end, &cached); @@ -1063,7 +1210,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len, locked); + /* + * We want to check if the next extent can be merged with the current + * one, which can be an extent created in a past generation, so we pass + * a minimum generation of 0 to defrag_lookup_extent(). + */ + next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) goto out; @@ -1214,7 +1366,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, u64 range_len; last_is_target = false; - em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + em = defrag_lookup_extent(&inode->vfs_inode, cur, + newer_than, locked); if (!em) break; From 199257a78bb01341c3ba6e85bdcf3a2e6e452c6d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 Feb 2022 14:46:13 +0800 Subject: [PATCH 364/773] btrfs: defrag: don't use merged extent map for their generation check For extent maps, if they are not compressed extents and are adjacent by logical addresses and file offsets, they can be merged into one larger extent map. Such merged extent map will have the higher generation of all the original ones. But this brings a problem for autodefrag, as it relies on accurate extent_map::generation to determine if one extent should be defragged. For merged extent maps, their higher generation can mark some older extents to be defragged while the original extent map doesn't meet the minimal generation threshold. Thus this will cause extra IO. So solve the problem, here we introduce a new flag, EXTENT_FLAG_MERGED, to indicate if the extent map is merged from one or more ems. And for autodefrag, if we find a merged extent map, and its generation meets the generation requirement, we just don't use this one, and go back to defrag_get_extent() to read extent maps from subvolume trees. This could cause more read IO, but should result less defrag data write, so in the long run it should be a win for autodefrag. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 ++ fs/btrfs/extent_map.h | 8 ++++++++ fs/btrfs/ioctl.c | 14 ++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5a36add21305..c28ceddefae4 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); free_extent_map(merge); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e217337dff9..d2fa32ffe304 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -25,6 +25,8 @@ enum { EXTENT_FLAG_FILLING, /* filesystem extent mapping type */ EXTENT_FLAG_FS_MAPPING, + /* This em is merged from two or more physically adjacent ems */ + EXTENT_FLAG_MERGED, }; struct extent_map { @@ -40,6 +42,12 @@ struct extent_map { u64 ram_bytes; u64 block_start; u64 block_len; + + /* + * Generation of the extent map, for merged em it's the highest + * generation of all merged ems. + * For non-merged extents, it's from btrfs_file_extent_item::generation. + */ u64 generation; unsigned long flags; /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ffebd420829e..1398d7b64c4e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1175,6 +1175,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + if (!em) { struct extent_state *cached = NULL; u64 end = start + sectorsize - 1; From 26fbac2517fcad34fa3f950151fd4c0240fb2935 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Feb 2022 18:20:59 +0100 Subject: [PATCH 365/773] btrfs: autodefrag: only scan one inode once Although we have btrfs_requeue_inode_defrag(), for autodefrag we are still just exhausting all inode_defrag items in the tree. This means, it doesn't make much difference to requeue an inode_defrag, other than scan the inode from the beginning till its end. Change the behaviour to always scan from offset 0 of an inode, and till the end. By this we get the following benefit: - Straight-forward code - No more re-queue related check - Fewer members in inode_defrag We still keep the same btrfs_get_fs_root() and btrfs_iget() check for each loop, and added extra should_auto_defrag() check per-loop. Note: the patch needs to be backported and is intentionally written to minimize the diff size, code will be cleaned up later. CC: stable@vger.kernel.org # 5.16 Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 84 +++++++++++++------------------------------------ 1 file changed, 22 insertions(+), 62 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 11204dbbe053..36a81368ed46 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -49,12 +49,6 @@ struct inode_defrag { /* root objectid */ u64 root; - - /* last offset we were able to defrag */ - u64 last_offset; - - /* if we've wrapped around back to zero once already */ - int cycled; }; static int __compare_inode_defrag(struct inode_defrag *defrag1, @@ -107,8 +101,6 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; - if (defrag->last_offset > entry->last_offset) - entry->last_offset = defrag->last_offset; return -EEXIST; } } @@ -178,34 +170,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, return 0; } -/* - * Requeue the defrag object. If there is a defrag object that points to - * the same inode in the tree, we will merge them together (by - * __btrfs_add_inode_defrag()) and free the one that we want to requeue. - */ -static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - - if (!__need_auto_defrag(fs_info)) - goto out; - - /* - * Here we don't check the IN_DEFRAG flag, because we need merge - * them together. - */ - spin_lock(&fs_info->defrag_inodes_lock); - ret = __btrfs_add_inode_defrag(inode, defrag); - spin_unlock(&fs_info->defrag_inodes_lock); - if (ret) - goto out; - return; -out: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); -} - /* * pick the defragable inode that we want, if it doesn't exist, we will get * the next one. @@ -278,8 +242,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_root *inode_root; struct inode *inode; struct btrfs_ioctl_defrag_range_args range; - int num_defrag; - int ret; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; /* get the inode */ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); @@ -295,39 +265,29 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, goto cleanup; } + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; - range.start = defrag->last_offset; + range.start = cur; sb_start_write(fs_info->sb); - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == BTRFS_DEFRAG_BATCH) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - iput(inode); - return 0; + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + cleanup: kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; From dd2288f4a020d693360e3e8d72f8b9d9c25f5ef6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 18 Feb 2022 09:25:20 +0100 Subject: [PATCH 366/773] parisc/unaligned: Fix fldd and fstd unaligned handlers on 32-bit kernel Usually the kernel provides fixup routines to emulate the fldd and fstd floating-point instructions if they load or store 8-byte from/to a not natuarally aligned memory location. On a 32-bit kernel I noticed that those unaligned handlers didn't worked and instead the application got a SEGV. While checking the code I found two problems: First, the OPCODE_FLDD_L and OPCODE_FSTD_L cases were ifdef'ed out by the CONFIG_PA20 option, and as such those weren't built on a pure 32-bit kernel. This is now fixed by moving the CONFIG_PA20 #ifdef to prevent the compilation of OPCODE_LDD_L and OPCODE_FSTD_L only, and handling the fldd and fstd instructions. The second problem are two bugs in the 32-bit inline assembly code, where the wrong registers where used. The calculation of the natural alignment used %2 (vall) instead of %3 (ior), and the first word was stored back to address %1 (valh) instead of %3 (ior). Signed-off-by: Helge Deller Cc: stable@vger.kernel.org --- arch/parisc/kernel/unaligned.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 237d20dd5622..a238b7fe8908 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -397,7 +397,7 @@ static int emulate_std(struct pt_regs *regs, int frreg, int flop) __asm__ __volatile__ ( " mtsp %4, %%sr1\n" " zdep %2, 29, 2, %%r19\n" -" dep %%r0, 31, 2, %2\n" +" dep %%r0, 31, 2, %3\n" " mtsar %%r19\n" " zvdepi -2, 32, %%r19\n" "1: ldw 0(%%sr1,%3),%%r20\n" @@ -409,7 +409,7 @@ static int emulate_std(struct pt_regs *regs, int frreg, int flop) " andcm %%r21, %%r19, %%r21\n" " or %1, %%r20, %1\n" " or %2, %%r21, %2\n" -"3: stw %1,0(%%sr1,%1)\n" +"3: stw %1,0(%%sr1,%3)\n" "4: stw %%r1,4(%%sr1,%3)\n" "5: stw %2,8(%%sr1,%3)\n" " copy %%r0, %0\n" @@ -596,7 +596,6 @@ void handle_unaligned(struct pt_regs *regs) ret = ERR_NOTHANDLED; /* "undefined", but lets kill them. */ break; } -#ifdef CONFIG_PA20 switch (regs->iir & OPCODE2_MASK) { case OPCODE_FLDD_L: @@ -607,14 +606,15 @@ void handle_unaligned(struct pt_regs *regs) flop=1; ret = emulate_std(regs, R2(regs->iir),1); break; +#ifdef CONFIG_PA20 case OPCODE_LDD_L: ret = emulate_ldd(regs, R2(regs->iir),0); break; case OPCODE_STD_L: ret = emulate_std(regs, R2(regs->iir),0); break; - } #endif + } switch (regs->iir & OPCODE3_MASK) { case OPCODE_FLDW_L: From a97279836867b1cb50a3d4f0b1bf60e0abe6d46c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 18 Feb 2022 23:40:14 +0100 Subject: [PATCH 367/773] parisc/unaligned: Fix ldw() and stw() unalignment handlers Fix 3 bugs: a) emulate_stw() doesn't return the error code value, so faulting instructions are not reported and aborted. b) Tell emulate_ldw() to handle fldw_l as floating point instruction c) Tell emulate_ldw() to handle ldw_m as integer instruction Signed-off-by: Helge Deller Cc: stable@vger.kernel.org --- arch/parisc/kernel/unaligned.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index a238b7fe8908..286cec4d86d7 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -340,7 +340,7 @@ static int emulate_stw(struct pt_regs *regs, int frreg, int flop) : "r" (val), "r" (regs->ior), "r" (regs->isr) : "r19", "r20", "r21", "r22", "r1", FIXUP_BRANCH_CLOBBER ); - return 0; + return ret; } static int emulate_std(struct pt_regs *regs, int frreg, int flop) { @@ -619,10 +619,10 @@ void handle_unaligned(struct pt_regs *regs) { case OPCODE_FLDW_L: flop=1; - ret = emulate_ldw(regs, R2(regs->iir),0); + ret = emulate_ldw(regs, R2(regs->iir), 1); break; case OPCODE_LDW_M: - ret = emulate_ldw(regs, R2(regs->iir),1); + ret = emulate_ldw(regs, R2(regs->iir), 0); break; case OPCODE_FSTW_L: From 3f1271b54edcc692da5a3663f2aa2a64781f9bc3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 22 Feb 2022 11:08:01 -0500 Subject: [PATCH 368/773] PCI: Mark all AMD Navi10 and Navi14 GPU ATS as broken MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are enough VBIOS escapes without the proper workaround that some users still hit this. Microsoft never productized ATS on Windows so OEM platforms that were Windows-only didn't always validate ATS. The advantages of ATS are not worth it compared to the potential instabilities on harvested boards. Disable ATS on all Navi10 and Navi14 boards. Symptoms include: amdgpu 0000:07:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0007 address=0xffffc02000 flags=0x0000] AMD-Vi: Event logged [IO_PAGE_FAULT device=07:00.0 domain=0x0007 address=0xffffc02000 flags=0x0000] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 timeout, signaled seq=6047, emitted seq=6049 amdgpu 0000:07:00.0: amdgpu: GPU reset begin! amdgpu 0000:07:00.0: amdgpu: GPU reset succeeded, trying to resume amdgpu 0000:07:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring sdma0 test failed (-110) [drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block failed -110 amdgpu 0000:07:00.0: amdgpu: GPU reset(1) failed Related commits: e8946a53e2a6 ("PCI: Mark AMD Navi14 GPU ATS as broken") a2da5d8cc0b0 ("PCI: Mark AMD Raven iGPU ATS as broken in some platforms") 45beb31d3afb ("PCI: Mark AMD Navi10 GPU rev 0x00 ATS as broken") 5e89cd303e3a ("PCI: Mark AMD Navi14 GPU rev 0xc5 ATS as broken") d28ca864c493 ("PCI: Mark AMD Stoney Radeon R7 GPU ATS as broken") 9b44b0b09dec ("PCI: Mark AMD Stoney GPU ATS as broken") [bhelgaas: add symptoms and related commits] Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1760 Link: https://lore.kernel.org/r/20220222160801.841643-1-alexander.deucher@amd.com Signed-off-by: Alex Deucher Signed-off-by: Bjorn Helgaas Acked-by: Christian König Acked-by: Guchun Chen --- drivers/pci/quirks.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index d2dd6a6cda60..65f7f6b0576c 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5344,11 +5344,6 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0422, quirk_no_ext_tags); */ static void quirk_amd_harvest_no_ats(struct pci_dev *pdev) { - if ((pdev->device == 0x7312 && pdev->revision != 0x00) || - (pdev->device == 0x7340 && pdev->revision != 0xc5) || - (pdev->device == 0x7341 && pdev->revision != 0x00)) - return; - if (pdev->device == 0x15d8) { if (pdev->revision == 0xcf && pdev->subsystem_vendor == 0xea50 && @@ -5370,10 +5365,19 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x98e4, quirk_amd_harvest_no_ats); /* AMD Iceland dGPU */ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x6900, quirk_amd_harvest_no_ats); /* AMD Navi10 dGPU */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7310, quirk_amd_harvest_no_ats); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7312, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7318, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7319, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x731a, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x731b, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x731e, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x731f, quirk_amd_harvest_no_ats); /* AMD Navi14 dGPU */ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7340, quirk_amd_harvest_no_ats); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7341, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7347, quirk_amd_harvest_no_ats); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x734f, quirk_amd_harvest_no_ats); /* AMD Raven platform iGPU */ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x15d8, quirk_amd_harvest_no_ats); #endif /* CONFIG_PCI_ATS */ From 515415d316168c6521d74ea8280287e28d7303e6 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 19 Feb 2022 13:07:55 +0100 Subject: [PATCH 369/773] ARM: boot: dts: bcm2711: Fix HVS register range While the HVS has the same context memory size in the BCM2711 than in the previous SoCs, the range allocated to the registers doubled and it now takes 16k + 16k, compared to 8k + 16k before. The KMS driver will use the whole context RAM though, eventually resulting in a pointer dereference error when we access the higher half of the context memory since it hasn't been mapped. Fixes: 4564363351e2 ("ARM: dts: bcm2711: Enable the display pipeline") Signed-off-by: Maxime Ripard Signed-off-by: Stefan Wahren Signed-off-by: Florian Fainelli --- arch/arm/boot/dts/bcm2711.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/bcm2711.dtsi b/arch/arm/boot/dts/bcm2711.dtsi index dff18fc9a906..21294f775a20 100644 --- a/arch/arm/boot/dts/bcm2711.dtsi +++ b/arch/arm/boot/dts/bcm2711.dtsi @@ -290,6 +290,7 @@ hvs: hvs@7e400000 { compatible = "brcm,bcm2711-hvs"; + reg = <0x7e400000 0x8000>; interrupts = ; }; From 081bdc9fe05bb23248f5effb6f811da3da4b8252 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Feb 2022 13:05:11 -0800 Subject: [PATCH 370/773] RDMA/ib_srp: Fix a deadlock Remove the flush_workqueue(system_long_wq) call since flushing system_long_wq is deadlock-prone and since that call is redundant with a preceding cancel_work_sync() Link: https://lore.kernel.org/r/20220215210511.28303-3-bvanassche@acm.org Fixes: ef6c49d87c34 ("IB/srp: Eliminate state SRP_TARGET_DEAD") Reported-by: syzbot+831661966588c802aae9@syzkaller.appspotmail.com Signed-off-by: Bart Van Assche Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e174e853f8a4..285b766e4e70 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -4047,9 +4047,11 @@ static void srp_remove_one(struct ib_device *device, void *client_data) spin_unlock(&host->target_lock); /* - * Wait for tl_err and target port removal tasks. + * srp_queue_remove_work() queues a call to + * srp_remove_target(). The latter function cancels + * target->tl_err_work so waiting for the remove works to + * finish is sufficient. */ - flush_workqueue(system_long_wq); flush_workqueue(srp_remove_wq); kfree(host); From ae42f9288846353982e2eab181fb41e7fd8bf60f Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Thu, 17 Feb 2022 01:56:55 +0530 Subject: [PATCH 371/773] gpio: Return EPROBE_DEFER if gc->to_irq is NULL We are racing the registering of .to_irq when probing the i2c driver. This results in random failure of touchscreen devices. Following explains the race condition better. [gpio driver] gpio driver registers gpio chip [gpio consumer] gpio is acquired [gpio consumer] gpiod_to_irq() fails with -ENXIO [gpio driver] gpio driver registers irqchip gpiod_to_irq works at this point, but -ENXIO is fatal We could see the following errors in dmesg logs when gc->to_irq is NULL [2.101857] i2c_hid i2c-FTS3528:00: HID over i2c has not been provided an Int IRQ [2.101953] i2c_hid: probe of i2c-FTS3528:00 failed with error -22 To avoid this situation, defer probing until to_irq is registered. Returning -EPROBE_DEFER would be the first step towards avoiding the failure of devices due to the race in registration of .to_irq. Final solution to this issue would be to avoid using gc irq members until they are fully initialized. This issue has been reported many times in past and people have been using workarounds like changing the pinctrl_amd to built-in instead of loading it as a module or by adding a softdep for pinctrl_amd into the config file. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209413 Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Reported-by: kernel test robot Signed-off-by: Shreeya Patel Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 3859911b61e9..a3d14277f17c 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3147,6 +3147,16 @@ int gpiod_to_irq(const struct gpio_desc *desc) return retirq; } +#ifdef CONFIG_GPIOLIB_IRQCHIP + if (gc->irq.chip) { + /* + * Avoid race condition with other code, which tries to lookup + * an IRQ before the irqchip has been properly registered, + * i.e. while gpiochip is still being brought up. + */ + return -EPROBE_DEFER; + } +#endif return -ENXIO; } EXPORT_SYMBOL_GPL(gpiod_to_irq); From 7294863a6f01248d72b61d38478978d638641bee Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 1 Feb 2022 10:26:33 -0600 Subject: [PATCH 372/773] drm/amd: Check if ASPM is enabled from PCIe subsystem commit 0064b0ce85bb ("drm/amd/pm: enable ASPM by default") enabled ASPM by default but a variety of hardware configurations it turns out that this caused a regression. * PPC64LE hardware does not support ASPM at a hardware level. CONFIG_PCIEASPM is often disabled on these architectures. * Some dGPUs on ALD platforms don't work with ASPM enabled and PCIe subsystem disables it Check with the PCIe subsystem to see that ASPM has been enabled or not. Fixes: 0064b0ce85bb ("drm/amd/pm: enable ASPM by default") Link: https://wiki.raptorcs.com/w/images/a/ad/P9_PHB_version1.0_27July2018_pub.pdf Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1723 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1739 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1885 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1907 Tested-by: koba.ko@canonical.com Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 63a089992645..0ead08ba58c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2011,6 +2011,9 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, return -ENODEV; } + if (amdgpu_aspm == -1 && !pcie_aspm_enabled(pdev)) + amdgpu_aspm = 0; + if (amdgpu_virtual_display || amdgpu_device_asic_has_dc_support(flags & AMD_ASIC_MASK)) supports_atomic = true; From 3743e7f6fcb938b7d8b7967e6a9442805e269b3d Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Tue, 25 Jan 2022 12:04:34 -0500 Subject: [PATCH 373/773] drm/amd/display: Fix stream->link_enc unassigned during stream removal [Why] Found when running igt@kms_atomic. Userspace attempts to do a TEST_COMMIT when 0 streams which calls dc_remove_stream_from_ctx. This in turn calls link_enc_unassign which ends up modifying stream->link = NULL directly, causing the global link_enc to be removed preventing further link activity and future link validation from passing. [How] We take care of link_enc unassignment at the start of link_enc_cfg_link_encs_assign so this call is no longer necessary. Fixes global state from being modified while unlocked. Reviewed-by: Jimmy Kizito Acked-by: Jasdeep Dhillon Signed-off-by: Nicholas Kazlauskas Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index b3912ff9dc91..18757c158523 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -1964,10 +1964,6 @@ enum dc_status dc_remove_stream_from_ctx( dc->res_pool, del_pipe->stream_res.stream_enc, false); - /* Release link encoder from stream in new dc_state. */ - if (dc->res_pool->funcs->link_enc_unassign) - dc->res_pool->funcs->link_enc_unassign(new_ctx, del_pipe->stream); - #if defined(CONFIG_DRM_AMD_DC_DCN) if (is_dp_128b_132b_signal(del_pipe)) { update_hpo_dp_stream_engine_usage( From 1e2be869c8a7247a7253ef4f461f85e2f5931b95 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Thu, 17 Feb 2022 15:29:41 +0800 Subject: [PATCH 374/773] drm/amdgpu: do not enable asic reset for raven2 The GPU reset function of raven2 is not maintained or tested, so it should be very unstable. Now the amdgpu_asic_reset function is added to amdgpu_pmops_suspend, which causes the S3 test of raven2 to fail, so the asic_reset of raven2 is ignored here. Fixes: daf8de0874ab5b ("drm/amdgpu: always reset the asic in suspend (v2)") Signed-off-by: Chen Gong Acked-by: Alex Deucher Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc15.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index d0e37f7f196c..12f80fdc1fbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -619,8 +619,8 @@ soc15_asic_reset_method(struct amdgpu_device *adev) static int soc15_asic_reset(struct amdgpu_device *adev) { /* original raven doesn't have full asic reset */ - if ((adev->apu_flags & AMD_APU_IS_RAVEN) && - !(adev->apu_flags & AMD_APU_IS_RAVEN2)) + if ((adev->apu_flags & AMD_APU_IS_RAVEN) || + (adev->apu_flags & AMD_APU_IS_RAVEN2)) return 0; switch (soc15_asic_reset_method(adev)) { From 97c61e0b7c596cc5f683da30289f92c2e1b4b799 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 18 Feb 2022 12:57:52 +0800 Subject: [PATCH 375/773] Revert "drm/amdgpu: add modifiers in amdgpu_vkms_plane_init()" This reverts commit 4046afcebfc3c8c0dd5666c2671b2c192b344f78. No need to support modifier in virtual kms, otherwise, in SRIOV mode, when lanuching X server, set crtc will fail due to mismatch between primary plane modifier and framebuffer modifier. Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c index d99c8779b51e..5224d9a39737 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c @@ -391,7 +391,6 @@ static struct drm_plane *amdgpu_vkms_plane_init(struct drm_device *dev, int index) { struct drm_plane *plane; - uint64_t modifiers[] = {DRM_FORMAT_MOD_LINEAR, DRM_FORMAT_MOD_INVALID}; int ret; plane = kzalloc(sizeof(*plane), GFP_KERNEL); @@ -402,7 +401,7 @@ static struct drm_plane *amdgpu_vkms_plane_init(struct drm_device *dev, &amdgpu_vkms_plane_funcs, amdgpu_vkms_formats, ARRAY_SIZE(amdgpu_vkms_formats), - modifiers, type, NULL); + NULL, type, NULL); if (ret) { kfree(plane); return ERR_PTR(ret); From e2b993302f40c4eb714ecf896dd9e1c5be7d4cd7 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 18 Feb 2022 13:05:26 +0800 Subject: [PATCH 376/773] drm/amdgpu: bypass tiling flag check in virtual display case (v2) vkms leverages common amdgpu framebuffer creation, and also as it does not support FB modifier, there is no need to check tiling flags when initing framebuffer when virtual display is enabled. This can fix below calltrace: amdgpu 0000:00:08.0: GFX9+ requires FB check based on format modifier WARNING: CPU: 0 PID: 1023 at drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:1150 amdgpu_display_framebuffer_init+0x8e7/0xb40 [amdgpu] v2: check adev->enable_virtual_display instead as vkms can be enabled in bare metal as well. Signed-off-by: Leslie Shi Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 82011e75ed85..c4387b38229c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -1141,7 +1141,7 @@ int amdgpu_display_framebuffer_init(struct drm_device *dev, if (ret) return ret; - if (!dev->mode_config.allow_fb_modifiers) { + if (!dev->mode_config.allow_fb_modifiers && !adev->enable_virtual_display) { drm_WARN_ONCE(dev, adev->family >= AMDGPU_FAMILY_AI, "GFX9+ requires FB check based on format modifier\n"); ret = check_tiling_flags_gfx6(rfb); From c1a66c3bc425ff93774fb2f6eefa67b83170dd7e Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Mon, 21 Feb 2022 17:53:56 +0800 Subject: [PATCH 377/773] drm/amdgpu: check vm ready by amdgpu_vm->evicting flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workstation application ANSA/META v21.1.4 get this error dmesg when running CI test suite provided by ANSA/META: [drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-16) This is caused by: 1. create a 256MB buffer in invisible VRAM 2. CPU map the buffer and access it causes vm_fault and try to move it to visible VRAM 3. force visible VRAM space and traverse all VRAM bos to check if evicting this bo is valuable 4. when checking a VM bo (in invisible VRAM), amdgpu_vm_evictable() will set amdgpu_vm->evicting, but latter due to not in visible VRAM, won't really evict it so not add it to amdgpu_vm->evicted 5. before next CS to clear the amdgpu_vm->evicting, user VM ops ioctl will pass amdgpu_vm_ready() (check amdgpu_vm->evicted) but fail in amdgpu_vm_bo_update_mapping() (check amdgpu_vm->evicting) and get this error log This error won't affect functionality as next CS will finish the waiting VM ops. But we'd better clear the error log by checking the amdgpu_vm->evicting flag in amdgpu_vm_ready() to stop calling amdgpu_vm_bo_update_mapping() later. Another reason is amdgpu_vm->evicted list holds all BOs (both user buffer and page table), but only page table BOs' eviction prevent VM ops. amdgpu_vm->evicting flag is set only for page table BOs, so we should use evicting flag instead of evicted list in amdgpu_vm_ready(). The side effect of this change is: previously blocked VM op (user buffer in "evicted" list but no page table in it) gets done immediately. v2: update commit comments. Acked-by: Paul Menzel Reviewed-by: Christian König Signed-off-by: Qiang Yu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index b37fc7d7d2c7..d62190b3dd9b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -768,11 +768,16 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, * Check if all VM PDs/PTs are ready for updates * * Returns: - * True if eviction list is empty. + * True if VM is not evicting. */ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { - return list_empty(&vm->evicted); + bool ret; + + amdgpu_vm_eviction_lock(vm); + ret = !vm->evicting; + amdgpu_vm_eviction_unlock(vm); + return ret; } /** From f908a35b22180c4da64cf2647e4f5f0cd3054da7 Mon Sep 17 00:00:00 2001 From: Meir Lichtinger Date: Mon, 10 Jan 2022 10:14:41 +0200 Subject: [PATCH 378/773] net/mlx5: Update the list of the PCI supported devices Add the upcoming BlueField-4 and ConnectX-8 device IDs. Fixes: 2e9d3e83ab82 ("net/mlx5: Update the list of the PCI supported devices") Signed-off-by: Meir Lichtinger Reviewed-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 2c774f367199..13f913c13a2d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1840,10 +1840,12 @@ static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */ { PCI_VDEVICE(MELLANOX, 0x101f) }, /* ConnectX-6 LX */ { PCI_VDEVICE(MELLANOX, 0x1021) }, /* ConnectX-7 */ + { PCI_VDEVICE(MELLANOX, 0x1023) }, /* ConnectX-8 */ { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ { PCI_VDEVICE(MELLANOX, 0xa2d6) }, /* BlueField-2 integrated ConnectX-6 Dx network controller */ { PCI_VDEVICE(MELLANOX, 0xa2dc) }, /* BlueField-3 integrated ConnectX-7 network controller */ + { PCI_VDEVICE(MELLANOX, 0xa2df) }, /* BlueField-4 integrated ConnectX-8 network controller */ { 0, } }; From e5b2bc30c21139ae10f0e56989389d0bc7b7b1d6 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Fri, 24 Dec 2021 01:07:30 +0200 Subject: [PATCH 379/773] net/mlx5: DR, Cache STE shadow memory During rule insertion on each ICM memory chunk we also allocate shadow memory used for management. This includes the hw_ste, dr_ste and miss list per entry. Since the scale of these allocations is large we noticed a performance hiccup that happens once malloc and free are stressed. In extreme usecases when ~1M chunks are freed at once, it might take up to 40 seconds to complete this, up to the point the kernel sees this as self-detected stall on CPU: rcu: INFO: rcu_sched self-detected stall on CPU To resolve this we will increase the reuse of shadow memory. Doing this we see that a time in the aforementioned usecase dropped from ~40 seconds to ~8-10 seconds. Fixes: 29cf8febd185 ("net/mlx5: DR, ICM pool memory allocator") Signed-off-by: Alex Vesker Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_icm_pool.c | 109 ++++++++++++------ .../mellanox/mlx5/core/steering/mlx5dr.h | 5 + 2 files changed, 79 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c index 7f6fd9c5e371..f496b7e9401b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c @@ -136,37 +136,35 @@ static void dr_icm_pool_mr_destroy(struct mlx5dr_icm_mr *icm_mr) kvfree(icm_mr); } -static int dr_icm_chunk_ste_init(struct mlx5dr_icm_chunk *chunk) +static int dr_icm_buddy_get_ste_size(struct mlx5dr_icm_buddy_mem *buddy) { - chunk->ste_arr = kvzalloc(chunk->num_of_entries * - sizeof(chunk->ste_arr[0]), GFP_KERNEL); - if (!chunk->ste_arr) - return -ENOMEM; + /* We support only one type of STE size, both for ConnectX-5 and later + * devices. Once the support for match STE which has a larger tag is + * added (32B instead of 16B), the STE size for devices later than + * ConnectX-5 needs to account for that. + */ + return DR_STE_SIZE_REDUCED; +} - chunk->hw_ste_arr = kvzalloc(chunk->num_of_entries * - DR_STE_SIZE_REDUCED, GFP_KERNEL); - if (!chunk->hw_ste_arr) - goto out_free_ste_arr; +static void dr_icm_chunk_ste_init(struct mlx5dr_icm_chunk *chunk, int offset) +{ + struct mlx5dr_icm_buddy_mem *buddy = chunk->buddy_mem; + int index = offset / DR_STE_SIZE; - chunk->miss_list = kvmalloc(chunk->num_of_entries * - sizeof(chunk->miss_list[0]), GFP_KERNEL); - if (!chunk->miss_list) - goto out_free_hw_ste_arr; - - return 0; - -out_free_hw_ste_arr: - kvfree(chunk->hw_ste_arr); -out_free_ste_arr: - kvfree(chunk->ste_arr); - return -ENOMEM; + chunk->ste_arr = &buddy->ste_arr[index]; + chunk->miss_list = &buddy->miss_list[index]; + chunk->hw_ste_arr = buddy->hw_ste_arr + + index * dr_icm_buddy_get_ste_size(buddy); } static void dr_icm_chunk_ste_cleanup(struct mlx5dr_icm_chunk *chunk) { - kvfree(chunk->miss_list); - kvfree(chunk->hw_ste_arr); - kvfree(chunk->ste_arr); + struct mlx5dr_icm_buddy_mem *buddy = chunk->buddy_mem; + + memset(chunk->hw_ste_arr, 0, + chunk->num_of_entries * dr_icm_buddy_get_ste_size(buddy)); + memset(chunk->ste_arr, 0, + chunk->num_of_entries * sizeof(chunk->ste_arr[0])); } static enum mlx5dr_icm_type @@ -189,6 +187,44 @@ static void dr_icm_chunk_destroy(struct mlx5dr_icm_chunk *chunk, kvfree(chunk); } +static int dr_icm_buddy_init_ste_cache(struct mlx5dr_icm_buddy_mem *buddy) +{ + int num_of_entries = + mlx5dr_icm_pool_chunk_size_to_entries(buddy->pool->max_log_chunk_sz); + + buddy->ste_arr = kvcalloc(num_of_entries, + sizeof(struct mlx5dr_ste), GFP_KERNEL); + if (!buddy->ste_arr) + return -ENOMEM; + + /* Preallocate full STE size on non-ConnectX-5 devices since + * we need to support both full and reduced with the same cache. + */ + buddy->hw_ste_arr = kvcalloc(num_of_entries, + dr_icm_buddy_get_ste_size(buddy), GFP_KERNEL); + if (!buddy->hw_ste_arr) + goto free_ste_arr; + + buddy->miss_list = kvmalloc(num_of_entries * sizeof(struct list_head), GFP_KERNEL); + if (!buddy->miss_list) + goto free_hw_ste_arr; + + return 0; + +free_hw_ste_arr: + kvfree(buddy->hw_ste_arr); +free_ste_arr: + kvfree(buddy->ste_arr); + return -ENOMEM; +} + +static void dr_icm_buddy_cleanup_ste_cache(struct mlx5dr_icm_buddy_mem *buddy) +{ + kvfree(buddy->ste_arr); + kvfree(buddy->hw_ste_arr); + kvfree(buddy->miss_list); +} + static int dr_icm_buddy_create(struct mlx5dr_icm_pool *pool) { struct mlx5dr_icm_buddy_mem *buddy; @@ -208,11 +244,19 @@ static int dr_icm_buddy_create(struct mlx5dr_icm_pool *pool) buddy->icm_mr = icm_mr; buddy->pool = pool; + if (pool->icm_type == DR_ICM_TYPE_STE) { + /* Reduce allocations by preallocating and reusing the STE structures */ + if (dr_icm_buddy_init_ste_cache(buddy)) + goto err_cleanup_buddy; + } + /* add it to the -start- of the list in order to search in it first */ list_add(&buddy->list_node, &pool->buddy_mem_list); return 0; +err_cleanup_buddy: + mlx5dr_buddy_cleanup(buddy); err_free_buddy: kvfree(buddy); free_mr: @@ -234,6 +278,9 @@ static void dr_icm_buddy_destroy(struct mlx5dr_icm_buddy_mem *buddy) mlx5dr_buddy_cleanup(buddy); + if (buddy->pool->icm_type == DR_ICM_TYPE_STE) + dr_icm_buddy_cleanup_ste_cache(buddy); + kvfree(buddy); } @@ -261,26 +308,18 @@ dr_icm_chunk_create(struct mlx5dr_icm_pool *pool, chunk->byte_size = mlx5dr_icm_pool_chunk_size_to_byte(chunk_size, pool->icm_type); chunk->seg = seg; + chunk->buddy_mem = buddy_mem_pool; - if (pool->icm_type == DR_ICM_TYPE_STE && dr_icm_chunk_ste_init(chunk)) { - mlx5dr_err(pool->dmn, - "Failed to init ste arrays (order: %d)\n", - chunk_size); - goto out_free_chunk; - } + if (pool->icm_type == DR_ICM_TYPE_STE) + dr_icm_chunk_ste_init(chunk, offset); buddy_mem_pool->used_memory += chunk->byte_size; - chunk->buddy_mem = buddy_mem_pool; INIT_LIST_HEAD(&chunk->chunk_list); /* chunk now is part of the used_list */ list_add_tail(&chunk->chunk_list, &buddy_mem_pool->used_list); return chunk; - -out_free_chunk: - kvfree(chunk); - return NULL; } static bool dr_icm_pool_is_sync_required(struct mlx5dr_icm_pool *pool) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index c7c93131b762..dfa223415fe2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -160,6 +160,11 @@ struct mlx5dr_icm_buddy_mem { * sync_ste command sets them free. */ struct list_head hot_list; + + /* Memory optimisation */ + struct mlx5dr_ste *ste_arr; + struct list_head *miss_list; + u8 *hw_ste_arr; }; int mlx5dr_buddy_init(struct mlx5dr_icm_buddy_mem *buddy, From 0aec12d97b2036af0946e3d582144739860ac07b Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 11 Jan 2022 03:00:03 +0200 Subject: [PATCH 380/773] net/mlx5: DR, Fix slab-out-of-bounds in mlx5_cmd_dr_create_fte When adding a rule with 32 destinations, we hit the following out-of-band access issue: BUG: KASAN: slab-out-of-bounds in mlx5_cmd_dr_create_fte+0x18ee/0x1e70 This patch fixes the issue by both increasing the allocated buffers to accommodate for the needed actions and by checking the number of actions to prevent this issue when a rule with too many actions is provided. Fixes: 1ffd498901c1 ("net/mlx5: DR, Increase supported num of actions to 32") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/fs_dr.c | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index a476da2424f8..3f311462bedf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -233,7 +233,11 @@ static bool contain_vport_reformat_action(struct mlx5_flow_rule *dst) dst->dest_attr.vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID; } -#define MLX5_FLOW_CONTEXT_ACTION_MAX 32 +/* We want to support a rule with 32 destinations, which means we need to + * account for 32 destinations plus usually a counter plus one more action + * for a multi-destination flow table. + */ +#define MLX5_FLOW_CONTEXT_ACTION_MAX 34 static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *ft, struct mlx5_flow_group *group, @@ -403,9 +407,9 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, enum mlx5_flow_destination_type type = dst->dest_attr.type; u32 id; - if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || - num_term_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { - err = -ENOSPC; + if (fs_dr_num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + num_term_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; goto free_actions; } @@ -478,8 +482,9 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, MLX5_FLOW_DESTINATION_TYPE_COUNTER) continue; - if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { - err = -ENOSPC; + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + fs_dr_num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; goto free_actions; } @@ -499,14 +504,28 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, params.match_sz = match_sz; params.match_buf = (u64 *)fte->val; if (num_term_actions == 1) { - if (term_actions->reformat) + if (term_actions->reformat) { + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } actions[num_actions++] = term_actions->reformat; + } + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } actions[num_actions++] = term_actions->dest; } else if (num_term_actions > 1) { bool ignore_flow_level = !!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL); + if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX || + fs_dr_num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } tmp_action = mlx5dr_action_create_mult_dest_tbl(domain, term_actions, num_term_actions, From ffb0753b954763d94f52c901adfe58ed0d4005e6 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 13 Jan 2022 14:52:48 +0200 Subject: [PATCH 381/773] net/mlx5: DR, Don't allow match on IP w/o matching on full ethertype/ip_version Currently SMFS allows adding rule with matching on src/dst IP w/o matching on full ethertype or ip_version, which is not supported by HW. This patch fixes this issue and adds the check as it is done in DMFS. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_matcher.c | 20 +++--------- .../mellanox/mlx5/core/steering/dr_ste.c | 32 ++++++++++++++++++- .../mellanox/mlx5/core/steering/dr_types.h | 10 ++++++ 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index e87cf498c77b..38971fe1dfe1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -13,18 +13,6 @@ static bool dr_mask_is_dmac_set(struct mlx5dr_match_spec *spec) return (spec->dmac_47_16 || spec->dmac_15_0); } -static bool dr_mask_is_src_addr_set(struct mlx5dr_match_spec *spec) -{ - return (spec->src_ip_127_96 || spec->src_ip_95_64 || - spec->src_ip_63_32 || spec->src_ip_31_0); -} - -static bool dr_mask_is_dst_addr_set(struct mlx5dr_match_spec *spec) -{ - return (spec->dst_ip_127_96 || spec->dst_ip_95_64 || - spec->dst_ip_63_32 || spec->dst_ip_31_0); -} - static bool dr_mask_is_l3_base_set(struct mlx5dr_match_spec *spec) { return (spec->ip_protocol || spec->frag || spec->tcp_flags || @@ -503,11 +491,11 @@ static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, &mask, inner, rx); if (outer_ipv == DR_RULE_IPV6) { - if (dr_mask_is_dst_addr_set(&mask.outer)) + if (DR_MASK_IS_DST_IP_SET(&mask.outer)) mlx5dr_ste_build_eth_l3_ipv6_dst(ste_ctx, &sb[idx++], &mask, inner, rx); - if (dr_mask_is_src_addr_set(&mask.outer)) + if (DR_MASK_IS_SRC_IP_SET(&mask.outer)) mlx5dr_ste_build_eth_l3_ipv6_src(ste_ctx, &sb[idx++], &mask, inner, rx); @@ -610,11 +598,11 @@ static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, &mask, inner, rx); if (inner_ipv == DR_RULE_IPV6) { - if (dr_mask_is_dst_addr_set(&mask.inner)) + if (DR_MASK_IS_DST_IP_SET(&mask.inner)) mlx5dr_ste_build_eth_l3_ipv6_dst(ste_ctx, &sb[idx++], &mask, inner, rx); - if (dr_mask_is_src_addr_set(&mask.inner)) + if (DR_MASK_IS_SRC_IP_SET(&mask.inner)) mlx5dr_ste_build_eth_l3_ipv6_src(ste_ctx, &sb[idx++], &mask, inner, rx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 7e61742e58a0..187e29b409b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -602,12 +602,34 @@ int mlx5dr_ste_set_action_decap_l3_list(struct mlx5dr_ste_ctx *ste_ctx, used_hw_action_num); } +static int dr_ste_build_pre_check_spec(struct mlx5dr_domain *dmn, + struct mlx5dr_match_spec *spec) +{ + if (spec->ip_version) { + if (spec->ip_version != 0xf) { + mlx5dr_err(dmn, + "Partial ip_version mask with src/dst IP is not supported\n"); + return -EINVAL; + } + } else if (spec->ethertype != 0xffff && + (DR_MASK_IS_SRC_IP_SET(spec) || DR_MASK_IS_DST_IP_SET(spec))) { + mlx5dr_err(dmn, + "Partial/no ethertype mask with src/dst IP is not supported\n"); + return -EINVAL; + } + + return 0; +} + int mlx5dr_ste_build_pre_check(struct mlx5dr_domain *dmn, u8 match_criteria, struct mlx5dr_match_param *mask, struct mlx5dr_match_param *value) { - if (!value && (match_criteria & DR_MATCHER_CRITERIA_MISC)) { + if (value) + return 0; + + if (match_criteria & DR_MATCHER_CRITERIA_MISC) { if (mask->misc.source_port && mask->misc.source_port != 0xffff) { mlx5dr_err(dmn, "Partial mask source_port is not supported\n"); @@ -621,6 +643,14 @@ int mlx5dr_ste_build_pre_check(struct mlx5dr_domain *dmn, } } + if ((match_criteria & DR_MATCHER_CRITERIA_OUTER) && + dr_ste_build_pre_check_spec(dmn, &mask->outer)) + return -EINVAL; + + if ((match_criteria & DR_MATCHER_CRITERIA_INNER) && + dr_ste_build_pre_check_spec(dmn, &mask->inner)) + return -EINVAL; + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 1b3d484b99be..55fcb751e24a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -798,6 +798,16 @@ struct mlx5dr_match_param { (_misc3)->icmpv4_code || \ (_misc3)->icmpv4_header_data) +#define DR_MASK_IS_SRC_IP_SET(_spec) ((_spec)->src_ip_127_96 || \ + (_spec)->src_ip_95_64 || \ + (_spec)->src_ip_63_32 || \ + (_spec)->src_ip_31_0) + +#define DR_MASK_IS_DST_IP_SET(_spec) ((_spec)->dst_ip_127_96 || \ + (_spec)->dst_ip_95_64 || \ + (_spec)->dst_ip_63_32 || \ + (_spec)->dst_ip_31_0) + struct mlx5dr_esw_caps { u64 drop_icm_address_rx; u64 drop_icm_address_tx; From ecd9c5cd46e013659e2fad433057bad1ba66888e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 29 Dec 2021 22:22:05 +0200 Subject: [PATCH 382/773] net/mlx5: DR, Fix the threshold that defines when pool sync is initiated When deciding whether to start syncing and actually free all the "hot" ICM chunks, we need to consider the type of the ICM chunks that we're dealing with. For instance, the amount of available ICM for MODIFY_ACTION is significantly lower than the usual STE ICM, so the threshold should account for that - otherwise we can deplete MODIFY_ACTION memory just by creating and deleting the same modify header action in a continuous loop. This patch replaces the hard-coded threshold with a dynamic value. Fixes: 1c58651412bb ("net/mlx5: DR, ICM memory pools sync optimization") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_icm_pool.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c index f496b7e9401b..e289cfdbce07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c @@ -4,7 +4,6 @@ #include "dr_types.h" #define DR_ICM_MODIFY_HDR_ALIGN_BASE 64 -#define DR_ICM_SYNC_THRESHOLD_POOL (64 * 1024 * 1024) struct mlx5dr_icm_pool { enum mlx5dr_icm_type icm_type; @@ -324,10 +323,14 @@ dr_icm_chunk_create(struct mlx5dr_icm_pool *pool, static bool dr_icm_pool_is_sync_required(struct mlx5dr_icm_pool *pool) { - if (pool->hot_memory_size > DR_ICM_SYNC_THRESHOLD_POOL) - return true; + int allow_hot_size; - return false; + /* sync when hot memory reaches half of the pool size */ + allow_hot_size = + mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, + pool->icm_type) / 2; + + return pool->hot_memory_size > allow_hot_size; } static int dr_icm_pool_sync_all_buddy_pools(struct mlx5dr_icm_pool *pool) From 7f839965b2d77e1926ad08b23c51d60988f10a99 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 16 Feb 2022 11:01:04 +0200 Subject: [PATCH 383/773] net/mlx5: Update log_max_qp value to be 17 at most Currently, log_max_qp value is dependent on what FW reports as its max capability. In reality, due to a bug, some FWs report a value greater than 17, even though they don't support log_max_qp > 17. This FW issue led the driver to exhaust memory on startup. Thus, log_max_qp value is set to be no more than 17 regardless of what FW reports, as it was before the cited commit. Fixes: f79a609ea6bf ("net/mlx5: Update log_max_qp value to FW max capability") Signed-off-by: Maher Sanalla Reviewed-by: Avihai Horon Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 13f913c13a2d..bba72b220cc3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -526,7 +526,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) /* Check log_max_qp from HCA caps to set in current profile */ if (prof->log_max_qp == LOG_MAX_SUPPORTED_QPS) { - prof->log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp); + prof->log_max_qp = min_t(u8, 17, MLX5_CAP_GEN_MAX(dev, log_max_qp)); } else if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) { mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n", prof->log_max_qp, From 07666c75ad17d7389b18ac0235c8cf41e1504ea8 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sat, 29 Jan 2022 01:39:24 +0200 Subject: [PATCH 384/773] net/mlx5: Fix wrong limitation of metadata match on ecpf Match metadata support check returns false for ecpf device. However, this support does exist for ecpf and therefore this limitation should be removed to allow feature such as stacked devices and internal port offloaded to be supported. Fixes: 92ab1eb392c6 ("net/mlx5: E-Switch, Enable vport metadata matching if firmware supports it") Signed-off-by: Ariel Levkovich Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 9a7b25692505..cfcd72bad9af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2838,10 +2838,6 @@ bool mlx5_esw_vport_match_metadata_supported(const struct mlx5_eswitch *esw) if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, flow_source)) return false; - if (mlx5_core_is_ecpf_esw_manager(esw->dev) || - mlx5_ecpf_vport_exists(esw->dev)) - return false; - return true; } From be7f4b0ab149afd19514929fad824b2117d238c9 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 14 Dec 2021 03:52:53 +0200 Subject: [PATCH 385/773] net/mlx5: Fix tc max supported prio for nic mode Only prio 1 is supported if firmware doesn't support ignore flow level for nic mode. The offending commit removed the check wrongly. Add it back. Fixes: 9a99c8f1253a ("net/mlx5e: E-Switch, Offload all chain 0 priorities when modify header and forward action is not supported") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index df58cba37930..1e8ec4f236b2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -121,6 +121,9 @@ u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains) u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains) { + if (!mlx5_chains_prios_supported(chains)) + return 1; + if (mlx5_chains_ignore_flow_level_supported(chains)) return UINT_MAX; From b645e57debca846f51b3209907546ea857ddd3f5 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 24 Jan 2022 21:25:04 +0200 Subject: [PATCH 386/773] net/mlx5: Fix possible deadlock on rule deletion Add missing call to up_write_ref_node() which releases the semaphore in case the FTE doesn't have destinations, such in drop rule case. Fixes: 465e7baab6d9 ("net/mlx5: Fix deletion of duplicate rules") Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index b628917e38e4..537c82b9aa53 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2074,6 +2074,8 @@ void mlx5_del_flow_rules(struct mlx5_flow_handle *handle) fte->node.del_hw_func = NULL; up_write_ref_node(&fte->node, false); tree_put_node(&fte->node, false); + } else { + up_write_ref_node(&fte->node, false); } kfree(handle); } From 0b89429722353d112f8b8b29ca397e95fa994d27 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 2 Feb 2022 16:07:21 +0200 Subject: [PATCH 387/773] net/mlx5e: Fix wrong return value on ioctl EEPROM query failure The ioctl EEPROM query wrongly returns success on read failures, fix that by returning the appropriate error code. Fixes: bb64143eee8c ("net/mlx5e: Add ethtool support for dump module EEPROM") Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 57d755db1cf5..6e80585d731f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1792,7 +1792,7 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev, if (size_read < 0) { netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n", __func__, size_read); - return 0; + return size_read; } i += size_read; From 7eaf1f37b8817c608c4e959d69986ef459d345cd Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Jan 2022 10:26:19 +0200 Subject: [PATCH 388/773] net/mlx5e: kTLS, Use CHECKSUM_UNNECESSARY for device-offloaded packets For RX TLS device-offloaded packets, the HW spec guarantees checksum validation for the offloaded packets, but does not define whether the CQE.checksum field matches the original packet (ciphertext) or the decrypted one (plaintext). This latitude allows architetctural improvements between generations of chips, resulting in different decisions regarding the value type of CQE.checksum. Hence, for these packets, the device driver should not make use of this CQE field. Here we block CHECKSUM_COMPLETE usage for RX TLS device-offloaded packets, and use CHECKSUM_UNNECESSARY instead. Value of the packet's tcp_hdr.csum is not modified by the HW, and it always matches the original ciphertext. Fixes: 1182f3659357 ("net/mlx5e: kTLS, Add kTLS RX HW offload support") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index ee0a8f5206e3..6530d7bd5045 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1349,7 +1349,8 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, } /* True when explicitly set via priv flag, or XDP prog is loaded */ - if (test_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state)) + if (test_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state) || + get_cqe_tls_offload(cqe)) goto csum_unnecessary; /* CQE csum doesn't cover padding octets in short ethernet From 23216d387c40b090b221ad457c95912fb47eb11e Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 4 Jan 2022 10:38:02 +0200 Subject: [PATCH 389/773] net/mlx5e: TC, Reject rules with drop and modify hdr action This kind of action is not supported by firmware and generates a syndrome. kernel: mlx5_core 0000:08:00.0: mlx5_cmd_check:777:(pid 102063): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x8708c3) Fixes: d7e75a325cb2 ("net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions") Signed-off-by: Roi Dayan Reviewed-by: Maor Dickman Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 2022fa4a9598..34700cf1285e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3204,6 +3204,12 @@ actions_match_supported(struct mlx5e_priv *priv, return false; } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); + return false; + } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && !modify_header_match_supported(priv, &parse_attr->spec, flow_action, actions, ct_flow, ct_clear, extack)) From 3d65492a86d4e6675734646929759138a023d914 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 17 Jan 2022 15:00:30 +0200 Subject: [PATCH 390/773] net/mlx5e: TC, Reject rules with forward and drop actions Such rules are redundant but allowed and passed to the driver. The driver does not support offloading such rules so return an error. Fixes: 03a9d11e6eeb ("net/mlx5e: Add TC drop and mirred/redirect action parsing for SRIOV offloads") Signed-off-by: Roi Dayan Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 34700cf1285e..b27532a9301e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3204,6 +3204,12 @@ actions_match_supported(struct mlx5e_priv *priv, return false; } + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); + return false; + } + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); From fb7e76ea3f3b6238dda2f19a4212052d2caf00aa Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 3 Feb 2022 09:42:19 +0200 Subject: [PATCH 391/773] net/mlx5e: TC, Skip redundant ct clear actions Offload of ct clear action is just resetting the reg_c register. It's done by allocating modify hdr resources which is limited. Doing it multiple times is redundant and wasting modify hdr resources and if resources depleted the driver will fail offloading the rule. Ignore redundant ct clear actions after the first one. Fixes: 806401c20a0f ("net/mlx5e: CT, Fix multiple allocations and memleak of mod acts") Signed-off-by: Roi Dayan Reviewed-by: Ariel Levkovich Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h index 26efa33de56f..10a40487d536 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -16,6 +16,7 @@ struct mlx5e_tc_act_parse_state { unsigned int num_actions; struct mlx5e_tc_flow *flow; struct netlink_ext_ack *extack; + bool ct_clear; bool encap; bool decap; bool mpls_push; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c index 06ec30cdb269..58cc33f1363d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c @@ -27,8 +27,13 @@ tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state, struct mlx5e_priv *priv, struct mlx5_flow_attr *attr) { + bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR; int err; + /* It's redundant to do ct clear more than once. */ + if (clear_action && parse_state->ct_clear) + return 0; + err = mlx5_tc_ct_parse_action(parse_state->ct_priv, attr, &attr->parse_attr->mod_hdr_acts, act, parse_state->extack); @@ -40,6 +45,8 @@ tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state, if (mlx5e_is_eswitch_flow(parse_state->flow)) attr->esw_attr->split_count = attr->esw_attr->out_count; + parse_state->ct_clear = clear_action; + return 0; } From 7fac0529038021919ef56a9c3218d8012f187cbb Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Tue, 1 Feb 2022 11:24:41 +0200 Subject: [PATCH 392/773] net/mlx5e: Add feature check for set fec counters Fec counters support is checked via the PCAM feature_cap_mask, bit 0: PPCNT_counter_group_Phy_statistical_counter_group. Add feature check to avoid faulty behavior. Fixes: 0a1498ebfa55 ("net/mlx5e: Expose FEC counters via ethtool") Signed-off-by: Lama Kayal Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 26e326fe503c..00f1d16db456 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -1254,9 +1254,6 @@ static void fec_set_corrected_bits_total(struct mlx5e_priv *priv, u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); - if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group)) - return; - MLX5_SET(ppcnt_reg, in, local_port, 1); MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); if (mlx5_core_access_reg(mdev, in, sz, ppcnt_phy_statistical, @@ -1272,6 +1269,9 @@ static void fec_set_corrected_bits_total(struct mlx5e_priv *priv, void mlx5e_stats_fec_get(struct mlx5e_priv *priv, struct ethtool_fec_stats *fec_stats) { + if (!MLX5_CAP_PCAM_FEATURE(priv->mdev, ppcnt_statistical_group)) + return; + fec_set_corrected_bits_total(priv, fec_stats); fec_set_block_stats(priv, fec_stats); } From c63741b426e11062631b013c3396f5452bbc0034 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 6 Jan 2022 14:10:18 +0200 Subject: [PATCH 393/773] net/mlx5e: Fix MPLSoUDP encap to use MPLS action information Currently the MPLSoUDP encap builds the MPLS header using encap action information (tunnel id, ttl and tos) instead of the MPLS action information (label, ttl, tc and bos) which is wrong. Fix by storing the MPLS action information during the flow action parse and later using it to create the encap MPLS header. Fixes: f828ca6a2fb6 ("net/mlx5e: Add support for hw encapsulation of MPLS over UDP") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/tc/act/act.h | 1 + .../ethernet/mellanox/mlx5/core/en/tc/act/mirred.c | 6 ++++++ .../net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c | 11 +++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h | 1 + .../net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 3 +++ .../ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 5 ++--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 8 ++++++++ 7 files changed, 32 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h index 10a40487d536..9cc844bd00f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -22,6 +22,7 @@ struct mlx5e_tc_act_parse_state { bool mpls_push; bool ptype_host; const struct ip_tunnel_info *tun_info; + struct mlx5e_mpls_info mpls_info; struct pedit_headers_action hdrs[__PEDIT_CMD_MAX]; int ifindexes[MLX5_MAX_FLOW_FWD_VPORTS]; int if_count; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c index c614fc7fdc9c..2e615e0ba972 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c @@ -177,6 +177,12 @@ parse_mirred_encap(struct mlx5e_tc_act_parse_state *parse_state, return -ENOMEM; parse_state->encap = false; + + if (parse_state->mpls_push) { + memcpy(&parse_attr->mpls_info[esw_attr->out_count], + &parse_state->mpls_info, sizeof(parse_state->mpls_info)); + parse_state->mpls_push = false; + } esw_attr->dests[esw_attr->out_count].flags |= MLX5_ESW_DEST_ENCAP; esw_attr->out_count++; /* attr->dests[].rep is resolved when we handle encap */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c index 784fc4f68b1e..89ca88c78840 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c @@ -22,6 +22,16 @@ tc_act_can_offload_mpls_push(struct mlx5e_tc_act_parse_state *parse_state, return true; } +static void +copy_mpls_info(struct mlx5e_mpls_info *mpls_info, + const struct flow_action_entry *act) +{ + mpls_info->label = act->mpls_push.label; + mpls_info->tc = act->mpls_push.tc; + mpls_info->bos = act->mpls_push.bos; + mpls_info->ttl = act->mpls_push.ttl; +} + static int tc_act_parse_mpls_push(struct mlx5e_tc_act_parse_state *parse_state, const struct flow_action_entry *act, @@ -29,6 +39,7 @@ tc_act_parse_mpls_push(struct mlx5e_tc_act_parse_state *parse_state, struct mlx5_flow_attr *attr) { parse_state->mpls_push = true; + copy_mpls_info(&parse_state->mpls_info, act); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h index f832c26ff2c3..70b40ae384e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h @@ -35,6 +35,7 @@ enum { struct mlx5e_tc_flow_parse_attr { const struct ip_tunnel_info *tun_info[MLX5_MAX_FLOW_FWD_VPORTS]; + struct mlx5e_mpls_info mpls_info[MLX5_MAX_FLOW_FWD_VPORTS]; struct net_device *filter_dev; struct mlx5_flow_spec spec; struct mlx5e_tc_mod_hdr_acts mod_hdr_acts; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 9918ed8c059b..d39d0dae22fc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -750,6 +750,7 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, struct mlx5e_tc_flow_parse_attr *parse_attr; struct mlx5_flow_attr *attr = flow->attr; const struct ip_tunnel_info *tun_info; + const struct mlx5e_mpls_info *mpls_info; unsigned long tbl_time_before = 0; struct mlx5e_encap_entry *e; struct mlx5e_encap_key key; @@ -760,6 +761,7 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, parse_attr = attr->parse_attr; tun_info = parse_attr->tun_info[out_index]; + mpls_info = &parse_attr->mpls_info[out_index]; family = ip_tunnel_info_af(tun_info); key.ip_tun_key = &tun_info->key; key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev); @@ -810,6 +812,7 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, goto out_err_init; } e->tun_info = tun_info; + memcpy(&e->mpls_info, mpls_info, sizeof(*mpls_info)); err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack); if (err) goto out_err_init; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c index 60952b33b568..f40dbfcb6437 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -30,16 +30,15 @@ static int generate_ip_tun_hdr(char buf[], struct mlx5e_encap_entry *r) { const struct ip_tunnel_key *tun_key = &r->tun_info->key; + const struct mlx5e_mpls_info *mpls_info = &r->mpls_info; struct udphdr *udp = (struct udphdr *)(buf); struct mpls_shim_hdr *mpls; - u32 tun_id; - tun_id = be32_to_cpu(tunnel_id_to_key32(tun_key->tun_id)); mpls = (struct mpls_shim_hdr *)(udp + 1); *ip_proto = IPPROTO_UDP; udp->dest = tun_key->tp_dst; - *mpls = mpls_entry_encode(tun_id, tun_key->ttl, tun_key->tos, true); + *mpls = mpls_entry_encode(mpls_info->label, mpls_info->ttl, mpls_info->tc, mpls_info->bos); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index b01dacb6f527..b3f7520dfd08 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -183,6 +183,13 @@ struct mlx5e_decap_entry { struct rcu_head rcu; }; +struct mlx5e_mpls_info { + u32 label; + u8 tc; + u8 bos; + u8 ttl; +}; + struct mlx5e_encap_entry { /* attached neigh hash entry */ struct mlx5e_neigh_hash_entry *nhe; @@ -196,6 +203,7 @@ struct mlx5e_encap_entry { struct list_head route_list; struct mlx5_pkt_reformat *pkt_reformat; const struct ip_tunnel_info *tun_info; + struct mlx5e_mpls_info mpls_info; unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ struct net_device *out_dev; From fdc18e4e4bded2a08638cdcd22dc087a64b9ddad Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 6 Jan 2022 14:46:24 +0200 Subject: [PATCH 394/773] net/mlx5e: MPLSoUDP decap, fix check for unsupported matches Currently offload of rule on bareudp device require tunnel key in order to match on mpls fields and without it the mpls fields are ignored, this is incorrect due to the fact udp tunnel doesn't have key to match on. Fix by returning error in case flow is matching on tunnel key. Fixes: 72046a91d134 ("net/mlx5e: Allow to match on mpls parameters") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c index f40dbfcb6437..c5b1617d556f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -59,37 +59,31 @@ static int parse_tunnel(struct mlx5e_priv *priv, void *headers_v) { struct flow_rule *rule = flow_cls_offload_flow_rule(f); - struct flow_match_enc_keyid enc_keyid; struct flow_match_mpls match; void *misc2_c; void *misc2_v; - misc2_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, - misc_parameters_2); - misc2_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, - misc_parameters_2); - - if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS)) - return 0; - - if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) - return 0; - - flow_rule_match_enc_keyid(rule, &enc_keyid); - - if (!enc_keyid.mask->keyid) - return 0; - if (!MLX5_CAP_ETH(priv->mdev, tunnel_stateless_mpls_over_udp) && !(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & MLX5_FLEX_PROTO_CW_MPLS_UDP)) return -EOPNOTSUPP; + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) + return -EOPNOTSUPP; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS)) + return 0; + flow_rule_match_mpls(rule, &match); /* Only support matching the first LSE */ if (match.mask->used_lses != 1) return -EOPNOTSUPP; + misc2_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + misc2_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2_c, outer_first_mpls_over_udp.mpls_label, match.mask->ls[0].mpls_label); From 5ee02b7a800654ff9549807bcf0b4c9fd5cf25f9 Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 21 Feb 2022 12:26:11 +0200 Subject: [PATCH 395/773] net/mlx5e: Add missing increment of count Add mistakenly missing increment of count variable when looping over output buffer in mlx5e_self_test(). This resolves the issue of garbage values output when querying with self test via ethtool. before: $ ethtool -t eth2 The test result is PASS The test extra info: Link Test 0 Speed Test 1768697188 Health Test 758528120 Loopback Test 3288687 after: $ ethtool -t eth2 The test result is PASS The test extra info: Link Test 0 Speed Test 0 Health Test 0 Loopback Test 0 Fixes: 7990b1b5e8bd ("net/mlx5e: loopback test is not supported in switchdev mode") Signed-off-by: Lama Kayal Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 8c9163d2c646..08a75654f5f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -334,6 +334,7 @@ void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, netdev_info(ndev, "\t[%d] %s start..\n", i, st.name); buf[count] = st.st_func(priv); netdev_info(ndev, "\t[%d] %s end: result(%lld)\n", i, st.name, buf[count]); + count++; } mutex_unlock(&priv->state_lock); From ca49df96f9f5efd4f0f1e64f7c4c0c63a3329cb9 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 21 Feb 2022 17:54:34 +0200 Subject: [PATCH 396/773] net/mlx5e: Fix VF min/max rate parameters interchange mistake The VF min and max rate were passed incorrectly and resulted in wrongly interchanging them. Fix the order of parameters in mlx5_esw_qos_set_vport_rate(). Fixes: d7df09f5e7b4 ("net/mlx5: E-switch, Enable vport QoS on demand") Signed-off-by: Gal Pressman Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 11bbcd5f5b8b..694c54066955 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -697,7 +697,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_eswitch *esw, struct mlx5_vport *vpo } int mlx5_esw_qos_set_vport_rate(struct mlx5_eswitch *esw, struct mlx5_vport *vport, - u32 min_rate, u32 max_rate) + u32 max_rate, u32 min_rate) { int err; From 7414db411919980bc4c9bab11d5d040b5dd7667c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 23 Feb 2022 14:18:31 -0300 Subject: [PATCH 397/773] rtla: Fix systme -> system typo on man page Link: https://lkml.kernel.org/r/YhZsZxqk+IaFxorj@kernel.org Fixes: 496082df01bb08a4 ("rtla: Add rtla osnoise man page") Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/tools/rtla/common_osnoise_description.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/tools/rtla/common_osnoise_description.rst b/Documentation/tools/rtla/common_osnoise_description.rst index 8973c5df888f..d5d61615b967 100644 --- a/Documentation/tools/rtla/common_osnoise_description.rst +++ b/Documentation/tools/rtla/common_osnoise_description.rst @@ -1,7 +1,7 @@ The **rtla osnoise** tool is an interface for the *osnoise* tracer. The *osnoise* tracer dispatches a kernel thread per-cpu. These threads read the time in a loop while with preemption, softirq and IRQs enabled, thus -allowing all the sources of operating systme noise during its execution. +allowing all the sources of operating system noise during its execution. The *osnoise*'s tracer threads take note of the delta between each time read, along with an interference counter of all sources of interference. At the end of each period, the *osnoise* tracer displays a summary of From 7f14c7227f342d9932f9b918893c8814f86d2a0d Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Mon, 21 Feb 2022 13:24:56 +0100 Subject: [PATCH 398/773] USB: gadget: validate endpoint index for xilinx udc Assure that host may not manipulate the index to point past endpoint array. Signed-off-by: Szymon Heidrich Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/udc-xilinx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index 6ce886fb7bfe..2907fad04e2c 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -1615,6 +1615,8 @@ static void xudc_getstatus(struct xusb_udc *udc) break; case USB_RECIP_ENDPOINT: epnum = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK; + if (epnum >= XUSB_MAX_ENDPOINTS) + goto stall; target_ep = &udc->ep[epnum]; epcfgreg = udc->read_fn(udc->addr + target_ep->offset); halt = epcfgreg & XUSB_EP_CFG_STALL_MASK; @@ -1682,6 +1684,10 @@ static void xudc_set_clear_feature(struct xusb_udc *udc) case USB_RECIP_ENDPOINT: if (!udc->setup.wValue) { endpoint = udc->setup.wIndex & USB_ENDPOINT_NUMBER_MASK; + if (endpoint >= XUSB_MAX_ENDPOINTS) { + xudc_ep0_stall(udc); + return; + } target_ep = &udc->ep[endpoint]; outinbit = udc->setup.wIndex & USB_ENDPOINT_DIR_MASK; outinbit = outinbit >> 7; From 84918a89d6efaff075de570b55642b6f4ceeac6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 18 Feb 2022 18:32:45 +0100 Subject: [PATCH 399/773] usb: dwc3: gadget: Let the interrupt handler disable bottom halves. The interrupt service routine registered for the gadget is a primary handler which mask the interrupt source and a threaded handler which handles the source of the interrupt. Since the threaded handler is voluntary threaded, the IRQ-core does not disable bottom halves before invoke the handler like it does for the forced-threaded handler. Due to changes in networking it became visible that a network gadget's completions handler may schedule a softirq which remains unprocessed. The gadget's completion handler is usually invoked either in hard-IRQ or soft-IRQ context. In this context it is enough to just raise the softirq because the softirq itself will be handled once that context is left. In the case of the voluntary threaded handler, there is nothing that will process pending softirqs. Which means it remain queued until another random interrupt (on this CPU) fires and handles it on its exit path or another thread locks and unlocks a lock with the bh suffix. Worst case is that the CPU goes idle and the NOHZ complains about unhandled softirqs. Disable bottom halves before acquiring the lock (and disabling interrupts) and enable them after dropping the lock. This ensures that any pending softirqs will handled right away. Link: https://lkml.kernel.org/r/c2a64979-73d1-2c22-e048-c275c9f81558@samsung.com Fixes: e5f68b4a3e7b0 ("Revert "usb: dwc3: gadget: remove unnecessary _irqsave()"") Cc: stable Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/Yg/YPejVQH3KkRVd@linutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 183b90923f51..a0c883f19a41 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4160,9 +4160,11 @@ static irqreturn_t dwc3_thread_interrupt(int irq, void *_evt) unsigned long flags; irqreturn_t ret = IRQ_NONE; + local_bh_disable(); spin_lock_irqsave(&dwc->lock, flags); ret = dwc3_process_event_buf(evt); spin_unlock_irqrestore(&dwc->lock, flags); + local_bh_enable(); return ret; } From aaaba1c86d04dac8e49bf508b492f81506257da3 Mon Sep 17 00:00:00 2001 From: Daehwan Jung Date: Tue, 22 Feb 2022 14:29:28 +0900 Subject: [PATCH 400/773] usb: gadget: rndis: add spinlock for rndis response list There's no lock for rndis response list. It could cause list corruption if there're two different list_add at the same time like below. It's better to add in rndis_add_response / rndis_free_response / rndis_get_next_response to prevent any race condition on response list. [ 361.894299] [1: irq/191-dwc3:16979] list_add corruption. next->prev should be prev (ffffff80651764d0), but was ffffff883dc36f80. (next=ffffff80651764d0). [ 361.904380] [1: irq/191-dwc3:16979] Call trace: [ 361.904391] [1: irq/191-dwc3:16979] __list_add_valid+0x74/0x90 [ 361.904401] [1: irq/191-dwc3:16979] rndis_msg_parser+0x168/0x8c0 [ 361.904409] [1: irq/191-dwc3:16979] rndis_command_complete+0x24/0x84 [ 361.904417] [1: irq/191-dwc3:16979] usb_gadget_giveback_request+0x20/0xe4 [ 361.904426] [1: irq/191-dwc3:16979] dwc3_gadget_giveback+0x44/0x60 [ 361.904434] [1: irq/191-dwc3:16979] dwc3_ep0_complete_data+0x1e8/0x3a0 [ 361.904442] [1: irq/191-dwc3:16979] dwc3_ep0_interrupt+0x29c/0x3dc [ 361.904450] [1: irq/191-dwc3:16979] dwc3_process_event_entry+0x78/0x6cc [ 361.904457] [1: irq/191-dwc3:16979] dwc3_process_event_buf+0xa0/0x1ec [ 361.904465] [1: irq/191-dwc3:16979] dwc3_thread_interrupt+0x34/0x5c Fixes: f6281af9d62e ("usb: gadget: rndis: use list_for_each_entry_safe") Cc: stable Signed-off-by: Daehwan Jung Link: https://lore.kernel.org/r/1645507768-77687-1-git-send-email-dh10.jung@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/rndis.c | 8 ++++++++ drivers/usb/gadget/function/rndis.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index b7ccf1803656..00b3f6b3bb31 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -922,6 +922,7 @@ struct rndis_params *rndis_register(void (*resp_avail)(void *v), void *v) params->resp_avail = resp_avail; params->v = v; INIT_LIST_HEAD(¶ms->resp_queue); + spin_lock_init(¶ms->resp_lock); pr_debug("%s: configNr = %d\n", __func__, i); return params; @@ -1015,12 +1016,14 @@ void rndis_free_response(struct rndis_params *params, u8 *buf) { rndis_resp_t *r, *n; + spin_lock(¶ms->resp_lock); list_for_each_entry_safe(r, n, ¶ms->resp_queue, list) { if (r->buf == buf) { list_del(&r->list); kfree(r); } } + spin_unlock(¶ms->resp_lock); } EXPORT_SYMBOL_GPL(rndis_free_response); @@ -1030,14 +1033,17 @@ u8 *rndis_get_next_response(struct rndis_params *params, u32 *length) if (!length) return NULL; + spin_lock(¶ms->resp_lock); list_for_each_entry_safe(r, n, ¶ms->resp_queue, list) { if (!r->send) { r->send = 1; *length = r->length; + spin_unlock(¶ms->resp_lock); return r->buf; } } + spin_unlock(¶ms->resp_lock); return NULL; } EXPORT_SYMBOL_GPL(rndis_get_next_response); @@ -1054,7 +1060,9 @@ static rndis_resp_t *rndis_add_response(struct rndis_params *params, u32 length) r->length = length; r->send = 0; + spin_lock(¶ms->resp_lock); list_add_tail(&r->list, ¶ms->resp_queue); + spin_unlock(¶ms->resp_lock); return r; } diff --git a/drivers/usb/gadget/function/rndis.h b/drivers/usb/gadget/function/rndis.h index f6167f7fea82..6206b8b7490f 100644 --- a/drivers/usb/gadget/function/rndis.h +++ b/drivers/usb/gadget/function/rndis.h @@ -174,6 +174,7 @@ typedef struct rndis_params { void (*resp_avail)(void *v); void *v; struct list_head resp_queue; + spinlock_t resp_lock; } rndis_params; /* RNDIS Message parser and other useless functions */ From 68af28426b3ca1bf9ba21c7d8bdd0ff639e5134c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 23 Feb 2022 11:52:37 -0600 Subject: [PATCH 401/773] platform/x86: amd-pmc: Set QOS during suspend on CZN w/ timer wakeup commit 59348401ebed ("platform/x86: amd-pmc: Add special handling for timer based S0i3 wakeup") adds support for using another platform timer in lieu of the RTC which doesn't work properly on some systems. This path was validated and worked well before submission. During the 5.16-rc1 merge window other patches were merged that caused this to stop working properly. When this feature was used with 5.16-rc1 or later some OEM laptops with the matching firmware requirements from that commit would shutdown instead of program a timer based wakeup. This was bisected to commit 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path"). This wasn't supposed to cause any negative impacts and also tested well on both Intel and ARM platforms. However this changed the semantics of when CPUs are allowed to be in the deepest state. For the AMD systems in question it appears this causes a firmware crash for timer based wakeup. It's hypothesized to be caused by the `amd-pmc` driver sending `OS_HINT` and all the CPUs going into a deep state while the timer is still being programmed. It's likely a firmware bug, but to avoid it don't allow setting CPUs into the deepest state while using CZN timer wakeup path. If later it's discovered that this also occurs from "regular" suspends without a timer as well or on other silicon, this may be later expanded to run in the suspend path for more scenarios. Cc: stable@vger.kernel.org # 5.16+ Suggested-by: Rafael J. Wysocki Link: https://lore.kernel.org/linux-acpi/BL1PR12MB51570F5BD05980A0DCA1F3F4E23A9@BL1PR12MB5157.namprd12.prod.outlook.com/T/#mee35f39c41a04b624700ab2621c795367f19c90e Fixes: 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path") Fixes: 23f62d7ab25b ("PM: sleep: Pause cpuidle later and resume it earlier during system transitions") Fixes: 59348401ebed ("platform/x86: amd-pmc: Add special handling for timer based S0i3 wakeup" Reviewed-by: Rafael J. Wysocki Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20220223175237.6209-1-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd-pmc.c | 42 ++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c index 4c72ba68b315..b1103f85a85a 100644 --- a/drivers/platform/x86/amd-pmc.c +++ b/drivers/platform/x86/amd-pmc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +86,9 @@ #define PMC_MSG_DELAY_MIN_US 50 #define RESPONSE_REGISTER_LOOP_MAX 20000 +/* QoS request for letting CPUs in idle states, but not the deepest */ +#define AMD_PMC_MAX_IDLE_STATE_LATENCY 3 + #define SOC_SUBSYSTEM_IP_MAX 12 #define DELAY_MIN_US 2000 #define DELAY_MAX_US 3000 @@ -131,6 +135,7 @@ struct amd_pmc_dev { struct device *dev; struct pci_dev *rdev; struct mutex lock; /* generic mutex lock */ + struct pm_qos_request amd_pmc_pm_qos_req; #if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *dbgfs_dir; #endif /* CONFIG_DEBUG_FS */ @@ -521,6 +526,14 @@ static int amd_pmc_verify_czn_rtc(struct amd_pmc_dev *pdev, u32 *arg) rc = rtc_alarm_irq_enable(rtc_device, 0); dev_dbg(pdev->dev, "wakeup timer programmed for %lld seconds\n", duration); + /* + * Prevent CPUs from getting into deep idle states while sending OS_HINT + * which is otherwise generally safe to send when at least one of the CPUs + * is not in deep idle states. + */ + cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req, AMD_PMC_MAX_IDLE_STATE_LATENCY); + wake_up_all_idle_cpus(); + return rc; } @@ -538,24 +551,31 @@ static int __maybe_unused amd_pmc_suspend(struct device *dev) /* Activate CZN specific RTC functionality */ if (pdev->cpu_id == AMD_CPU_ID_CZN) { rc = amd_pmc_verify_czn_rtc(pdev, &arg); - if (rc < 0) - return rc; + if (rc) + goto fail; } /* Dump the IdleMask before we send hint to SMU */ amd_pmc_idlemask_read(pdev, dev, NULL); msg = amd_pmc_get_os_hint(pdev); rc = amd_pmc_send_cmd(pdev, arg, NULL, msg, 0); - if (rc) + if (rc) { dev_err(pdev->dev, "suspend failed\n"); + goto fail; + } if (enable_stb) rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF); - if (rc) { + if (rc) { dev_err(pdev->dev, "error writing to STB\n"); - return rc; + goto fail; } + return 0; +fail: + if (pdev->cpu_id == AMD_CPU_ID_CZN) + cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req, + PM_QOS_DEFAULT_VALUE); return rc; } @@ -579,12 +599,15 @@ static int __maybe_unused amd_pmc_resume(struct device *dev) /* Write data incremented by 1 to distinguish in stb_read */ if (enable_stb) rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF + 1); - if (rc) { + if (rc) dev_err(pdev->dev, "error writing to STB\n"); - return rc; - } - return 0; + /* Restore the QoS request back to defaults if it was set */ + if (pdev->cpu_id == AMD_CPU_ID_CZN) + cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req, + PM_QOS_DEFAULT_VALUE); + + return rc; } static const struct dev_pm_ops amd_pmc_pm_ops = { @@ -722,6 +745,7 @@ static int amd_pmc_probe(struct platform_device *pdev) amd_pmc_get_smu_version(dev); platform_set_drvdata(pdev, dev); amd_pmc_dbgfs_register(dev); + cpu_latency_qos_add_request(&dev->amd_pmc_pm_qos_req, PM_QOS_DEFAULT_VALUE); return 0; err_pci_dev_put: From 21d90aaee8d5c2a097ef41f1430d97661233ecc6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 24 Feb 2022 11:18:48 +0100 Subject: [PATCH 402/773] surface: surface3_power: Fix battery readings on batteries without a serial number The battery on the 2nd hand Surface 3 which I recently bought appears to not have a serial number programmed in. This results in any I2C reads from the registers containing the serial number failing with an I2C NACK. This was causing mshw0011_bix() to fail causing the battery readings to not work at all. Ignore EREMOTEIO (I2C NACK) errors when retrieving the serial number and continue with an empty serial number to fix this. Fixes: b1f81b496b0d ("platform/x86: surface3_power: MSHW0011 rev-eng implementation") BugLink: https://github.com/linux-surface/linux-surface/issues/608 Reviewed-by: Benjamin Tissoires Reviewed-by: Maximilian Luz Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220224101848.7219-1-hdegoede@redhat.com --- drivers/platform/surface/surface3_power.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/platform/surface/surface3_power.c b/drivers/platform/surface/surface3_power.c index abac3eec565e..444ec81ba02d 100644 --- a/drivers/platform/surface/surface3_power.c +++ b/drivers/platform/surface/surface3_power.c @@ -232,14 +232,21 @@ static int mshw0011_bix(struct mshw0011_data *cdata, struct bix *bix) } bix->last_full_charg_capacity = ret; - /* get serial number */ + /* + * Get serial number, on some devices (with unofficial replacement + * battery?) reading any of the serial number range addresses gets + * nacked in this case just leave the serial number empty. + */ ret = i2c_smbus_read_i2c_block_data(client, MSHW0011_BAT0_REG_SERIAL_NO, sizeof(buf), buf); - if (ret != sizeof(buf)) { + if (ret == -EREMOTEIO) { + /* no serial number available */ + } else if (ret != sizeof(buf)) { dev_err(&client->dev, "Error reading serial no: %d\n", ret); return ret; + } else { + snprintf(bix->serial, ARRAY_SIZE(bix->serial), "%3pE%6pE", buf + 7, buf); } - snprintf(bix->serial, ARRAY_SIZE(bix->serial), "%3pE%6pE", buf + 7, buf); /* get cycle count */ ret = i2c_smbus_read_word_data(client, MSHW0011_BAT0_REG_CYCLE_CNT); From 558c303c9734af5a813739cd284879227f7297d2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 10 Nov 2021 14:48:00 +0000 Subject: [PATCH 403/773] arm64: Mitigate spectre style branch history side channels Speculation attacks against some high-performance processors can make use of branch history to influence future speculation. When taking an exception from user-space, a sequence of branches or a firmware call overwrites or invalidates the branch history. The sequence of branches is added to the vectors, and should appear before the first indirect branch. For systems using KPTI the sequence is added to the kpti trampoline where it has a free register as the exit from the trampoline is via a 'ret'. For systems not using KPTI, the same register tricks are used to free up a register in the vectors. For the firmware call, arch-workaround-3 clobbers 4 registers, so there is no choice but to save them to the EL1 stack. This only happens for entry from EL0, so if we take an exception due to the stack access, it will not become re-entrant. For KVM, the existing branch-predictor-hardening vectors are used. When a spectre version of these vectors is in use, the firmware call is sufficient to mitigate against Spectre-BHB. For the non-spectre versions, the sequence of branches is added to the indirect vector. Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/Kconfig | 9 + arch/arm64/include/asm/assembler.h | 14 +- arch/arm64/include/asm/cpufeature.h | 16 ++ arch/arm64/include/asm/cputype.h | 8 + arch/arm64/include/asm/spectre.h | 4 +- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/asm/vectors.h | 5 + arch/arm64/kernel/cpu_errata.c | 7 + arch/arm64/kernel/image-vars.h | 3 + arch/arm64/kernel/proton-pack.c | 278 ++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/hyp-entry.S | 8 + arch/arm64/tools/cpucaps | 1 + 12 files changed, 352 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cbcd42decb2a..c9631626c0b3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1382,6 +1382,15 @@ config UNMAP_KERNEL_AT_EL0 If unsure, say Y. +config MITIGATE_SPECTRE_BRANCH_HISTORY + bool "Mitigate Spectre style attacks against branch history" if EXPERT + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. + When taking an exception from user-space, a sequence of branches + or a firmware call overwrites the branch history. + config RODATA_FULL_DEFAULT_ENABLED bool "Apply r/o permissions of VM areas also to their linear aliases" default y diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 046c38ee2841..5cc97130843f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -852,7 +852,9 @@ alternative_endif .macro __mitigate_spectre_bhb_loop tmp #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY - mov \tmp, #32 +alternative_cb spectre_bhb_patch_loop_iter + mov \tmp, #32 // Patched to correct the immediate +alternative_cb_end .Lspectre_bhb_loop\@: b . + 4 subs \tmp, \tmp, #1 @@ -861,6 +863,16 @@ alternative_endif #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ .endm + .macro mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +alternative_cb spectre_bhb_patch_loop_mitigation_enable + b .L_spectre_bhb_loop_done\@ // Patched to NOP +alternative_cb_end + __mitigate_spectre_bhb_loop \tmp +.L_spectre_bhb_loop_done\@: +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + /* Save/restores x0-x3 to the stack */ .macro __mitigate_spectre_bhb_fw #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ef6be92b1921..b158fd447f3a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -637,6 +637,22 @@ static inline bool cpu_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } + +static inline bool supports_csv2p3(int scope) +{ + u64 pfr0; + u8 csv2_val; + + if (scope == SCOPE_LOCAL_CPU) + pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); + else + pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + csv2_val = cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV2_SHIFT); + return csv2_val == 3; +} + const struct cpumask *system_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 999b9149f856..bfbf0c4c7c5e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -73,10 +73,14 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_NEOVERSE_V1 0xD40 +#define ARM_CPU_PART_CORTEX_A78 0xD41 +#define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 +#define ARM_CPU_PART_CORTEX_A78C 0xD4B #define APM_CPU_PART_POTENZA 0x000 @@ -117,10 +121,14 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) +#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) +#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) +#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index c617fe90a843..86e0cc9b9c68 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -94,6 +94,8 @@ void spectre_v4_enable_task_mitigation(struct task_struct *tsk); enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); - +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +u8 spectre_bhb_loop_affected(int scope); +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 898bee0004ae..238b9f6a28fc 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -904,6 +904,7 @@ #endif /* id_aa64mmfr1 */ +#define ID_AA64MMFR1_ECBHB_SHIFT 60 #define ID_AA64MMFR1_AFP_SHIFT 44 #define ID_AA64MMFR1_ETS_SHIFT 36 #define ID_AA64MMFR1_TWED_SHIFT 32 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index 3f76dfd9e074..1f65c37dc653 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -40,6 +40,11 @@ enum arm64_bp_harden_el1_vectors { EL1_VECTOR_KPTI, }; +#ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +#define EL1_VECTOR_BHB_LOOP -1 +#define EL1_VECTOR_BHB_FW -1 +#endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + /* The vectors to use on return from EL0. e.g. to remap the kernel */ DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b217941713a8..a401180e8d66 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -502,6 +502,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .matches = has_spectre_v4, .cpu_enable = spectre_v4_enable_mitigation, }, + { + .desc = "Spectre-BHB", + .capability = ARM64_SPECTRE_BHB, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = is_spectre_bhb_affected, + .cpu_enable = spectre_bhb_enable_mitigation, + }, #ifdef CONFIG_ARM64_ERRATUM_1418040 { .desc = "ARM erratum 1418040", diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 7eaf1f7c4168..56db964a4f0c 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -66,6 +66,9 @@ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); KVM_NVHE_ALIAS(kvm_get_kimage_voffset); KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); +KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); +KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 8a499b8373c0..fbbcd1d2eb19 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -24,9 +24,11 @@ #include #include +#include #include #include #include +#include #include /* @@ -796,6 +798,17 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) } } +/* + * Spectre BHB. + * + * A CPU is either: + * - Mitigated by a branchy loop a CPU specific number of times, and listed + * in our "loop mitigated list". + * - Mitigated in software by the firmware Spectre v2 call. + * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no + * software mitigation in the vectors is needed. + * - Has CSV2.3, so is unaffected. + */ static enum mitigation_state spectre_bhb_state; enum mitigation_state arm64_get_spectre_bhb_state(void) @@ -803,12 +816,227 @@ enum mitigation_state arm64_get_spectre_bhb_state(void) return spectre_bhb_state; } +enum bhb_mitigation_bits { + BHB_LOOP, + BHB_FW, + BHB_HW, +}; +static unsigned long system_bhb_mitigations; + +/* + * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any + * SCOPE_SYSTEM call will give the right answer. + */ +u8 spectre_bhb_loop_affected(int scope) +{ + u8 k = 0; + static u8 max_bhb_k; + + if (scope == SCOPE_LOCAL_CPU) { + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k8_list)) + k = 8; + + max_bhb_k = max(max_bhb_k, k); + } else { + k = max_bhb_k; + } + + return k; +} + +static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_3, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +static bool is_spectre_bhb_fw_affected(int scope) +{ + static bool system_affected; + enum mitigation_state fw_state; + bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; + static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + {}, + }; + bool cpu_in_list = is_midr_in_range_list(read_cpuid_id(), + spectre_bhb_firmware_mitigated_list); + + if (scope != SCOPE_LOCAL_CPU) + return system_affected; + + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { + system_affected = true; + return true; + } + + return false; +} + +static bool supports_ecbhb(int scope) +{ + u64 mmfr1; + + if (scope == SCOPE_LOCAL_CPU) + mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1); + else + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_ECBHB_SHIFT); +} + +bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, + int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (supports_csv2p3(scope)) + return false; + + if (spectre_bhb_loop_affected(scope)) + return true; + + if (is_spectre_bhb_fw_affected(scope)) + return true; + + return false; +} + +static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) +{ + const char *v = arm64_get_bp_hardening_vector(slot); + + if (slot < 0) + return; + + __this_cpu_write(this_cpu_vector, v); + + /* + * When KPTI is in use, the vectors are switched when exiting to + * user-space. + */ + if (arm64_kernel_unmapped_at_el0()) + return; + + write_sysreg(v, vbar_el1); + isb(); +} + +void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) +{ + bp_hardening_cb_t cpu_cb; + enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); + + if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) + return; + + if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { + /* No point mitigating Spectre-BHB alone. */ + } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { + pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); + } else if (cpu_mitigations_off()) { + pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { + state = SPECTRE_MITIGATED; + set_bit(BHB_HW, &system_bhb_mitigations); + } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have the + * branchy-loop added. A57/A72-r0 will already have selected + * the spectre-indirect vector, which is sufficient for BHB + * too. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); + state = SPECTRE_MITIGATED; + set_bit(BHB_LOOP, &system_bhb_mitigations); + } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { + fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); + if (fw_state == SPECTRE_MITIGATED) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. + */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); + } + } + + update_mitigation_state(&spectre_bhb_state, state); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst) { BUG_ON(nr_inst != 1); + + if (test_bit(BHB_LOOP, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); } /* Patched to NOP when enabled */ @@ -817,4 +1045,54 @@ void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, __le32 *updptr, int nr_inst) { BUG_ON(nr_inst != 1); + + if (test_bit(BHB_FW, &system_bhb_mitigations)) + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* Patched to correct the immediate */ +void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr++ = cpu_to_le32(insn); +} + +/* Patched to mov WA3 when supported */ +void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + u8 rd; + u32 insn; + + BUG_ON(nr_inst != 1); /* MOV -> MOV */ + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + !test_bit(BHB_FW, &system_bhb_mitigations)) + return; + + insn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); + + insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR, + AARCH64_INSN_VARIANT_32BIT, + AARCH64_INSN_REG_ZR, rd, + ARM_SMCCC_ARCH_WORKAROUND_3); + if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT)) + return; + + *updptr++ = cpu_to_le32(insn); } diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index b6b6801d96d5..12aa49324955 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -62,6 +62,10 @@ el1_sync: // Guest trapped into EL2 /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \ ARM_SMCCC_ARCH_WORKAROUND_2) + cbz w1, wa_epilogue + + eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_2 ^ \ + ARM_SMCCC_ARCH_WORKAROUND_3) cbnz w1, el1_trap wa_epilogue: @@ -192,7 +196,10 @@ SYM_CODE_END(__kvm_hyp_vector) sub sp, sp, #(8 * 4) stp x2, x3, [sp, #(8 * 0)] stp x0, x1, [sp, #(8 * 2)] + alternative_cb spectre_bhb_patch_wa3 + /* Patched to mov WA3 when supported */ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 + alternative_cb_end smc #0 ldp x2, x3, [sp, #(8 * 0)] add sp, sp, #(8 * 2) @@ -205,6 +212,7 @@ SYM_CODE_END(__kvm_hyp_vector) spectrev2_smccc_wa1_smc .else stp x0, x1, [sp, #-16]! + mitigate_spectre_bhb_loop x0 .endif .if \indirect != 0 alternative_cb kvm_patch_vector_branch diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 9c65b1e25a96..cea7533cb304 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -44,6 +44,7 @@ MTE_ASYMM SPECTRE_V2 SPECTRE_V3A SPECTRE_V4 +SPECTRE_BHB SSBS SVE UNMAP_KERNEL_AT_EL0 From a5905d6af492ee6a4a2205f0d550b3f931b03d03 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 10 Dec 2021 11:16:18 +0000 Subject: [PATCH 404/773] KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated KVM allows the guest to discover whether the ARCH_WORKAROUND SMCCC are implemented, and to preserve that state during migration through its firmware register interface. Add the necessary boiler plate for SMCCC_ARCH_WORKAROUND_3. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/uapi/asm/kvm.h | 5 +++++ arch/arm64/kvm/hypercalls.c | 12 ++++++++++++ arch/arm64/kvm/psci.c | 18 +++++++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index b3edde68bc3e..323e251ed37b 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -281,6 +281,11 @@ struct kvm_arm_copy_mte_tags { #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 KVM_REG_ARM_FW_REG(3) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED 2 + /* SVE registers */ #define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 30da78f72b3b..202b8c455724 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -107,6 +107,18 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; } break; + case ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + break; + case SPECTRE_MITIGATED: + val[0] = SMCCC_RET_SUCCESS; + break; + case SPECTRE_UNAFFECTED: + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + break; + } + break; case ARM_SMCCC_HV_PV_TIME_FEATURES: val[0] = SMCCC_RET_SUCCESS; break; diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 3eae32876897..14b9726041ff 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -406,7 +406,7 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) { - return 3; /* PSCI version and two workaround registers */ + return 4; /* PSCI version and three workaround registers */ } int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) @@ -420,6 +420,9 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++)) return -EFAULT; + if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3, uindices++)) + return -EFAULT; + return 0; } @@ -459,6 +462,17 @@ static int get_kernel_wa_level(u64 regid) case SPECTRE_VULNERABLE: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; } + break; + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: + switch (arm64_get_spectre_bhb_state()) { + case SPECTRE_VULNERABLE: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; + case SPECTRE_MITIGATED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL; + case SPECTRE_UNAFFECTED: + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED; + } + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL; } return -EINVAL; @@ -475,6 +489,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) break; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; default: @@ -520,6 +535,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; From 228a26b912287934789023b4132ba76065d9491c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 10 Dec 2021 14:32:56 +0000 Subject: [PATCH 405/773] arm64: Use the clearbhb instruction in mitigations Future CPUs may implement a clearbhb instruction that is sufficient to mitigate SpectreBHB. CPUs that implement this instruction, but not CSV2.3 must be affected by Spectre-BHB. Add support to use this instruction as the BHB mitigation on CPUs that support it. The instruction is in the hint space, so it will be treated by a NOP as older CPUs. Reviewed-by: Russell King (Oracle) Reviewed-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++++++ arch/arm64/include/asm/cpufeature.h | 13 +++++++++++++ arch/arm64/include/asm/insn.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/asm/vectors.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 8 ++++++++ arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kernel/proton-pack.c | 29 +++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/hyp-entry.S | 1 + 10 files changed, 79 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 5cc97130843f..6ebdc0f834a7 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -108,6 +108,13 @@ hint #20 .endm +/* + * Clear Branch History instruction + */ + .macro clearbhb + hint #22 + .endm + /* * Speculation barrier */ @@ -884,6 +891,16 @@ alternative_cb smccc_patch_fw_mitigation_conduit alternative_cb_end ldp x2, x3, [sp], #16 ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + .macro mitigate_spectre_bhb_clear_insn +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY +alternative_cb spectre_bhb_patch_clearbhb + /* Patched to NOP when not supported */ + clearbhb + isb +alternative_cb_end #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b158fd447f3a..a77b5f49b3a6 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -653,6 +653,19 @@ static inline bool supports_csv2p3(int scope) return csv2_val == 3; } +static inline bool supports_clearbhb(int scope) +{ + u64 isar2; + + if (scope == SCOPE_LOCAL_CPU) + isar2 = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); + else + isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); + + return cpuid_feature_extract_unsigned_field(isar2, + ID_AA64ISAR2_CLEARBHB_SHIFT); +} + const struct cpumask *system_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 6b776c8667b2..b02f0c328c8e 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -65,6 +65,7 @@ enum aarch64_insn_hint_cr_op { AARCH64_INSN_HINT_PSB = 0x11 << 5, AARCH64_INSN_HINT_TSB = 0x12 << 5, AARCH64_INSN_HINT_CSDB = 0x14 << 5, + AARCH64_INSN_HINT_CLEARBHB = 0x16 << 5, AARCH64_INSN_HINT_BTI = 0x20 << 5, AARCH64_INSN_HINT_BTIC = 0x22 << 5, diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 238b9f6a28fc..932d45b17877 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -773,6 +773,7 @@ #define ID_AA64ISAR1_GPI_IMP_DEF 0x1 /* id_aa64isar2 */ +#define ID_AA64ISAR2_CLEARBHB_SHIFT 28 #define ID_AA64ISAR2_RPRES_SHIFT 4 #define ID_AA64ISAR2_WFXT_SHIFT 0 diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h index 1f65c37dc653..f64613a96d53 100644 --- a/arch/arm64/include/asm/vectors.h +++ b/arch/arm64/include/asm/vectors.h @@ -32,6 +32,12 @@ enum arm64_bp_harden_el1_vectors { * canonical vectors. */ EL1_VECTOR_BHB_FW, + + /* + * Use the ClearBHB instruction, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_CLEAR_INSN, #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ /* @@ -43,6 +49,7 @@ enum arm64_bp_harden_el1_vectors { #ifndef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY #define EL1_VECTOR_BHB_LOOP -1 #define EL1_VECTOR_BHB_FW -1 +#define EL1_VECTOR_BHB_CLEAR_INSN -1 #endif /* !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ /* The vectors to use on return from EL0. e.g. to remap the kernel */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 45fed4974c44..d33687673f6b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -231,6 +231,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_CLEARBHB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0), ARM64_FTR_END, }; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a62fee121138..4a3a653df07e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -657,6 +657,7 @@ alternative_else_nop_endif #define BHB_MITIGATION_NONE 0 #define BHB_MITIGATION_LOOP 1 #define BHB_MITIGATION_FW 2 +#define BHB_MITIGATION_INSN 3 .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 @@ -673,6 +674,11 @@ alternative_else_nop_endif __mitigate_spectre_bhb_loop x30 .endif // \bhb == BHB_MITIGATION_LOOP + .if \bhb == BHB_MITIGATION_INSN + clearbhb + isb + .endif // \bhb == BHB_MITIGATION_INSN + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -749,6 +755,7 @@ SYM_CODE_START_NOALIGN(tramp_vectors) #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_INSN #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) @@ -811,6 +818,7 @@ SYM_CODE_START(__bp_harden_el1_vectors) #ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY generate_el1_vector bhb=BHB_MITIGATION_LOOP generate_el1_vector bhb=BHB_MITIGATION_FW + generate_el1_vector bhb=BHB_MITIGATION_INSN #endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ SYM_CODE_END(__bp_harden_el1_vectors) .popsection diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 56db964a4f0c..55a1ced8eb77 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -69,6 +69,7 @@ KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); KVM_NVHE_ALIAS(spectre_bhb_patch_loop_iter); KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); +KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index fbbcd1d2eb19..d3fbff00993d 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -805,6 +805,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) * - Mitigated by a branchy loop a CPU specific number of times, and listed * in our "loop mitigated list". * - Mitigated in software by the firmware Spectre v2 call. + * - Has the ClearBHB instruction to perform the mitigation. * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no * software mitigation in the vectors is needed. * - Has CSV2.3, so is unaffected. @@ -820,6 +821,7 @@ enum bhb_mitigation_bits { BHB_LOOP, BHB_FW, BHB_HW, + BHB_INSN, }; static unsigned long system_bhb_mitigations; @@ -937,6 +939,9 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; + if (supports_clearbhb(scope)) + return true; + if (spectre_bhb_loop_affected(scope)) return true; @@ -984,6 +989,17 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { state = SPECTRE_MITIGATED; set_bit(BHB_HW, &system_bhb_mitigations); + } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) { + /* + * Ensure KVM uses the indirect vector which will have ClearBHB + * added. + */ + if (!data->slot) + data->slot = HYP_VECTOR_INDIRECT; + + this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); + state = SPECTRE_MITIGATED; + set_bit(BHB_INSN, &system_bhb_mitigations); } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { /* * Ensure KVM uses the indirect vector which will have the @@ -1096,3 +1112,16 @@ void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt, *updptr++ = cpu_to_le32(insn); } + +/* Patched to NOP when not supported */ +void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 2); + + if (test_bit(BHB_INSN, &system_bhb_mitigations)) + return; + + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); + *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); +} diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 12aa49324955..7839d075729b 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -213,6 +213,7 @@ SYM_CODE_END(__kvm_hyp_vector) .else stp x0, x1, [sp, #-16]! mitigate_spectre_bhb_loop x0 + mitigate_spectre_bhb_clear_insn .endif .if \indirect != 0 alternative_cb kvm_patch_vector_branch From 558732df2122092259ab4ef85594bee11dbb9104 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 13 Feb 2022 15:42:33 +0800 Subject: [PATCH 406/773] btrfs: reduce extent threshold for autodefrag There is a big gap between inode_should_defrag() and autodefrag extent size threshold. For inode_should_defrag() it has a flexible @small_write value. For compressed extent is 16K, and for non-compressed extent it's 64K. However for autodefrag extent size threshold, it's always fixed to the default value (256K). This means, the following write sequence will trigger autodefrag to defrag ranges which didn't trigger autodefrag: pwrite 0 8k sync pwrite 8k 128K sync The latter 128K write will also be considered as a defrag target (if other conditions are met). While only that 8K write is really triggering autodefrag. Such behavior can cause extra IO for autodefrag. Close the gap, by copying the @small_write value into inode_defrag, so that later autodefrag can use the same @small_write value which triggered autodefrag. With the existing transid value, this allows autodefrag really to scan the ranges which triggered autodefrag. Although this behavior change is mostly reducing the extent_thresh value for autodefrag, I believe in the future we should allow users to specify the autodefrag extent threshold through mount options, but that's an other problem to consider in the future. CC: stable@vger.kernel.org # 5.16+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 15 ++++++++++++++- fs/btrfs/inode.c | 4 ++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8992e0096163..947f04789389 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3291,7 +3291,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36a81368ed46..a0179cc62913 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -49,6 +49,15 @@ struct inode_defrag { /* root objectid */ u64 root; + + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, + * thus needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; }; static int __compare_inode_defrag(struct inode_defrag *defrag1, @@ -101,6 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); return -EEXIST; } } @@ -126,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) * enabled */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -152,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { @@ -275,6 +287,7 @@ again: memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; + range.extent_thresh = defrag->extent_thresh; sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b2403b6127f..76e530f76e3c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -560,12 +560,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, } static inline void inode_should_defrag(struct btrfs_inode *inode, - u64 start, u64 end, u64 num_bytes, u64 small_write) + u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); + btrfs_add_inode_defrag(NULL, inode, small_write); } /* From 0f4558ae91870692ce7f509c31c9d6ee721d8cdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Tue, 22 Feb 2022 01:18:16 +0100 Subject: [PATCH 407/773] Revert "xen-netback: remove 'hotplug-status' once it has served its purpose" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1f2565780e9b7218cf92c7630130e82dcc0fe9c2. The 'hotplug-status' node should not be removed as long as the vif device remains configured. Otherwise the xen-netback would wait for re-running the network script even if it was already called (in case of the frontent re-connecting). But also, it _should_ be removed when the vif device is destroyed (for example when unbinding the driver) - otherwise hotplug script would not configure the device whenever it re-appear. Moving removal of the 'hotplug-status' node was a workaround for nothing calling network script after xen-netback module is reloaded. But when vif interface is re-created (on xen-netback unbind/bind for example), the script should be called, regardless of who does that - currently this case is not handled by the toolstack, and requires manual script call. Keeping hotplug-status=connected to skip the call is wrong and leads to not configured interface. More discussion at https://lore.kernel.org/xen-devel/afedd7cb-a291-e773-8b0d-4db9b291fa98@ipxe.org/T/#u Signed-off-by: Marek Marczykowski-Górecki Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20220222001817.2264967-1-marmarek@invisiblethingslab.com Signed-off-by: Jakub Kicinski --- drivers/net/xen-netback/xenbus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index d24b7a7993aa..3fad58d22155 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -256,6 +256,7 @@ static void backend_disconnect(struct backend_info *be) unsigned int queue_index; xen_unregister_watchers(vif); + xenbus_rm(XBT_NIL, be->dev->nodename, "hotplug-status"); #ifdef CONFIG_DEBUG_FS xenvif_debugfs_delif(vif); #endif /* CONFIG_DEBUG_FS */ @@ -675,7 +676,6 @@ static void hotplug_status_changed(struct xenbus_watch *watch, /* Not interested in this watch anymore. */ unregister_hotplug_status_watch(be); - xenbus_rm(XBT_NIL, be->dev->nodename, "hotplug-status"); } kfree(str); } From e8240addd0a3919e0fd7436416afe9aa6429c484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Tue, 22 Feb 2022 01:18:17 +0100 Subject: [PATCH 408/773] Revert "xen-netback: Check for hotplug-status existence before watching" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2afeec08ab5c86ae21952151f726bfe184f6b23d. The reasoning in the commit was wrong - the code expected to setup the watch even if 'hotplug-status' didn't exist. In fact, it relied on the watch being fired the first time - to check if maybe 'hotplug-status' is already set to 'connected'. Not registering a watch for non-existing path (which is the case if hotplug script hasn't been executed yet), made the backend not waiting for the hotplug script to execute. This in turns, made the netfront think the interface is fully operational, while in fact it was not (the vif interface on xen-netback side might not be configured yet). This was a workaround for 'hotplug-status' erroneously being removed. But since that is reverted now, the workaround is not necessary either. More discussion at https://lore.kernel.org/xen-devel/afedd7cb-a291-e773-8b0d-4db9b291fa98@ipxe.org/T/#u Signed-off-by: Marek Marczykowski-Górecki Reviewed-by: Paul Durrant Reviewed-by: Michael Brown Link: https://lore.kernel.org/r/20220222001817.2264967-2-marmarek@invisiblethingslab.com Signed-off-by: Jakub Kicinski --- drivers/net/xen-netback/xenbus.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 3fad58d22155..990360d75cb6 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -824,15 +824,11 @@ static void connect(struct backend_info *be) xenvif_carrier_on(be->vif); unregister_hotplug_status_watch(be); - if (xenbus_exists(XBT_NIL, dev->nodename, "hotplug-status")) { - err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, - NULL, hotplug_status_changed, - "%s/%s", dev->nodename, - "hotplug-status"); - if (err) - goto err; + err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, NULL, + hotplug_status_changed, + "%s/%s", dev->nodename, "hotplug-status"); + if (!err) be->have_hotplug_status_watch = 1; - } netif_tx_wake_all_queues(be->vif->dev); From e13ad1443684f7afaff24cf207e85e97885256bd Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Wed, 23 Feb 2022 00:57:20 -0800 Subject: [PATCH 409/773] bnx2x: fix driver load from initrd Commit b7a49f73059f ("bnx2x: Utilize firmware 7.13.21.0") added new firmware support in the driver with maintaining older firmware compatibility. However, older firmware was not added in MODULE_FIRMWARE() which caused missing firmware files in initrd image leading to driver load failure from initrd. This patch adds MODULE_FIRMWARE() for older firmware version to have firmware files included in initrd. Fixes: b7a49f73059f ("bnx2x: Utilize firmware 7.13.21.0") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215627 Signed-off-by: Manish Chopra Signed-off-by: Alok Prasad Signed-off-by: Ariel Elior Link: https://lore.kernel.org/r/20220223085720.12021-1-manishc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 774c1f1a57c3..eedb48d945ed 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -100,6 +100,9 @@ MODULE_LICENSE("GPL"); MODULE_FIRMWARE(FW_FILE_NAME_E1); MODULE_FIRMWARE(FW_FILE_NAME_E1H); MODULE_FIRMWARE(FW_FILE_NAME_E2); +MODULE_FIRMWARE(FW_FILE_NAME_E1_V15); +MODULE_FIRMWARE(FW_FILE_NAME_E1H_V15); +MODULE_FIRMWARE(FW_FILE_NAME_E2_V15); int bnx2x_num_queues; module_param_named(num_queues, bnx2x_num_queues, int, 0444); From 7ff57e98fb78ad94edafbdc7435f2d745e9e6bb5 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 23 Feb 2022 11:02:52 +0100 Subject: [PATCH 410/773] net/smc: Use a mutex for locking "struct smc_pnettable" smc_pnetid_by_table_ib() uses read_lock() and then it calls smc_pnet_apply_ib() which, in turn, calls mutex_lock(&smc_ib_devices.mutex). read_lock() disables preemption. Therefore, the code acquires a mutex while in atomic context and it leads to a SAC bug. Fix this bug by replacing the rwlock with a mutex. Reported-and-tested-by: syzbot+4f322a6d84e991c38775@syzkaller.appspotmail.com Fixes: 64e28b52c7a6 ("net/smc: add pnet table namespace support") Confirmed-by: Tony Lu Signed-off-by: Fabio M. De Francesco Acked-by: Karsten Graul Link: https://lore.kernel.org/r/20220223100252.22562-1-fmdefrancesco@gmail.com Signed-off-by: Jakub Kicinski --- net/smc/smc_pnet.c | 42 +++++++++++++++++++++--------------------- net/smc/smc_pnet.h | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 0599246c0376..29f0a559d884 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -113,7 +113,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pnettable = &sn->pnettable; /* remove table entry */ - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || @@ -131,7 +131,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) rc = 0; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) @@ -192,7 +192,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { @@ -206,7 +206,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev) break; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -224,7 +224,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { dev_put_track(pnetelem->ndev, &pnetelem->dev_tracker); @@ -237,7 +237,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) break; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -370,7 +370,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); rc = -EEXIST; new_netdev = true; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { @@ -385,9 +385,9 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); } else { - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } @@ -448,7 +448,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, new_pe->ib_port = ib_port; new_ibdev = true; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { @@ -458,9 +458,9 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); } else { - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 0 : -EEXIST; @@ -605,7 +605,7 @@ static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, pnettable = &sn->pnettable; /* dump pnettable entries */ - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; @@ -620,7 +620,7 @@ static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return idx; } @@ -864,7 +864,7 @@ int smc_pnet_net_init(struct net *net) struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); - rwlock_init(&pnettable->lock); + mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); @@ -944,7 +944,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ @@ -953,7 +953,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -1156,7 +1156,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && @@ -1166,7 +1166,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } @@ -1185,7 +1185,7 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { @@ -1194,7 +1194,7 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 14039272f7e4..80a88eea4949 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -29,7 +29,7 @@ struct smc_link_group; * @pnetlist: List of PNETIDs */ struct smc_pnettable { - rwlock_t lock; + struct mutex lock; struct list_head pnetlist; }; From 6c0d8833a605e195ae219b5042577ce52bf71fff Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Wed, 23 Feb 2022 14:19:56 +0100 Subject: [PATCH 411/773] ipv6: prevent a possible race condition with lifetimes valid_lft, prefered_lft and tstamp are always accessed under the lock "lock" in other places. Reading these without taking the lock may result in inconsistencies regarding the calculation of the valid and preferred variables since decisions are taken on these fields for those variables. Signed-off-by: Niels Dossche Reviewed-by: David Ahern Signed-off-by: Niels Dossche Link: https://lore.kernel.org/r/20220223131954.6570-1-niels.dossche@ugent.be Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3f23da8c0b10..6c8ab3e6e6fe 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4998,6 +4998,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto error; + spin_lock_bh(&ifa->lock); if (!((ifa->flags&IFA_F_PERMANENT) && (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; @@ -5019,6 +5020,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } + spin_unlock_bh(&ifa->lock); if (!ipv6_addr_any(&ifa->peer_addr)) { if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 || From d9b5ae5c1b241b91480aa30408be12fe91af834a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 23 Feb 2022 18:34:16 +0200 Subject: [PATCH 412/773] openvswitch: Fix setting ipv6 fields causing hw csum failure Ipv6 ttl, label and tos fields are modified without first pulling/pushing the ipv6 header, which would have updated the hw csum (if available). This might cause csum validation when sending the packet to the stack, as can be seen in the trace below. Fix this by updating skb->csum if available. Trace resulted by ipv6 ttl dec and then sending packet to conntrack [actions: set(ipv6(hlimit=63)),ct(zone=99)]: [295241.900063] s_pf0vf2: hw csum failure [295241.923191] Call Trace: [295241.925728] [295241.927836] dump_stack+0x5c/0x80 [295241.931240] __skb_checksum_complete+0xac/0xc0 [295241.935778] nf_conntrack_tcp_packet+0x398/0xba0 [nf_conntrack] [295241.953030] nf_conntrack_in+0x498/0x5e0 [nf_conntrack] [295241.958344] __ovs_ct_lookup+0xac/0x860 [openvswitch] [295241.968532] ovs_ct_execute+0x4a7/0x7c0 [openvswitch] [295241.979167] do_execute_actions+0x54a/0xaa0 [openvswitch] [295242.001482] ovs_execute_actions+0x48/0x100 [openvswitch] [295242.006966] ovs_dp_process_packet+0x96/0x1d0 [openvswitch] [295242.012626] ovs_vport_receive+0x6c/0xc0 [openvswitch] [295242.028763] netdev_frame_hook+0xc0/0x180 [openvswitch] [295242.034074] __netif_receive_skb_core+0x2ca/0xcb0 [295242.047498] netif_receive_skb_internal+0x3e/0xc0 [295242.052291] napi_gro_receive+0xba/0xe0 [295242.056231] mlx5e_handle_rx_cqe_mpwrq_rep+0x12b/0x250 [mlx5_core] [295242.062513] mlx5e_poll_rx_cq+0xa0f/0xa30 [mlx5_core] [295242.067669] mlx5e_napi_poll+0xe1/0x6b0 [mlx5_core] [295242.077958] net_rx_action+0x149/0x3b0 [295242.086762] __do_softirq+0xd7/0x2d6 [295242.090427] irq_exit+0xf7/0x100 [295242.093748] do_IRQ+0x7f/0xd0 [295242.096806] common_interrupt+0xf/0xf [295242.100559] [295242.102750] RIP: 0033:0x7f9022e88cbd [295242.125246] RSP: 002b:00007f9022282b20 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda [295242.132900] RAX: 0000000000000005 RBX: 0000000000000010 RCX: 0000000000000000 [295242.140120] RDX: 00007f9022282ba8 RSI: 00007f9022282a30 RDI: 00007f9014005c30 [295242.147337] RBP: 00007f9014014d60 R08: 0000000000000020 R09: 00007f90254a8340 [295242.154557] R10: 00007f9022282a28 R11: 0000000000000246 R12: 0000000000000000 [295242.161775] R13: 00007f902308c000 R14: 000000000000002b R15: 00007f9022b71f40 Fixes: 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action") Signed-off-by: Paul Blakey Link: https://lore.kernel.org/r/20220223163416.24096-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/checksum.h | 5 +++++ net/openvswitch/actions.c | 46 ++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/include/net/checksum.h b/include/net/checksum.h index 02d0c2d01014..79c67f14c448 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -142,6 +142,11 @@ static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } +static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) +{ + *csum = csum_add(csum_sub(*csum, old), new); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 076774034bb9..780d9e2246f3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -423,12 +423,43 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask) { + u8 old_ipv6_tclass = ipv6_get_dsfield(nh); + + ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12), + (__force __wsum)(ipv6_tclass << 12)); + + ipv6_change_dsfield(nh, ~mask, ipv6_tclass); +} + +static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask) +{ + u32 ofl; + + ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2]; + fl = OVS_MASKED(ofl, fl, mask); + /* Bits 21-24 are always unmasked, so this retains their values. */ - OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + nh->flow_lbl[0] = (u8)(fl >> 16); + nh->flow_lbl[1] = (u8)(fl >> 8); + nh->flow_lbl[2] = (u8)fl; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl)); +} + +static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask) +{ + new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8), + (__force __wsum)(new_ttl << 8)); + nh->hop_limit = new_ttl; } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, @@ -546,18 +577,17 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv6_tclass) { - ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass); flow_key->ip.tos = ipv6_get_dsfield(nh); } if (mask->ipv6_label) { - set_ipv6_fl(nh, ntohl(key->ipv6_label), + set_ipv6_fl(skb, nh, ntohl(key->ipv6_label), ntohl(mask->ipv6_label)); flow_key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, - mask->ipv6_hlimit); + set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; From fe20371578ef640069e6ae9fa8038f60e7908565 Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Wed, 23 Feb 2022 09:53:47 -0800 Subject: [PATCH 413/773] Revert "i40e: Fix reset bw limit when DCB enabled with 1 TC" Revert of a patch that instead of fixing a AQ error when trying to reset BW limit introduced several regressions related to creation and managing TC. Currently there are errors when creating a TC on both PF and VF. Error log: [17428.783095] i40e 0000:3b:00.1: AQ command Config VSI BW allocation per TC failed = 14 [17428.783107] i40e 0000:3b:00.1: Failed configuring TC map 0 for VSI 391 [17428.783254] i40e 0000:3b:00.1: AQ command Config VSI BW allocation per TC failed = 14 [17428.783259] i40e 0000:3b:00.1: Unable to configure TC map 0 for VSI 391 This reverts commit 3d2504663c41104b4359a15f35670cfa82de1bbf. Fixes: 3d2504663c41 (i40e: Fix reset bw limit when DCB enabled with 1 TC) Signed-off-by: Mateusz Palczewski Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20220223175347.1690692-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 0c4b7dfb3b35..31b03fe78d3b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5372,15 +5372,7 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, /* There is no need to reset BW when mqprio mode is on. */ if (pf->flags & I40E_FLAG_TC_MQPRIO) return 0; - - if (!vsi->mqprio_qopt.qopt.hw) { - if (pf->flags & I40E_FLAG_DCB_ENABLED) - goto skip_reset; - - if (IS_ENABLED(CONFIG_I40E_DCB) && - i40e_dcb_hw_get_num_tc(&pf->hw) == 1) - goto skip_reset; - + if (!vsi->mqprio_qopt.qopt.hw && !(pf->flags & I40E_FLAG_DCB_ENABLED)) { ret = i40e_set_bw_limit(vsi, vsi->seid, 0); if (ret) dev_info(&pf->pdev->dev, @@ -5388,8 +5380,6 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, vsi->seid); return ret; } - -skip_reset: memset(&bw_data, 0, sizeof(bw_data)); bw_data.tc_valid_bits = enabled_tc; for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) From cd33bdcbead882c2e58fdb4a54a7bd75b610a452 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 23 Feb 2022 22:41:08 -0500 Subject: [PATCH 414/773] ping: remove pr_err from ping_lookup As Jakub noticed, prints should be avoided on the datapath. Also, as packets would never come to the else branch in ping_lookup(), remove pr_err() from ping_lookup(). Fixes: 35a79e64de29 ("ping: fix the dif and sdif check in ping_lookup") Reported-by: Jakub Kicinski Signed-off-by: Xin Long Link: https://lore.kernel.org/r/1ef3f2fcd31bd681a193b1fcf235eee1603819bd.1645674068.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 3a5994b50571..3ee947557b88 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -187,7 +187,6 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif } else { - pr_err("ping: protocol(%x) is not supported\n", ntohs(skb->protocol)); return NULL; } From 6f3c1fc53d86d580d8d6d749c4af23705e4f6f79 Mon Sep 17 00:00:00 2001 From: Liang Zhang Date: Tue, 22 Feb 2022 11:12:39 +0800 Subject: [PATCH 415/773] KVM: x86/mmu: make apf token non-zero to fix bug In current async pagefault logic, when a page is ready, KVM relies on kvm_arch_can_dequeue_async_page_present() to determine whether to deliver a READY event to the Guest. This function test token value of struct kvm_vcpu_pv_apf_data, which must be reset to zero by Guest kernel when a READY event is finished by Guest. If value is zero meaning that a READY event is done, so the KVM can deliver another. But the kvm_arch_setup_async_pf() may produce a valid token with zero value, which is confused with previous mention and may lead the loss of this READY event. This bug may cause task blocked forever in Guest: INFO: task stress:7532 blocked for more than 1254 seconds. Not tainted 5.10.0 #16 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:stress state:D stack: 0 pid: 7532 ppid: 1409 flags:0x00000080 Call Trace: __schedule+0x1e7/0x650 schedule+0x46/0xb0 kvm_async_pf_task_wait_schedule+0xad/0xe0 ? exit_to_user_mode_prepare+0x60/0x70 __kvm_handle_async_pf+0x4f/0xb0 ? asm_exc_page_fault+0x8/0x30 exc_page_fault+0x6f/0x110 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x402d00 RSP: 002b:00007ffd31912500 EFLAGS: 00010206 RAX: 0000000000071000 RBX: ffffffffffffffff RCX: 00000000021a32b0 RDX: 000000000007d011 RSI: 000000000007d000 RDI: 00000000021262b0 RBP: 00000000021262b0 R08: 0000000000000003 R09: 0000000000000086 R10: 00000000000000eb R11: 00007fefbdf2baa0 R12: 0000000000000000 R13: 0000000000000002 R14: 000000000007d000 R15: 0000000000001000 Signed-off-by: Liang Zhang Message-Id: <20220222031239.1076682-1-zhangliang5@huawei.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 593093b52395..8e24f73bf60b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3889,12 +3889,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } +static u32 alloc_apf_token(struct kvm_vcpu *vcpu) +{ + /* make sure the token value is not 0 */ + u32 id = vcpu->arch.apf.id; + + if (id << 12 == 0) + vcpu->arch.apf.id = 1; + + return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; +} + static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gfn_t gfn) { struct kvm_arch_async_pf arch; - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; + arch.token = alloc_apf_token(vcpu); arch.gfn = gfn; arch.direct_map = vcpu->arch.mmu->direct_map; arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); From e910a53fb4f20aa012e46371ffb4c32c8da259b4 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 23 Feb 2022 13:56:49 +0200 Subject: [PATCH 416/773] KVM: x86: nSVM: disallow userspace setting of MSR_AMD64_TSC_RATIO to non default value when tsc scaling disabled If nested tsc scaling is disabled, MSR_AMD64_TSC_RATIO should never have non default value. Due to way nested tsc scaling support was implmented in qemu, it would set this msr to 0 when nested tsc scaling was disabled. Ignore that value for now, as it causes no harm. Fixes: 5228eb96a487 ("KVM: x86: nSVM: implement nested TSC scaling") Cc: stable@vger.kernel.org Signed-off-by: Maxim Levitsky Message-Id: <20220223115649.319134-1-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 821edf664e7a..fd3a00c892c7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2693,8 +2693,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!msr->host_initiated && !svm->tsc_scaling_enabled) - return 1; + + if (!svm->tsc_scaling_enabled) { + + if (!msr->host_initiated) + return 1; + /* + * In case TSC scaling is not enabled, always + * leave this MSR at the default value. + * + * Due to bug in qemu 6.2.0, it would try to set + * this msr to 0 if tsc scaling is not enabled. + * Ignore this value as well. + */ + if (data != 0 && data != svm->tsc_ratio_msr) + return 1; + break; + } if (data & TSC_RATIO_RSVD) return 1; From 42404d8f1c01861b22ccfa1d70f950242720ae57 Mon Sep 17 00:00:00 2001 From: Mauri Sandberg Date: Wed, 23 Feb 2022 16:23:37 +0200 Subject: [PATCH 417/773] net: mv643xx_eth: process retval from of_get_mac_address Obtaining a MAC address may be deferred in cases when the MAC is stored in an NVMEM block, for example, and it may not be ready upon the first retrieval attempt and return EPROBE_DEFER. It is also possible that a port that does not rely on NVMEM has been already created when getting the defer request. Thus, also the resources allocated previously must be freed when doing a roll-back. Fixes: 76723bca2802 ("net: mv643xx_eth: add DT parsing support") Signed-off-by: Mauri Sandberg Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220223142337.41757-1-maukka@ext.kapsi.fi Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mv643xx_eth.c | 24 +++++++++++++--------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 105247582684..143ca8be5eb5 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2704,6 +2704,16 @@ MODULE_DEVICE_TABLE(of, mv643xx_eth_shared_ids); static struct platform_device *port_platdev[3]; +static void mv643xx_eth_shared_of_remove(void) +{ + int n; + + for (n = 0; n < 3; n++) { + platform_device_del(port_platdev[n]); + port_platdev[n] = NULL; + } +} + static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, struct device_node *pnp) { @@ -2740,7 +2750,9 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, return -EINVAL; } - of_get_mac_address(pnp, ppd.mac_addr); + ret = of_get_mac_address(pnp, ppd.mac_addr); + if (ret) + return ret; mv643xx_eth_property(pnp, "tx-queue-size", ppd.tx_queue_size); mv643xx_eth_property(pnp, "tx-sram-addr", ppd.tx_sram_addr); @@ -2804,21 +2816,13 @@ static int mv643xx_eth_shared_of_probe(struct platform_device *pdev) ret = mv643xx_eth_shared_of_add_port(pdev, pnp); if (ret) { of_node_put(pnp); + mv643xx_eth_shared_of_remove(); return ret; } } return 0; } -static void mv643xx_eth_shared_of_remove(void) -{ - int n; - - for (n = 0; n < 3; n++) { - platform_device_del(port_platdev[n]); - port_platdev[n] = NULL; - } -} #else static inline int mv643xx_eth_shared_of_probe(struct platform_device *pdev) { From 3abea10e6a8f0e7804ed4c124bea2d15aca977c8 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Wed, 23 Feb 2022 08:20:24 +0800 Subject: [PATCH 418/773] thermal: int340x: fix memory leak in int3400_notify() It is easy to hit the below memory leaks in my TigerLake platform: unreferenced object 0xffff927c8b91dbc0 (size 32): comm "kworker/0:2", pid 112, jiffies 4294893323 (age 83.604s) hex dump (first 32 bytes): 4e 41 4d 45 3d 49 4e 54 33 34 30 30 20 54 68 65 NAME=INT3400 The 72 6d 61 6c 00 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 rmal.kkkkkkkkkk. backtrace: [] __kmalloc_track_caller+0x2fe/0x4a0 [] kvasprintf+0x65/0xd0 [] kasprintf+0x4e/0x70 [] int3400_notify+0x82/0x120 [int3400_thermal] [] acpi_ev_notify_dispatch+0x54/0x71 [] acpi_os_execute_deferred+0x17/0x30 [] process_one_work+0x21a/0x3f0 [] worker_thread+0x4a/0x3b0 [] kthread+0xfd/0x130 [] ret_from_fork+0x1f/0x30 Fix it by calling kfree() accordingly. Fixes: 38e44da59130 ("thermal: int3400_thermal: process "thermal table changed" event") Signed-off-by: Chuansheng Liu Cc: 4.14+ # 4.14+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 72acb1f61849..4f478812cb51 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -404,6 +404,10 @@ static void int3400_notify(acpi_handle handle, thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d", therm_event); thermal_prop[4] = NULL; kobject_uevent_env(&priv->thermal->device.kobj, KOBJ_CHANGE, thermal_prop); + kfree(thermal_prop[0]); + kfree(thermal_prop[1]); + kfree(thermal_prop[2]); + kfree(thermal_prop[3]); } static int int3400_thermal_get_temp(struct thermal_zone_device *thermal, From dd3b1dc3dd050f1f47cd13e300732852414270f8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 4 Feb 2022 13:12:35 -0800 Subject: [PATCH 419/773] Bluetooth: hci_core: Fix leaking sent_cmd skb sent_cmd memory is not freed before freeing hci_dev causing it to leak it contents. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2b7bd3655b07..2882bc7d79d7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2738,6 +2738,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); + kfree_skb(hdev->sent_cmd); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); From fa78d2d1d64f147062e384a4a10a26a5f89944b5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 16 Feb 2022 12:37:14 +0800 Subject: [PATCH 420/773] Bluetooth: fix data races in smp_unregister(), smp_del_chan() Previous commit e04480920d1e ("Bluetooth: defer cleanup of resources in hci_unregister_dev()") defers all destructive actions to hci_release_dev() to prevent cocurrent problems like NPD, UAF. However, there are still some exceptions that are ignored. The smp_unregister() in hci_dev_close_sync() (previously in hci_dev_do_close) will release resources like the sensitive channel and the smp_dev objects. Consider the situations the device is detaching or power down while the kernel is still operating on it, the following data race could take place. thread-A hci_dev_close_sync | thread-B read_local_oob_ext_data | hci_dev_unlock() | ... | hci_dev_lock() if (hdev->smp_data) | chan = hdev->smp_data | | chan = hdev->smp_data (3) | hdev->smp_data = NULL (1) | if (!chan || !chan->data) (4) ... | smp = chan->data | smp = chan->data if (smp) | chan->data = NULL (2) | ... | kfree_sensitive(smp) | | // dereference smp trigger UFA That is, the objects hdev->smp_data and chan->data both suffer from the data races. In a preempt-enable kernel, the above schedule (when (3) is before (1) and (4) is before (2)) leads to UAF bugs. It can be reproduced in the latest kernel and below is part of the report: [ 49.097146] ================================================================ [ 49.097611] BUG: KASAN: use-after-free in smp_generate_oob+0x2dd/0x570 [ 49.097611] Read of size 8 at addr ffff888006528360 by task generate_oob/155 [ 49.097611] [ 49.097611] Call Trace: [ 49.097611] [ 49.097611] dump_stack_lvl+0x34/0x44 [ 49.097611] print_address_description.constprop.0+0x1f/0x150 [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] kasan_report.cold+0x7f/0x11b [ 49.097611] ? smp_generate_oob+0x2dd/0x570 [ 49.097611] smp_generate_oob+0x2dd/0x570 [ 49.097611] read_local_oob_ext_data+0x689/0xc30 [ 49.097611] ? hci_event_packet+0xc80/0xc80 [ 49.097611] ? sysvec_apic_timer_interrupt+0x9b/0xc0 [ 49.097611] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 49.097611] ? mgmt_init_hdev+0x1c/0x240 [ 49.097611] ? mgmt_init_hdev+0x28/0x240 [ 49.097611] hci_sock_sendmsg+0x1880/0x1e70 [ 49.097611] ? create_monitor_event+0x890/0x890 [ 49.097611] ? create_monitor_event+0x890/0x890 [ 49.097611] sock_sendmsg+0xdf/0x110 [ 49.097611] __sys_sendto+0x19e/0x270 [ 49.097611] ? __ia32_sys_getpeername+0xa0/0xa0 [ 49.097611] ? kernel_fpu_begin_mask+0x1c0/0x1c0 [ 49.097611] __x64_sys_sendto+0xd8/0x1b0 [ 49.097611] ? syscall_exit_to_user_mode+0x1d/0x40 [ 49.097611] do_syscall_64+0x3b/0x90 [ 49.097611] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 49.097611] RIP: 0033:0x7f5a59f51f64 ... [ 49.097611] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5a59f51f64 [ 49.097611] RDX: 0000000000000007 RSI: 00007f5a59d6ac70 RDI: 0000000000000006 [ 49.097611] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 49.097611] R10: 0000000000000040 R11: 0000000000000246 R12: 00007ffec26916ee [ 49.097611] R13: 00007ffec26916ef R14: 00007f5a59d6afc0 R15: 00007f5a59d6b700 To solve these data races, this patch places the smp_unregister() function in the protected area by the hci_dev_lock(). That is, the smp_unregister() function can not be concurrently executed when operating functions (most of them are mgmt operations in mgmt.c) hold the device lock. This patch is tested with kernel LOCK DEBUGGING enabled. The price from the extended holding time of the device lock is supposed to be low as the smp_unregister() function is fairly short and efficient. Signed-off-by: Lin Ma Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 0feb68f12545..e34fc15b7d2c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4106,9 +4106,9 @@ int hci_dev_close_sync(struct hci_dev *hdev) hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - + /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); + hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); From 29fb608396d6a62c1b85acc421ad7a4399085b9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 14 Feb 2022 17:59:38 -0800 Subject: [PATCH 421/773] Bluetooth: Fix bt_skb_sendmmsg not allocating partial chunks Since bt_skb_sendmmsg can be used with the likes of SOCK_STREAM it shall return the partial chunks it could allocate instead of freeing everything as otherwise it can cause problems like bellow. Fixes: 81be03e026dc ("Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg") Reported-by: Paul Menzel Link: https://lore.kernel.org/r/d7206e12-1b99-c3be-84f4-df22af427ef5@molgen.mpg.de BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215594 Signed-off-by: Luiz Augusto von Dentz Tested-by: Paul Menzel (Nokia N9 (MeeGo/Harmattan) Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 4b3d0b16c185..a647e5fabdbd 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -506,8 +506,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, tmp = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom); if (IS_ERR(tmp)) { - kfree_skb(skb); - return tmp; + return skb; } len -= tmp->len; From 2e8ecb4bbc13d4752d64a9f8f5512d59125cab25 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 14 Feb 2022 18:01:56 -0800 Subject: [PATCH 422/773] Bluetooth: assign len after null check len should be assigned after a null check Signed-off-by: Wang Qing Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt_util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index edee60bbc7b4..37eef2ce55ae 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -77,11 +77,12 @@ int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, { struct hci_dev *hdev; struct mgmt_hdr *hdr; - int len = skb->len; + int len; if (!skb) return -EINVAL; + len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ From 80740ebb7e1ad15ab9c11425dcd26e073f86d74b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Feb 2022 07:11:47 -0800 Subject: [PATCH 423/773] Bluetooth: hci_sync: Fix hci_update_accept_list_sync hci_update_accept_list_sync is returning the filter based on the error but that gets overwritten by hci_le_set_addr_resolution_enable_sync return instead of using the actual result of the likes of hci_le_add_accept_list_sync which was intended. Fixes: ad383c2c65a5b ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e34fc15b7d2c..9d8490570b42 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1841,6 +1841,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; + u8 filter_policy; int err; /* Pause advertising if resolving list can be used as controllers are @@ -1927,6 +1928,8 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) err = -EINVAL; done: + filter_policy = err ? 0x00 : 0x01; + /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) @@ -1937,7 +1940,7 @@ done: hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ - return err ? 0x00 : 0x01; + return filter_policy; } /* Returns true if an le connection is in the scanning state */ From a56a1138cbd85e4d565356199d60e1cb94e5a77a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 17 Feb 2022 13:10:38 -0800 Subject: [PATCH 424/773] Bluetooth: hci_sync: Fix not using conn_timeout When using hci_le_create_conn_sync it shall wait for the conn_timeout since the connection complete may take longer than just 2 seconds. Also fix the masking of HCI_EV_LE_ENHANCED_CONN_COMPLETE and HCI_EV_LE_CONN_COMPLETE so they are never both set so we can predict which one the controller will use in case of HCI_OP_LE_CREATE_CONN. Fixes: 6cd29ec6ae5e3 ("Bluetooth: hci_sync: Wait for proper events when connecting LE") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 8 ++++++++ net/bluetooth/hci_sync.c | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..e336e9c1dda4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1489,6 +1489,14 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: + * + * C24: Mandatory if the LE Controller supports Connection State and either + * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported + */ +#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9d8490570b42..9ba2a1a7d481 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3265,10 +3265,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ - /* If the controller supports LL Privacy feature, enable - * the corresponding event. + /* If the controller supports LL Privacy feature or LE Extended Adv, + * enable the corresponding event. */ - if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* If the controller supports Extended Scanner Filter @@ -5188,7 +5188,7 @@ int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + conn->conn_timeout, NULL); } int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -5273,9 +5273,18 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: + * + * If this event is unmasked and the HCI_LE_Connection_Complete event + * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is + * sent when a new connection has been created. + */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, - sizeof(cp), &cp, HCI_EV_LE_CONN_COMPLETE, - HCI_CMD_TIMEOUT, NULL); + sizeof(cp), &cp, + use_enhanced_conn_complete(hdev) ? + HCI_EV_LE_ENHANCED_CONN_COMPLETE : + HCI_EV_LE_CONN_COMPLETE, + conn->conn_timeout, NULL); done: /* Re-enable advertising after the connection attempt is finished. */ From 4e7c4d3652f96f41179aab3ff53025c7a550d689 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 24 Feb 2022 00:26:05 +0530 Subject: [PATCH 425/773] clk: qcom: gdsc: Add support to update GDSC transition delay GDSCs have multiple transition delays which are used for the GDSC FSM states. Older targets/designs required these values to be updated from gdsc code to certain default values for the FSM state to work as expected. But on the newer targets/designs the values updated from the GDSC driver can hamper the FSM state to not work as expected. On SC7180 we observe black screens because the gdsc is being enabled/disabled very rapidly and the GDSC FSM state does not work as expected. This is due to the fact that the GDSC reset value is being updated from SW. Thus add support to update the transition delay from the clock controller gdscs as required. Fixes: 45dd0e55317cc ("clk: qcom: Add support for GDSCs) Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20220223185606.3941-1-tdas@codeaurora.org Reviewed-by: Bjorn Andersson Signed-off-by: Stephen Boyd --- drivers/clk/qcom/gdsc.c | 26 +++++++++++++++++++++----- drivers/clk/qcom/gdsc.h | 8 +++++++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/clk/qcom/gdsc.c b/drivers/clk/qcom/gdsc.c index 7e1dd8ccfa38..44520efc6c72 100644 --- a/drivers/clk/qcom/gdsc.c +++ b/drivers/clk/qcom/gdsc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2015, 2017-2018, The Linux Foundation. All rights reserved. + * Copyright (c) 2015, 2017-2018, 2022, The Linux Foundation. All rights reserved. */ #include @@ -35,9 +35,14 @@ #define CFG_GDSCR_OFFSET 0x4 /* Wait 2^n CXO cycles between all states. Here, n=2 (4 cycles). */ -#define EN_REST_WAIT_VAL (0x2 << 20) -#define EN_FEW_WAIT_VAL (0x8 << 16) -#define CLK_DIS_WAIT_VAL (0x2 << 12) +#define EN_REST_WAIT_VAL 0x2 +#define EN_FEW_WAIT_VAL 0x8 +#define CLK_DIS_WAIT_VAL 0x2 + +/* Transition delay shifts */ +#define EN_REST_WAIT_SHIFT 20 +#define EN_FEW_WAIT_SHIFT 16 +#define CLK_DIS_WAIT_SHIFT 12 #define RETAIN_MEM BIT(14) #define RETAIN_PERIPH BIT(13) @@ -380,7 +385,18 @@ static int gdsc_init(struct gdsc *sc) */ mask = HW_CONTROL_MASK | SW_OVERRIDE_MASK | EN_REST_WAIT_MASK | EN_FEW_WAIT_MASK | CLK_DIS_WAIT_MASK; - val = EN_REST_WAIT_VAL | EN_FEW_WAIT_VAL | CLK_DIS_WAIT_VAL; + + if (!sc->en_rest_wait_val) + sc->en_rest_wait_val = EN_REST_WAIT_VAL; + if (!sc->en_few_wait_val) + sc->en_few_wait_val = EN_FEW_WAIT_VAL; + if (!sc->clk_dis_wait_val) + sc->clk_dis_wait_val = CLK_DIS_WAIT_VAL; + + val = sc->en_rest_wait_val << EN_REST_WAIT_SHIFT | + sc->en_few_wait_val << EN_FEW_WAIT_SHIFT | + sc->clk_dis_wait_val << CLK_DIS_WAIT_SHIFT; + ret = regmap_update_bits(sc->regmap, sc->gdscr, mask, val); if (ret) return ret; diff --git a/drivers/clk/qcom/gdsc.h b/drivers/clk/qcom/gdsc.h index d7cc4c21a9d4..ad313d7210bd 100644 --- a/drivers/clk/qcom/gdsc.h +++ b/drivers/clk/qcom/gdsc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2015, 2017-2018, The Linux Foundation. All rights reserved. + * Copyright (c) 2015, 2017-2018, 2022, The Linux Foundation. All rights reserved. */ #ifndef __QCOM_GDSC_H__ @@ -22,6 +22,9 @@ struct reset_controller_dev; * @cxcs: offsets of branch registers to toggle mem/periph bits in * @cxc_count: number of @cxcs * @pwrsts: Possible powerdomain power states + * @en_rest_wait_val: transition delay value for receiving enr ack signal + * @en_few_wait_val: transition delay value for receiving enf ack signal + * @clk_dis_wait_val: transition delay value for halting clock * @resets: ids of resets associated with this gdsc * @reset_count: number of @resets * @rcdev: reset controller @@ -36,6 +39,9 @@ struct gdsc { unsigned int clamp_io_ctrl; unsigned int *cxcs; unsigned int cxc_count; + unsigned int en_rest_wait_val; + unsigned int en_few_wait_val; + unsigned int clk_dis_wait_val; const u8 pwrsts; /* Powerdomain allowable state bitfields */ #define PWRSTS_OFF BIT(0) From 6e6fec3f961c00ca34ffb4bf2ad9febb4b499f8d Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 24 Feb 2022 00:26:06 +0530 Subject: [PATCH 426/773] clk: qcom: dispcc: Update the transition delay for MDSS GDSC On SC7180 we observe black screens because the gdsc is being enabled/disabled very rapidly and the GDSC FSM state does not work as expected. This is due to the fact that the GDSC reset value is being updated from SW. The recommended transition delay for mdss core gdsc updated for SC7180/SC7280/SM8250. Fixes: dd3d06622138 ("clk: qcom: Add display clock controller driver for SC7180") Fixes: 1a00c962f9cd ("clk: qcom: Add display clock controller driver for SC7280") Fixes: 80a18f4a8567 ("clk: qcom: Add display clock controller driver for SM8150 and SM8250") Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20220223185606.3941-2-tdas@codeaurora.org Reviewed-by: Bjorn Andersson [sboyd@kernel.org: lowercase hex] Signed-off-by: Stephen Boyd --- drivers/clk/qcom/dispcc-sc7180.c | 5 ++++- drivers/clk/qcom/dispcc-sc7280.c | 5 ++++- drivers/clk/qcom/dispcc-sm8250.c | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/clk/qcom/dispcc-sc7180.c b/drivers/clk/qcom/dispcc-sc7180.c index 538e4963c915..5d2ae297e741 100644 --- a/drivers/clk/qcom/dispcc-sc7180.c +++ b/drivers/clk/qcom/dispcc-sc7180.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2019, The Linux Foundation. All rights reserved. + * Copyright (c) 2019, 2022, The Linux Foundation. All rights reserved. */ #include @@ -625,6 +625,9 @@ static struct clk_branch disp_cc_mdss_vsync_clk = { static struct gdsc mdss_gdsc = { .gdscr = 0x3000, + .en_rest_wait_val = 0x2, + .en_few_wait_val = 0x2, + .clk_dis_wait_val = 0xf, .pd = { .name = "mdss_gdsc", }, diff --git a/drivers/clk/qcom/dispcc-sc7280.c b/drivers/clk/qcom/dispcc-sc7280.c index 4ef4ae231794..ad596d567f6a 100644 --- a/drivers/clk/qcom/dispcc-sc7280.c +++ b/drivers/clk/qcom/dispcc-sc7280.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2021-2022, The Linux Foundation. All rights reserved. */ #include @@ -787,6 +787,9 @@ static struct clk_branch disp_cc_sleep_clk = { static struct gdsc disp_cc_mdss_core_gdsc = { .gdscr = 0x1004, + .en_rest_wait_val = 0x2, + .en_few_wait_val = 0x2, + .clk_dis_wait_val = 0xf, .pd = { .name = "disp_cc_mdss_core_gdsc", }, diff --git a/drivers/clk/qcom/dispcc-sm8250.c b/drivers/clk/qcom/dispcc-sm8250.c index 566fdfa0a15b..db9379634fb2 100644 --- a/drivers/clk/qcom/dispcc-sm8250.c +++ b/drivers/clk/qcom/dispcc-sm8250.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2018-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2018-2020, 2022, The Linux Foundation. All rights reserved. */ #include @@ -1126,6 +1126,9 @@ static struct clk_branch disp_cc_mdss_vsync_clk = { static struct gdsc mdss_gdsc = { .gdscr = 0x3000, + .en_rest_wait_val = 0x2, + .en_few_wait_val = 0x2, + .clk_dis_wait_val = 0xf, .pd = { .name = "mdss_gdsc", }, From 0d22b031662ad48d5835e470a90784f4b39adce9 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:30 +0000 Subject: [PATCH 427/773] drm/exynos/exynos7_drm_decon: Use platform_get_irq_byname() to get the interrupt platform_get_resource_byname(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq_byname(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos7_drm_decon.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos7_drm_decon.c b/drivers/gpu/drm/exynos/exynos7_drm_decon.c index 12571ac45540..c04264f70ad1 100644 --- a/drivers/gpu/drm/exynos/exynos7_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos7_drm_decon.c @@ -678,7 +678,6 @@ static int decon_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct decon_context *ctx; struct device_node *i80_if_timings; - struct resource *res; int ret; if (!dev->of_node) @@ -728,16 +727,11 @@ static int decon_probe(struct platform_device *pdev) goto err_iounmap; } - res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, - ctx->i80_if ? "lcd_sys" : "vsync"); - if (!res) { - dev_err(dev, "irq request failed.\n"); - ret = -ENXIO; + ret = platform_get_irq_byname(pdev, ctx->i80_if ? "lcd_sys" : "vsync"); + if (ret < 0) goto err_iounmap; - } - ret = devm_request_irq(dev, res->start, decon_irq_handler, - 0, "drm_decon", ctx); + ret = devm_request_irq(dev, ret, decon_irq_handler, 0, "drm_decon", ctx); if (ret) { dev_err(dev, "irq request failed.\n"); goto err_iounmap; From be52abd4d2b7ea343373cc116a99699a3e3c5573 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:31 +0000 Subject: [PATCH 428/773] drm/exynos: mixer: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_mixer.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index 41c54f1f60bc..e5204be86093 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -809,19 +809,17 @@ static int mixer_resources_init(struct mixer_context *mixer_ctx) return -ENXIO; } - res = platform_get_resource(mixer_ctx->pdev, IORESOURCE_IRQ, 0); - if (res == NULL) { - dev_err(dev, "get interrupt resource failed.\n"); - return -ENXIO; - } + ret = platform_get_irq(mixer_ctx->pdev, 0); + if (ret < 0) + return ret; + mixer_ctx->irq = ret; - ret = devm_request_irq(dev, res->start, mixer_irq_handler, - 0, "drm_mixer", mixer_ctx); + ret = devm_request_irq(dev, mixer_ctx->irq, mixer_irq_handler, + 0, "drm_mixer", mixer_ctx); if (ret) { dev_err(dev, "request interrupt failed.\n"); return ret; } - mixer_ctx->irq = res->start; return 0; } From b342c1f335981ebc442127efe03524d2331a273c Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:32 +0000 Subject: [PATCH 429/773] drm/exynos/exynos_drm_fimd: Use platform_get_irq_byname() to get the interrupt platform_get_resource_byname(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq_byname(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimd.c b/drivers/gpu/drm/exynos/exynos_drm_fimd.c index c735e53939d8..7d5a483a54de 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimd.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimd.c @@ -1133,7 +1133,6 @@ static int fimd_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct fimd_context *ctx; struct device_node *i80_if_timings; - struct resource *res; int ret; if (!dev->of_node) @@ -1206,15 +1205,11 @@ static int fimd_probe(struct platform_device *pdev) if (IS_ERR(ctx->regs)) return PTR_ERR(ctx->regs); - res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, - ctx->i80_if ? "lcd_sys" : "vsync"); - if (!res) { - dev_err(dev, "irq request failed.\n"); - return -ENXIO; - } + ret = platform_get_irq_byname(pdev, ctx->i80_if ? "lcd_sys" : "vsync"); + if (ret < 0) + return ret; - ret = devm_request_irq(dev, res->start, fimd_irq_handler, - 0, "drm_fimd", ctx); + ret = devm_request_irq(dev, ret, fimd_irq_handler, 0, "drm_fimd", ctx); if (ret) { dev_err(dev, "irq request failed.\n"); return ret; From be0a3b7e2a97e3f73004a5b453cc2023d8c1317a Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:33 +0000 Subject: [PATCH 430/773] drm/exynos/fimc: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimc.c b/drivers/gpu/drm/exynos/exynos_drm_fimc.c index 023f54ee61a8..0ee32e4b1e43 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimc.c @@ -1267,7 +1267,6 @@ static int fimc_probe(struct platform_device *pdev) struct exynos_drm_ipp_formats *formats; struct device *dev = &pdev->dev; struct fimc_context *ctx; - struct resource *res; int ret; int i, j, num_limits, num_formats; @@ -1330,14 +1329,12 @@ static int fimc_probe(struct platform_device *pdev) return PTR_ERR(ctx->regs); /* resource irq */ - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!res) { - dev_err(dev, "failed to request irq resource.\n"); - return -ENOENT; - } + ret = platform_get_irq(pdev, 0); + if (ret < 0) + return ret; - ret = devm_request_irq(dev, res->start, fimc_irq_handler, - 0, dev_name(dev), ctx); + ret = devm_request_irq(dev, ret, fimc_irq_handler, + 0, dev_name(dev), ctx); if (ret < 0) { dev_err(dev, "failed to request irq.\n"); return ret; From 586d0902456ad965c9a456fd0a0f451518aed1c5 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 22 Dec 2021 19:01:34 +0000 Subject: [PATCH 431/773] drm/exynos: gsc: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypassed the hierarchical setup and messed up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_gsc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gsc.c b/drivers/gpu/drm/exynos/exynos_drm_gsc.c index 166a80262896..964dceb28c1e 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gsc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gsc.c @@ -1220,7 +1220,6 @@ static int gsc_probe(struct platform_device *pdev) struct gsc_driverdata *driver_data; struct exynos_drm_ipp_formats *formats; struct gsc_context *ctx; - struct resource *res; int num_formats, ret, i, j; ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); @@ -1275,13 +1274,10 @@ static int gsc_probe(struct platform_device *pdev) return PTR_ERR(ctx->regs); /* resource irq */ - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!res) { - dev_err(dev, "failed to request irq resource.\n"); - return -ENOENT; - } + ctx->irq = platform_get_irq(pdev, 0); + if (ctx->irq < 0) + return ctx->irq; - ctx->irq = res->start; ret = devm_request_irq(dev, ctx->irq, gsc_irq_handler, 0, dev_name(dev), ctx); if (ret < 0) { From 0a6e8d0a6df67e0fff9c7d130b89769df4167c2b Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 21 Jan 2022 11:00:39 +0100 Subject: [PATCH 432/773] drm/exynos: Don't fail if no TE-gpio is defined for DSI driver TE-gpio is optional and if it is not found then gpiod_get_optional() returns NULL. In such case the code will continue and try to convert NULL gpiod to irq what in turn fails. The failure is then propagated and driver is not registered. Fix this by returning early from exynos_dsi_register_te_irq() if no TE-gpio is found. Fixes: ee6c8b5afa62 ("drm/exynos: Replace legacy gpio interface for gpiod interface") Signed-off-by: Marek Szyprowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_dsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c index 32a36572b894..14ebbb124852 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c @@ -1335,7 +1335,9 @@ static int exynos_dsi_register_te_irq(struct exynos_dsi *dsi, int te_gpio_irq; dsi->te_gpio = devm_gpiod_get_optional(dsi->dev, "te", GPIOD_IN); - if (IS_ERR(dsi->te_gpio)) { + if (!dsi->te_gpio) { + return 0; + } else if (IS_ERR(dsi->te_gpio)) { dev_err(dsi->dev, "gpio request failed with %ld\n", PTR_ERR(dsi->te_gpio)); return PTR_ERR(dsi->te_gpio); From 4188db23285e28d9e9b9096f856cdcd7868005ee Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 24 Jan 2022 14:52:46 +0100 Subject: [PATCH 433/773] drm/exynos: Search for TE-gpio in DSI panel's node TE-gpio, if defined, is placed in the panel's node, not the parent DSI node. Change the devm_gpiod_get_optional() to gpiod_get_optional() and pass proper device node to it. The code already has a proper cleanup path, so it looks that the devm_* variant has been applied accidentally during the conversion to gpiod API. Fixes: ee6c8b5afa62 ("drm/exynos: Replace legacy gpio interface for gpiod interface") Signed-off-by: Marek Szyprowski Fixed a typo. Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c index 14ebbb124852..d13f5e3a030d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c @@ -1334,7 +1334,7 @@ static int exynos_dsi_register_te_irq(struct exynos_dsi *dsi, int ret; int te_gpio_irq; - dsi->te_gpio = devm_gpiod_get_optional(dsi->dev, "te", GPIOD_IN); + dsi->te_gpio = gpiod_get_optional(panel, "te", GPIOD_IN); if (!dsi->te_gpio) { return 0; } else if (IS_ERR(dsi->te_gpio)) { From aa091a6a91df395a0fa00a808a543301ec99e734 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sat, 19 Feb 2022 15:15:36 +0100 Subject: [PATCH 434/773] clk: lan966x: Fix linking error If the config options HAS_IOMEM is not set then the driver fails to link with the following error: clk-lan966x.c:(.text+0x950): undefined reference to `devm_platform_ioremap_resource' Therefor add missing dependencies: HAS_IOMEM and OF. Fixes: 54104ee02333 ("clk: lan966x: Add lan966x SoC clock driver") Reported-by: kernel test robot Signed-off-by: Horatiu Vultur Link: https://lore.kernel.org/r/20220219141536.460812-1-horatiu.vultur@microchip.com Reviewed-by: Nicolas Ferre Signed-off-by: Stephen Boyd --- drivers/clk/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index ad4256d54361..d4d67fbae869 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -231,6 +231,8 @@ config COMMON_CLK_GEMINI config COMMON_CLK_LAN966X bool "Generic Clock Controller driver for LAN966X SoC" + depends on HAS_IOMEM + depends on OF help This driver provides support for Generic Clock Controller(GCK) on LAN966X SoC. GCK generates and supplies clock to various peripherals From ce33c845b030c9cf768370c951bc699470b09fa7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 20 Feb 2022 23:49:57 +0100 Subject: [PATCH 435/773] tracing: Dump stacktrace trigger to the corresponding instance The stacktrace event trigger is not dumping the stacktrace to the instance where it was enabled, but to the global "instance." Use the private_data, pointing to the trigger file, to figure out the corresponding trace instance, and use it in the trigger action, like snapshot_trigger does. Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d00fee705f9c..e0d50c9577f3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1540,7 +1540,12 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void From 762e52f79c95ea20a7229674ffd13b94d7d8959c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 9 Feb 2022 12:56:23 +0900 Subject: [PATCH 436/773] riscv: fix nommu_k210_sdcard_defconfig Instead of an arbitrary delay, use the "rootwait" kernel option to wait for the mmc root device to be ready. Signed-off-by: Damien Le Moal Reviewed-by: Anup Patel Fixes: 7e09fd3994c5 ("riscv: Add Canaan Kendryte K210 SD card defconfig") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/nommu_k210_sdcard_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 2a82a3b2992b..af64b95e88cc 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -23,7 +23,7 @@ CONFIG_SLOB=y CONFIG_SOC_CANAAN=y CONFIG_SMP=y CONFIG_NR_CPUS=2 -CONFIG_CMDLINE="earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro" +CONFIG_CMDLINE="earlycon console=ttySIF0 root=/dev/mmcblk0p1 rootwait ro" CONFIG_CMDLINE_FORCE=y # CONFIG_SECCOMP is not set # CONFIG_STACKPROTECTOR is not set From 22e2100b1b07d6f5acc71cc1acb53f680c677d77 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sun, 13 Feb 2022 16:18:45 +0800 Subject: [PATCH 437/773] riscv: fix oops caused by irqsoff latency tracer The trace_hardirqs_{on,off}() require the caller to setup frame pointer properly. This because these two functions use macro 'CALLER_ADDR1' (aka. __builtin_return_address(1)) to acquire caller info. If the $fp is used for other purpose, the code generated this macro (as below) could trigger memory access fault. 0xffffffff8011510e <+80>: ld a1,-16(s0) 0xffffffff80115112 <+84>: ld s2,-8(a1) # <-- paging fault here The oops message during booting if compiled with 'irqoff' tracer enabled: [ 0.039615][ T0] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000f8 [ 0.041925][ T0] Oops [#1] [ 0.042063][ T0] Modules linked in: [ 0.042864][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.17.0-rc1-00233-g9a20c48d1ed2 #29 [ 0.043568][ T0] Hardware name: riscv-virtio,qemu (DT) [ 0.044343][ T0] epc : trace_hardirqs_on+0x56/0xe2 [ 0.044601][ T0] ra : restore_all+0x12/0x6e [ 0.044721][ T0] epc : ffffffff80126a5c ra : ffffffff80003b94 sp : ffffffff81403db0 [ 0.044801][ T0] gp : ffffffff8163acd8 tp : ffffffff81414880 t0 : 0000000000000020 [ 0.044882][ T0] t1 : 0098968000000000 t2 : 0000000000000000 s0 : ffffffff81403de0 [ 0.044967][ T0] s1 : 0000000000000000 a0 : 0000000000000001 a1 : 0000000000000100 [ 0.045046][ T0] a2 : 0000000000000000 a3 : 0000000000000000 a4 : 0000000000000000 [ 0.045124][ T0] a5 : 0000000000000000 a6 : 0000000000000000 a7 : 0000000054494d45 [ 0.045210][ T0] s2 : ffffffff80003b94 s3 : ffffffff81a8f1b0 s4 : ffffffff80e27b50 [ 0.045289][ T0] s5 : ffffffff81414880 s6 : ffffffff8160fa00 s7 : 00000000800120e8 [ 0.045389][ T0] s8 : 0000000080013100 s9 : 000000000000007f s10: 0000000000000000 [ 0.045474][ T0] s11: 0000000000000000 t3 : 7fffffffffffffff t4 : 0000000000000000 [ 0.045548][ T0] t5 : 0000000000000000 t6 : ffffffff814aa368 [ 0.045620][ T0] status: 0000000200000100 badaddr: 00000000000000f8 cause: 000000000000000d [ 0.046402][ T0] [] restore_all+0x12/0x6e This because the $fp(aka. $s0) register is not used as frame pointer in the assembly entry code. resume_kernel: REG_L s0, TASK_TI_PREEMPT_COUNT(tp) bnez s0, restore_all REG_L s0, TASK_TI_FLAGS(tp) andi s0, s0, _TIF_NEED_RESCHED beqz s0, restore_all call preempt_schedule_irq j restore_all To fix above issue, here we add one extra level wrapper for function trace_hardirqs_{on,off}() so they can be safely called by low level entry code. Signed-off-by: Changbin Du Fixes: 3c4697982982 ("riscv: Enable LOCKDEP_SUPPORT & fixup TRACE_IRQFLAGS_SUPPORT") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 2 ++ arch/riscv/kernel/entry.S | 10 +++++----- arch/riscv/kernel/trace_irq.c | 27 +++++++++++++++++++++++++++ arch/riscv/kernel/trace_irq.h | 11 +++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 arch/riscv/kernel/trace_irq.c create mode 100644 arch/riscv/kernel/trace_irq.h diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 612556faa527..ffc87e76b1dd 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -51,6 +51,8 @@ obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o +obj-$(CONFIG_TRACE_IRQFLAGS) += trace_irq.o + obj-$(CONFIG_RISCV_BASE_PMU) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index ed29e9c8f660..d6a46ed0bf05 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -108,7 +108,7 @@ _save_context: .option pop #ifdef CONFIG_TRACE_IRQFLAGS - call trace_hardirqs_off + call __trace_hardirqs_off #endif #ifdef CONFIG_CONTEXT_TRACKING @@ -143,7 +143,7 @@ skip_context_tracking: li t0, EXC_BREAKPOINT beq s4, t0, 1f #ifdef CONFIG_TRACE_IRQFLAGS - call trace_hardirqs_on + call __trace_hardirqs_on #endif csrs CSR_STATUS, SR_IE @@ -234,7 +234,7 @@ ret_from_exception: REG_L s0, PT_STATUS(sp) csrc CSR_STATUS, SR_IE #ifdef CONFIG_TRACE_IRQFLAGS - call trace_hardirqs_off + call __trace_hardirqs_off #endif #ifdef CONFIG_RISCV_M_MODE /* the MPP value is too large to be used as an immediate arg for addi */ @@ -270,10 +270,10 @@ restore_all: REG_L s1, PT_STATUS(sp) andi t0, s1, SR_PIE beqz t0, 1f - call trace_hardirqs_on + call __trace_hardirqs_on j 2f 1: - call trace_hardirqs_off + call __trace_hardirqs_off 2: #endif REG_L a0, PT_STATUS(sp) diff --git a/arch/riscv/kernel/trace_irq.c b/arch/riscv/kernel/trace_irq.c new file mode 100644 index 000000000000..095ac976d7da --- /dev/null +++ b/arch/riscv/kernel/trace_irq.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Changbin Du + */ + +#include +#include +#include "trace_irq.h" + +/* + * trace_hardirqs_on/off require the caller to setup frame pointer properly. + * Otherwise, CALLER_ADDR1 might trigger an pagging exception in kernel. + * Here we add one extra level so they can be safely called by low + * level entry code which $fp is used for other purpose. + */ + +void __trace_hardirqs_on(void) +{ + trace_hardirqs_on(); +} +NOKPROBE_SYMBOL(__trace_hardirqs_on); + +void __trace_hardirqs_off(void) +{ + trace_hardirqs_off(); +} +NOKPROBE_SYMBOL(__trace_hardirqs_off); diff --git a/arch/riscv/kernel/trace_irq.h b/arch/riscv/kernel/trace_irq.h new file mode 100644 index 000000000000..99fe67377e5e --- /dev/null +++ b/arch/riscv/kernel/trace_irq.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 Changbin Du + */ +#ifndef __TRACE_IRQ_H +#define __TRACE_IRQ_H + +void __trace_hardirqs_on(void); +void __trace_hardirqs_off(void); + +#endif /* __TRACE_IRQ_H */ From 07c2c7a3b622e109ba4d2efd916da0477617ce81 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 24 Feb 2022 16:52:57 -0800 Subject: [PATCH 438/773] mptcp: accurate SIOCOUTQ for fallback socket The MPTCP SIOCOUTQ implementation is not very accurate in case of fallback: it only measures the data in the MPTCP-level write queue, but it does not take in account the subflow write queue utilization. In case of fallback the first can be empty, while the latter is not. The above produces sporadic self-tests issues and can foul legit user-space application. Fix the issue additionally querying the subflow in case of fallback. Fixes: 644807e3e462 ("mptcp: add SIOCINQ, OUTQ and OUTQNSD ioctls") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/260 Reported-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f60f01b14fac..12bb28c5007e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3294,6 +3294,17 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return 0; delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. + */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } if (delta > INT_MAX) delta = INT_MAX; From 63bb8239d80571204c61d19c73f7bf5e3d9ef5fa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 24 Feb 2022 16:52:58 -0800 Subject: [PATCH 439/773] selftests: mptcp: do complete cleanup at exit After commit 05be5e273c84 ("selftests: mptcp: add disconnect tests") the mptcp selftests leave behind a couple of tmp files after each run. run_tests_disconnect() misnames a few variables used to track them. Address the issue setting the appropriate global variables Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index cb5809b89081..f0f4ab96b8f3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -763,8 +763,8 @@ run_tests_disconnect() run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-I 3 -i $old_cin" # restore previous status - cout=$old_cout - cout_disconnect="$cout".disconnect + sin=$old_sin + sin_disconnect="$cout".disconnect cin=$old_cin cin_disconnect="$cin".disconnect connect_per_transfer=1 From 877d11f0332cd2160e19e3313e262754c321fa36 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 24 Feb 2022 16:52:59 -0800 Subject: [PATCH 440/773] mptcp: Correctly set DATA_FIN timeout when number of retransmits is large Syzkaller with UBSAN uncovered a scenario where a large number of DATA_FIN retransmits caused a shift-out-of-bounds in the DATA_FIN timeout calculation: ================================================================================ UBSAN: shift-out-of-bounds in net/mptcp/protocol.c:470:29 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 1 PID: 13059 Comm: kworker/1:0 Not tainted 5.17.0-rc2-00630-g5fbf21c90c60 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: events mptcp_worker Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_shift_out_of_bounds.cold+0xb2/0x20e lib/ubsan.c:330 mptcp_set_datafin_timeout net/mptcp/protocol.c:470 [inline] __mptcp_retrans.cold+0x72/0x77 net/mptcp/protocol.c:2445 mptcp_worker+0x58a/0xa70 net/mptcp/protocol.c:2528 process_one_work+0x9df/0x16d0 kernel/workqueue.c:2307 worker_thread+0x95/0xe10 kernel/workqueue.c:2454 kthread+0x2f4/0x3b0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 ================================================================================ This change limits the maximum timeout by limiting the size of the shift, which keeps all intermediate values in-bounds. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/259 Fixes: 6477dd39e62c ("mptcp: Retransmit DATA_FIN") Acked-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 12bb28c5007e..1c72f25f083e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -466,9 +466,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); + + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) From f4896248e9025ff744b4147e6758274a1cb8cbae Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 12 Feb 2022 20:27:13 +0900 Subject: [PATCH 441/773] can: etas_es58x: change opened_channel_cnt's type from atomic_t to u8 The driver uses an atomic_t variable: struct es58x_device::opened_channel_cnt to keep track of the number of opened channels in order to only allocate memory for the URBs when this count changes from zero to one. While the intent was to prevent race conditions, the choice of an atomic_t turns out to be a bad idea for several reasons: - implementation is incorrect and fails to decrement opened_channel_cnt when the URB allocation fails as reported in [1]. - even if opened_channel_cnt were to be correctly decremented, atomic_t is insufficient to cover edge cases: there can be a race condition in which 1/ a first process fails to allocate URBs memory 2/ a second process enters es58x_open() before the first process does its cleanup and decrements opened_channed_cnt. In which case, the second process would successfully return despite the URBs memory not being allocated. - actually, any kind of locking mechanism was useless here because it is redundant with the network stack big kernel lock (a.k.a. rtnl_lock) which is being hold by all the callers of net_device_ops:ndo_open() and net_device_ops:ndo_close(). c.f. the ASSERST_RTNL() calls in __dev_open() [2] and __dev_close_many() [3]. The atmomic_t is thus replaced by a simple u8 type and the logic to increment and decrement es58x_device:opened_channel_cnt is simplified accordingly fixing the bug reported in [1]. We do not check again for ASSERST_RTNL() as this is already done by the callers. [1] https://lore.kernel.org/linux-can/20220201140351.GA2548@kili/T/#u [2] https://elixir.bootlin.com/linux/v5.16/source/net/core/dev.c#L1463 [3] https://elixir.bootlin.com/linux/v5.16/source/net/core/dev.c#L1541 Fixes: 8537257874e9 ("can: etas_es58x: add core support for ETAS ES58X CAN USB interfaces") Link: https://lore.kernel.org/all/20220212112713.577957-1-mailhol.vincent@wanadoo.fr Reported-by: Dan Carpenter Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/etas_es58x/es58x_core.c | 9 +++++---- drivers/net/can/usb/etas_es58x/es58x_core.h | 8 +++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index 2ed2370a3166..2d73ebbf3836 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -1787,7 +1787,7 @@ static int es58x_open(struct net_device *netdev) struct es58x_device *es58x_dev = es58x_priv(netdev)->es58x_dev; int ret; - if (atomic_inc_return(&es58x_dev->opened_channel_cnt) == 1) { + if (!es58x_dev->opened_channel_cnt) { ret = es58x_alloc_rx_urbs(es58x_dev); if (ret) return ret; @@ -1805,12 +1805,13 @@ static int es58x_open(struct net_device *netdev) if (ret) goto free_urbs; + es58x_dev->opened_channel_cnt++; netif_start_queue(netdev); return ret; free_urbs: - if (atomic_dec_and_test(&es58x_dev->opened_channel_cnt)) + if (!es58x_dev->opened_channel_cnt) es58x_free_urbs(es58x_dev); netdev_err(netdev, "%s: Could not open the network device: %pe\n", __func__, ERR_PTR(ret)); @@ -1845,7 +1846,8 @@ static int es58x_stop(struct net_device *netdev) es58x_flush_pending_tx_msg(netdev); - if (atomic_dec_and_test(&es58x_dev->opened_channel_cnt)) + es58x_dev->opened_channel_cnt--; + if (!es58x_dev->opened_channel_cnt) es58x_free_urbs(es58x_dev); return 0; @@ -2215,7 +2217,6 @@ static struct es58x_device *es58x_init_es58x_dev(struct usb_interface *intf, init_usb_anchor(&es58x_dev->tx_urbs_idle); init_usb_anchor(&es58x_dev->tx_urbs_busy); atomic_set(&es58x_dev->tx_urbs_idle_cnt, 0); - atomic_set(&es58x_dev->opened_channel_cnt, 0); usb_set_intfdata(intf, es58x_dev); es58x_dev->rx_pipe = usb_rcvbulkpipe(es58x_dev->udev, diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.h b/drivers/net/can/usb/etas_es58x/es58x_core.h index 826a15871573..e5033cb5e695 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.h +++ b/drivers/net/can/usb/etas_es58x/es58x_core.h @@ -373,8 +373,6 @@ struct es58x_operators { * queue wake/stop logic should prevent this URB from getting * empty. Please refer to es58x_get_tx_urb() for more details. * @tx_urbs_idle_cnt: number of urbs in @tx_urbs_idle. - * @opened_channel_cnt: number of channels opened (c.f. es58x_open() - * and es58x_stop()). * @ktime_req_ns: kernel timestamp when es58x_set_realtime_diff_ns() * was called. * @realtime_diff_ns: difference in nanoseconds between the clocks of @@ -384,6 +382,10 @@ struct es58x_operators { * in RX branches. * @rx_max_packet_size: Maximum length of bulk-in URB. * @num_can_ch: Number of CAN channel (i.e. number of elements of @netdev). + * @opened_channel_cnt: number of channels opened. Free of race + * conditions because its two users (net_device_ops:ndo_open() + * and net_device_ops:ndo_close()) guarantee that the network + * stack big kernel lock (a.k.a. rtnl_mutex) is being hold. * @rx_cmd_buf_len: Length of @rx_cmd_buf. * @rx_cmd_buf: The device might split the URB commands in an * arbitrary amount of pieces. This buffer is used to concatenate @@ -406,7 +408,6 @@ struct es58x_device { struct usb_anchor tx_urbs_busy; struct usb_anchor tx_urbs_idle; atomic_t tx_urbs_idle_cnt; - atomic_t opened_channel_cnt; u64 ktime_req_ns; s64 realtime_diff_ns; @@ -415,6 +416,7 @@ struct es58x_device { u16 rx_max_packet_size; u8 num_can_ch; + u8 opened_channel_cnt; u16 rx_cmd_buf_len; union es58x_urb_cmd rx_cmd_buf; From 035b0fcf02707d3c9c2890dc1484b11aa5335eb1 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 15 Feb 2022 08:48:14 +0900 Subject: [PATCH 442/773] can: gs_usb: change active_channels's type from atomic_t to u8 The driver uses an atomic_t variable: gs_usb:active_channels to keep track of the number of opened channels in order to only allocate memory for the URBs when this count changes from zero to one. However, the driver does not decrement the counter when an error occurs in gs_can_open(). This issue is fixed by changing the type from atomic_t to u8 and by simplifying the logic accordingly. It is safe to use an u8 here because the network stack big kernel lock (a.k.a. rtnl_mutex) is being hold. For details, please refer to [1]. [1] https://lore.kernel.org/linux-can/CAMZ6Rq+sHpiw34ijPsmp7vbUpDtJwvVtdV7CvRZJsLixjAFfrg@mail.gmail.com/T/#t Fixes: d08e973a77d1 ("can: gs_usb: Added support for the GS_USB CAN devices") Link: https://lore.kernel.org/all/20220214234814.1321599-1-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index b487e3fe770a..d35749fad1ef 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -191,8 +191,8 @@ struct gs_can { struct gs_usb { struct gs_can *canch[GS_MAX_INTF]; struct usb_anchor rx_submitted; - atomic_t active_channels; struct usb_device *udev; + u8 active_channels; }; /* 'allocate' a tx context. @@ -589,7 +589,7 @@ static int gs_can_open(struct net_device *netdev) if (rc) return rc; - if (atomic_add_return(1, &parent->active_channels) == 1) { + if (!parent->active_channels) { for (i = 0; i < GS_MAX_RX_URBS; i++) { struct urb *urb; u8 *buf; @@ -690,6 +690,7 @@ static int gs_can_open(struct net_device *netdev) dev->can.state = CAN_STATE_ERROR_ACTIVE; + parent->active_channels++; if (!(dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)) netif_start_queue(netdev); @@ -705,7 +706,8 @@ static int gs_can_close(struct net_device *netdev) netif_stop_queue(netdev); /* Stop polling */ - if (atomic_dec_and_test(&parent->active_channels)) + parent->active_channels--; + if (!parent->active_channels) usb_kill_anchored_urbs(&parent->rx_submitted); /* Stop sending URBs */ @@ -984,8 +986,6 @@ static int gs_usb_probe(struct usb_interface *intf, init_usb_anchor(&dev->rx_submitted); - atomic_set(&dev->active_channels, 0); - usb_set_intfdata(intf, dev); dev->udev = interface_to_usbdev(intf); From bca06b85fcaf866602e328b3bcd86f74180eca14 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Feb 2022 19:19:16 +0000 Subject: [PATCH 443/773] Revert "KVM: VMX: Save HOST_CR3 in vmx_set_host_fs_gs()" Undo a nested VMX fix as a step toward reverting the commit it fixed, 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()"), as the underlying premise that "host CR3 in the vcpu thread can only be changed when scheduling" is wrong. This reverts commit a9f2705ec84449e3b8d70c804766f8e97e23080d. Signed-off-by: Sean Christopherson Message-Id: <20220224191917.3508476-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 20 +++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 ++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba34e94049c7..c12f95004a72 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -246,8 +246,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; - vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel, - src->fs_base, src->gs_base); + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index efda5e4d6247..beb68cd28aca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1080,14 +1080,9 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } -void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, - u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base) +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) { - if (unlikely(cr3 != host->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host->cr3 = cr3; - } if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); @@ -1119,6 +1114,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif + unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1182,8 +1178,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(), - fs_sel, gs_sel, fs_base, gs_base); + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + + /* Host CR3 including its PCID is stable when guest state is loaded. */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != host_state->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host_state->cr3 = cr3; + } vmx->guest_state_loaded = true; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 7f2c82e7f38f..9c6bfcd84008 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,9 +374,8 @@ int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); -void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, - u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base); +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); From 1a71581012ddf1f465040ef3d9f700341fa3cf04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Feb 2022 19:19:17 +0000 Subject: [PATCH 444/773] Revert "KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()" Revert back to refreshing vmcs.HOST_CR3 immediately prior to VM-Enter. The PCID (ASID) part of CR3 can be bumped without KVM being scheduled out, as the kernel will switch CR3 during __text_poke(), e.g. in response to a static key toggling. If switch_mm_irqs_off() chooses a new ASID for the mm associate with KVM, KVM will do VM-Enter => VM-Exit with a stale vmcs.HOST_CR3. Add a comment to explain why KVM must wait until VM-Enter is imminent to refresh vmcs.HOST_CR3. The following splat was captured by stashing vmcs.HOST_CR3 in kvm_vcpu and adding a WARN in load_new_mm_cr3() to fire if a new ASID is being loaded for the KVM-associated mm while KVM has a "running" vCPU: static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); ... WARN(vcpu && (vcpu->cr3 & GENMASK(11, 0)) != (new_mm_cr3 & GENMASK(11, 0)) && (vcpu->cr3 & PHYSICAL_PAGE_MASK) == (new_mm_cr3 & PHYSICAL_PAGE_MASK), "KVM is hosed, loading CR3 = %lx, vmcs.HOST_CR3 = %lx", new_mm_cr3, vcpu->cr3); } ------------[ cut here ]------------ KVM is hosed, loading CR3 = 8000000105393004, vmcs.HOST_CR3 = 105393003 WARNING: CPU: 4 PID: 20717 at arch/x86/mm/tlb.c:291 load_new_mm_cr3+0x82/0xe0 Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel CPU: 4 PID: 20717 Comm: stable Tainted: G W 5.17.0-rc3+ #747 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:load_new_mm_cr3+0x82/0xe0 RSP: 0018:ffffc9000489fa98 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 8000000105393004 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff888277d1b788 RBP: 0000000000000004 R08: ffff888277d1b780 R09: ffffc9000489f8b8 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: ffff88810678a800 R14: 0000000000000004 R15: 0000000000000c33 FS: 00007fa9f0e72700(0000) GS:ffff888277d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001001b5003 CR4: 0000000000172ea0 Call Trace: switch_mm_irqs_off+0x1cb/0x460 __text_poke+0x308/0x3e0 text_poke_bp_batch+0x168/0x220 text_poke_finish+0x1b/0x30 arch_jump_label_transform_apply+0x18/0x30 static_key_slow_inc_cpuslocked+0x7c/0x90 static_key_slow_inc+0x16/0x20 kvm_lapic_set_base+0x116/0x190 kvm_set_apic_base+0xa5/0xe0 kvm_set_msr_common+0x2f4/0xf60 vmx_set_msr+0x355/0xe70 [kvm_intel] kvm_set_msr_ignored_check+0x91/0x230 kvm_emulate_wrmsr+0x36/0x120 vmx_handle_exit+0x609/0x6c0 [kvm_intel] kvm_arch_vcpu_ioctl_run+0x146f/0x1b80 kvm_vcpu_ioctl+0x279/0x690 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae ---[ end trace 0000000000000000 ]--- This reverts commit 15ad9762d69fd8e40a4a51828c1d6b0c1b8fbea0. Fixes: 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()") Reported-by: Wanpeng Li Cc: Lai Jiangshan Signed-off-by: Sean Christopherson Acked-by: Lai Jiangshan Message-Id: <20220224191917.3508476-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +++++++- arch/x86/kvm/vmx/vmx.c | 24 ++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c12f95004a72..dc822a1d403d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3055,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr4; + unsigned long cr3, cr4; bool vm_fail; if (!nested_early_check) @@ -3078,6 +3078,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) */ vmcs_writel(GUEST_RFLAGS, 0); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index beb68cd28aca..b730d799c26e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1114,7 +1114,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif - unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1179,14 +1178,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - - /* Host CR3 including its PCID is stable when guest state is loaded. */ - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != host_state->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host_state->cr3 = cr3; - } - vmx->guest_state_loaded = true; } @@ -6793,7 +6784,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr4; + unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6836,6 +6827,19 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time + * it switches back to the current->mm, which can occur in KVM context + * when switching to a temporary mm to patch kernel code, e.g. if KVM + * toggles a static key while handling a VM-Exit. + */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); From dcf4ff7a48e7598e6b10126cc02177abb8ae4f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 23 Feb 2022 22:19:54 +0100 Subject: [PATCH 445/773] xen/netfront: destroy queues before real_num_tx_queues is zeroed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xennet_destroy_queues() relies on info->netdev->real_num_tx_queues to delete queues. Since d7dac083414eb5bb99a6d2ed53dc2c1b405224e5 ("net-sysfs: update the queue counts in the unregistration path"), unregister_netdev() indirectly sets real_num_tx_queues to 0. Those two facts together means, that xennet_destroy_queues() called from xennet_remove() cannot do its job, because it's called after unregister_netdev(). This results in kfree-ing queues that are still linked in napi, which ultimately crashes: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 52 Comm: xenwatch Tainted: G W 5.16.10-1.32.fc32.qubes.x86_64+ #226 RIP: 0010:free_netdev+0xa3/0x1a0 Code: ff 48 89 df e8 2e e9 00 00 48 8b 43 50 48 8b 08 48 8d b8 a0 fe ff ff 48 8d a9 a0 fe ff ff 49 39 c4 75 26 eb 47 e8 ed c1 66 ff <48> 8b 85 60 01 00 00 48 8d 95 60 01 00 00 48 89 ef 48 2d 60 01 00 RSP: 0000:ffffc90000bcfd00 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88800edad000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffc90000bcfc30 RDI: 00000000ffffffff RBP: fffffffffffffea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff88800edad050 R13: ffff8880065f8f88 R14: 0000000000000000 R15: ffff8880066c6680 FS: 0000000000000000(0000) GS:ffff8880f3300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000e998c006 CR4: 00000000003706e0 Call Trace: xennet_remove+0x13d/0x300 [xen_netfront] xenbus_dev_remove+0x6d/0xf0 __device_release_driver+0x17a/0x240 device_release_driver+0x24/0x30 bus_remove_device+0xd8/0x140 device_del+0x18b/0x410 ? _raw_spin_unlock+0x16/0x30 ? klist_iter_exit+0x14/0x20 ? xenbus_dev_request_and_reply+0x80/0x80 device_unregister+0x13/0x60 xenbus_dev_changed+0x18e/0x1f0 xenwatch_thread+0xc0/0x1a0 ? do_wait_intr_irq+0xa0/0xa0 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Fix this by calling xennet_destroy_queues() from xennet_uninit(), when real_num_tx_queues is still available. This ensures that queues are destroyed when real_num_tx_queues is set to 0, regardless of how unregister_netdev() was called. Originally reported at https://github.com/QubesOS/qubes-issues/issues/7257 Fixes: d7dac083414eb5bb9 ("net-sysfs: update the queue counts in the unregistration path") Cc: stable@vger.kernel.org Signed-off-by: Marek Marczykowski-Górecki Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 8b18246ad999..7748f07e2cf1 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -842,6 +842,28 @@ static int xennet_close(struct net_device *dev) return 0; } +static void xennet_destroy_queues(struct netfront_info *info) +{ + unsigned int i; + + for (i = 0; i < info->netdev->real_num_tx_queues; i++) { + struct netfront_queue *queue = &info->queues[i]; + + if (netif_running(info->netdev)) + napi_disable(&queue->napi); + netif_napi_del(&queue->napi); + } + + kfree(info->queues); + info->queues = NULL; +} + +static void xennet_uninit(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + xennet_destroy_queues(np); +} + static void xennet_set_rx_rsp_cons(struct netfront_queue *queue, RING_IDX val) { unsigned long flags; @@ -1611,6 +1633,7 @@ static int xennet_xdp(struct net_device *dev, struct netdev_bpf *xdp) } static const struct net_device_ops xennet_netdev_ops = { + .ndo_uninit = xennet_uninit, .ndo_open = xennet_open, .ndo_stop = xennet_close, .ndo_start_xmit = xennet_start_xmit, @@ -2103,22 +2126,6 @@ error: return err; } -static void xennet_destroy_queues(struct netfront_info *info) -{ - unsigned int i; - - for (i = 0; i < info->netdev->real_num_tx_queues; i++) { - struct netfront_queue *queue = &info->queues[i]; - - if (netif_running(info->netdev)) - napi_disable(&queue->napi); - netif_napi_del(&queue->napi); - } - - kfree(info->queues); - info->queues = NULL; -} - static int xennet_create_page_pool(struct netfront_queue *queue) From 087a7b944c5db409f7c1a68bf4896c56ba54eaff Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 24 Feb 2022 12:38:29 +0100 Subject: [PATCH 446/773] net: stmmac: only enable DMA interrupts when ready In this driver's ->ndo_open() callback, it enables DMA interrupts, starts the DMA channels, then requests interrupts with request_irq(), and then finally enables napi. If RX DMA interrupts are received before napi is enabled, no processing is done because napi_schedule_prep() will return false. If the network has a lot of broadcast/multicast traffic, then the RX ring could fill up completely before napi is enabled. When this happens, no further RX interrupts will be delivered, and the driver will fail to receive any packets. Fix this by only enabling DMA interrupts after all other initialization is complete. Fixes: 523f11b5d4fd72efb ("net: stmmac: move hardware setup for stmmac_open to new function") Reported-by: Lars Persson Signed-off-by: Vincent Whitchurch Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index bde76ea2deec..cb9b6e08780c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2262,6 +2262,23 @@ static void stmmac_stop_tx_dma(struct stmmac_priv *priv, u32 chan) stmmac_stop_tx(priv, priv->ioaddr, chan); } +static void stmmac_enable_all_dma_irq(struct stmmac_priv *priv) +{ + u32 rx_channels_count = priv->plat->rx_queues_to_use; + u32 tx_channels_count = priv->plat->tx_queues_to_use; + u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); + u32 chan; + + for (chan = 0; chan < dma_csr_ch; chan++) { + struct stmmac_channel *ch = &priv->channel[chan]; + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + stmmac_enable_dma_irq(priv, priv->ioaddr, chan, 1, 1); + spin_unlock_irqrestore(&ch->lock, flags); + } +} + /** * stmmac_start_all_dma - start all RX and TX DMA channels * @priv: driver private structure @@ -2904,8 +2921,10 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) stmmac_axi(priv, priv->ioaddr, priv->plat->axi); /* DMA CSR Channel configuration */ - for (chan = 0; chan < dma_csr_ch; chan++) + for (chan = 0; chan < dma_csr_ch; chan++) { stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan); + stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 1, 1); + } /* DMA RX Channel Configuration */ for (chan = 0; chan < rx_channels_count; chan++) { @@ -3761,6 +3780,7 @@ static int stmmac_open(struct net_device *dev) stmmac_enable_all_queues(priv); netif_tx_start_all_queues(priv->dev); + stmmac_enable_all_dma_irq(priv); return 0; @@ -6510,8 +6530,10 @@ int stmmac_xdp_open(struct net_device *dev) } /* DMA CSR Channel configuration */ - for (chan = 0; chan < dma_csr_ch; chan++) + for (chan = 0; chan < dma_csr_ch; chan++) { stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan); + stmmac_disable_dma_irq(priv, priv->ioaddr, chan, 1, 1); + } /* Adjust Split header */ sph_en = (priv->hw->rx_csum > 0) && priv->sph; @@ -6572,6 +6594,7 @@ int stmmac_xdp_open(struct net_device *dev) stmmac_enable_all_queues(priv); netif_carrier_on(dev); netif_tx_start_all_queues(dev); + stmmac_enable_all_dma_irq(priv); return 0; @@ -7451,6 +7474,7 @@ int stmmac_resume(struct device *dev) stmmac_restore_hw_vlan_rx_fltr(priv, ndev, priv->hw); stmmac_enable_all_queues(priv); + stmmac_enable_all_dma_irq(priv); mutex_unlock(&priv->lock); rtnl_unlock(); From 9f1c50cf39167ff71dc5953a3234f3f6eeb8fcb5 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 24 Feb 2022 23:26:19 +0800 Subject: [PATCH 447/773] net/smc: fix connection leak There's a potential leak issue under following execution sequence : smc_release smc_connect_work if (sk->sk_state == SMC_INIT) send_clc_confirim tcp_abort(); ... sk.sk_state = SMC_ACTIVE smc_close_active switch(sk->sk_state) { ... case SMC_ACTIVE: smc_close_final() // then wait peer closed Unfortunately, tcp_abort() may discard CLC CONFIRM messages that are still in the tcp send buffer, in which case our connection token cannot be delivered to the server side, which means that we cannot get a passive close message at all. Therefore, it is impossible for the to be disconnected at all. This patch tries a very simple way to avoid this issue, once the state has changed to SMC_ACTIVE after tcp_abort(), we can actively abort the smc connection, considering that the state is SMC_INIT before tcp_abort(), abandoning the complete disconnection process should not cause too much problem. In fact, this problem may exist as long as the CLC CONFIRM message is not received by the server. Whether a timer should be added after smc_close_final() needs to be discussed in the future. But even so, this patch provides a faster release for connection in above case, it should also be valuable. Fixes: 39f41f367b08 ("net/smc: common release code for non-accepted sockets") Signed-off-by: D. Wythe Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 306d9e8cd1dd..81984c1c0e78 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -183,7 +183,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; @@ -191,8 +191,10 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) @@ -206,6 +208,10 @@ static int smc_release(struct socket *sock) else lock_sock(sk); + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + rc = __smc_release(smc); /* detach socket */ From 91b0383fef06f20b847fa9e4f0e3054ead0b1a1b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 24 Feb 2022 18:01:54 +0200 Subject: [PATCH 448/773] net: dcb: flush lingering app table entries for unregistered devices If I'm not mistaken (and I don't think I am), the way in which the dcbnl_ops work is that drivers call dcb_ieee_setapp() and this populates the application table with dynamically allocated struct dcb_app_type entries that are kept in the module-global dcb_app_list. However, nobody keeps exact track of these entries, and although dcb_ieee_delapp() is supposed to remove them, nobody does so when the interface goes away (example: driver unbinds from device). So the dcb_app_list will contain lingering entries with an ifindex that no longer matches any device in dcb_app_lookup(). Reclaim the lost memory by listening for the NETDEV_UNREGISTER event and flushing the app table entries of interfaces that are now gone. In fact something like this used to be done as part of the initial commit (blamed below), but it was done in dcbnl_exit() -> dcb_flushapp(), essentially at module_exit time. That became dead code after commit 7a6b6f515f77 ("DCB: fix kconfig option") which essentially merged "tristate config DCB" and "bool config DCBNL" into a single "bool config DCB", so net/dcb/dcbnl.c could not be built as a module anymore. Commit 36b9ad8084bd ("net/dcb: make dcbnl.c explicitly non-modular") recognized this and deleted dcbnl_exit() and dcb_flushapp() altogether, leaving us with the version we have today. Since flushing application table entries can and should be done as soon as the netdevice disappears, fundamentally the commit that is to blame is the one that introduced the design of this API. Fixes: 9ab933ab2cc8 ("dcbnl: add appliction tlv handlers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index b441ab330fd3..36c91273daac 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2073,8 +2073,52 @@ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); +static void dcbnl_flush_dev(struct net_device *dev) +{ + struct dcb_app_type *itr, *tmp; + + spin_lock(&dcb_lock); + + list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { + if (itr->ifindex == dev->ifindex) { + list_del(&itr->list); + kfree(itr); + } + } + + spin_unlock(&dcb_lock); +} + +static int dcbnl_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_UNREGISTER: + if (!dev->dcbnl_ops) + return NOTIFY_DONE; + + dcbnl_flush_dev(dev); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block dcbnl_nb __read_mostly = { + .notifier_call = dcbnl_netdevice_event, +}; + static int __init dcbnl_init(void) { + int err; + + err = register_netdevice_notifier(&dcbnl_nb); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); From 8d0657f39f487d904fca713e0bc39c2707382553 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:51 -0800 Subject: [PATCH 449/773] ibmvnic: free reset-work-item when flushing Fix a tiny memory leak when flushing the reset work queue. Fixes: 2770a7984db5 ("ibmvnic: Introduce hard reset recovery") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index dee05a353dbd..86d522171892 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2784,8 +2784,10 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, * flush reset queue and process this reset */ if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) { - list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) + list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { list_del(entry); + kfree(list_entry(entry, struct ibmvnic_rwi, list)); + } } rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); From 765559b10ce514eb1576595834f23cdc92125fee Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:52 -0800 Subject: [PATCH 450/773] ibmvnic: initialize rc before completing wait We should initialize ->init_done_rc before calling complete(). Otherwise the waiting thread may see ->init_done_rc as 0 before we have updated it and may assume that the CRQ was successful. Fixes: 6b278c0cb378 ("ibmvnic delay complete()") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 86d522171892..c73c699600a8 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5323,9 +5323,9 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, } if (!completion_done(&adapter->init_done)) { - complete(&adapter->init_done); if (!adapter->init_done_rc) adapter->init_done_rc = -EAGAIN; + complete(&adapter->init_done); } break; From 83da53f7e4bd86dca4b2edc1e2bb324fb3c033a1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:53 -0800 Subject: [PATCH 451/773] ibmvnic: define flush_reset_queue helper Define and use a helper to flush the reset queue. Fixes: 2770a7984db5 ("ibmvnic: Introduce hard reset recovery") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c73c699600a8..42c3ac9ebb75 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2735,12 +2735,23 @@ static void __ibmvnic_delayed_reset(struct work_struct *work) __ibmvnic_reset(&adapter->ibmvnic_reset); } +static void flush_reset_queue(struct ibmvnic_adapter *adapter) +{ + struct list_head *entry, *tmp_entry; + + if (!list_empty(&adapter->rwi_list)) { + list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { + list_del(entry); + kfree(list_entry(entry, struct ibmvnic_rwi, list)); + } + } +} + static int ibmvnic_reset(struct ibmvnic_adapter *adapter, enum ibmvnic_reset_reason reason) { - struct list_head *entry, *tmp_entry; - struct ibmvnic_rwi *rwi, *tmp; struct net_device *netdev = adapter->netdev; + struct ibmvnic_rwi *rwi, *tmp; unsigned long flags; int ret; @@ -2783,12 +2794,9 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, /* if we just received a transport event, * flush reset queue and process this reset */ - if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) { - list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { - list_del(entry); - kfree(list_entry(entry, struct ibmvnic_rwi, list)); - } - } + if (adapter->force_reset_recovery) + flush_reset_queue(adapter); + rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); netdev_dbg(adapter->netdev, "Scheduling reset (reason %s)\n", From 36491f2df9ad2501e5a4ec25d3d95d72bafd2781 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:54 -0800 Subject: [PATCH 452/773] ibmvnic: complete init_done on transport events If we get a transport event, set the error and mark the init as complete so the attempt to send crq-init or login fail sooner rather than wait for the timeout. Fixes: bbd669a868bb ("ibmvnic: Fix completion structure initialization") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 42c3ac9ebb75..5913d372bc27 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5356,6 +5356,13 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, adapter->fw_done_rc = -EIO; complete(&adapter->fw_done); } + + /* if we got here during crq-init, retry crq-init */ + if (!completion_done(&adapter->init_done)) { + adapter->init_done_rc = -EAGAIN; + complete(&adapter->init_done); + } + if (!completion_done(&adapter->stats_done)) complete(&adapter->stats_done); if (test_bit(0, &adapter->resetting)) From 570425f8c7c18b14fa8a2a58a0adb431968ad118 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:55 -0800 Subject: [PATCH 453/773] ibmvnic: register netdev after init of adapter Finish initializing the adapter before registering netdev so state is consistent. Fixes: c26eba03e407 ("ibmvnic: Update reset infrastructure to support tunable parameters") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5913d372bc27..a7b03ca109d8 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5826,12 +5826,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) goto ibmvnic_dev_file_err; netif_carrier_off(netdev); - rc = register_netdev(netdev); - if (rc) { - dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); - goto ibmvnic_register_fail; - } - dev_info(&dev->dev, "ibmvnic registered\n"); if (init_success) { adapter->state = VNIC_PROBED; @@ -5844,6 +5838,14 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) adapter->wait_for_reset = false; adapter->last_reset_time = jiffies; + + rc = register_netdev(netdev); + if (rc) { + dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); + goto ibmvnic_register_fail; + } + dev_info(&dev->dev, "ibmvnic registered\n"); + return 0; ibmvnic_register_fail: From ae16bf15374d8b055e040ac6f3f1147ab1c9bb7d Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:56 -0800 Subject: [PATCH 454/773] ibmvnic: init init_done_rc earlier We currently initialize the ->init_done completion/return code fields before issuing a CRQ_INIT command. But if we get a transport event soon after registering the CRQ the taskslet may already have recorded the completion and error code. If we initialize here, we might overwrite/ lose that and end up issuing the CRQ_INIT only to timeout later. If that timeout happens during probe, we will leave the adapter in the DOWN state rather than retrying to register/init the CRQ. Initialize the completion before registering the CRQ so we don't lose the notification. Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index a7b03ca109d8..16a8f92adbda 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2212,6 +2212,19 @@ static const char *reset_reason_to_string(enum ibmvnic_reset_reason reason) return "UNKNOWN"; } +/* + * Initialize the init_done completion and return code values. We + * can get a transport event just after registering the CRQ and the + * tasklet will use this to communicate the transport event. To ensure + * we don't miss the notification/error, initialize these _before_ + * regisering the CRQ. + */ +static inline void reinit_init_done(struct ibmvnic_adapter *adapter) +{ + reinit_completion(&adapter->init_done); + adapter->init_done_rc = 0; +} + /* * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. @@ -2318,6 +2331,8 @@ static int do_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; + reinit_init_done(adapter); + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { rc = init_crq_queue(adapter); } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { @@ -2461,7 +2476,8 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; - reinit_completion(&adapter->init_done); + reinit_init_done(adapter); + rc = init_crq_queue(adapter); if (rc) { netdev_err(adapter->netdev, @@ -5679,10 +5695,6 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) adapter->from_passive_init = false; - if (reset) - reinit_completion(&adapter->init_done); - - adapter->init_done_rc = 0; rc = ibmvnic_send_crq_init(adapter); if (rc) { dev_err(dev, "Send crq init failed with error %d\n", rc); @@ -5696,12 +5708,14 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) if (adapter->init_done_rc) { release_crq_queue(adapter); + dev_err(dev, "CRQ-init failed, %d\n", adapter->init_done_rc); return adapter->init_done_rc; } if (adapter->from_passive_init) { adapter->state = VNIC_OPEN; adapter->from_passive_init = false; + dev_err(dev, "CRQ-init failed, passive-init\n"); return -EINVAL; } @@ -5795,6 +5809,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) init_success = false; do { + reinit_init_done(adapter); + rc = init_crq_queue(adapter); if (rc) { dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", From f628ad531b4f34fdba0984255b4a2850dd369513 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:57 -0800 Subject: [PATCH 455/773] ibmvnic: clear fop when retrying probe Clear ->failover_pending flag that may have been set in the previous pass of registering CRQ. If we don't clear, a subsequent ibmvnic_open() call would be misled into thinking a failover is pending and assuming that the reset worker thread would open the adapter. If this pass of registering the CRQ succeeds (i.e there is no transport event), there wouldn't be a reset worker thread. This would leave the adapter unconfigured and require manual intervention to bring it up during boot. Fixes: 5a18e1e0c193 ("ibmvnic: Fix failover case for non-redundant configuration") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 16a8f92adbda..93580c68600a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5811,6 +5811,11 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) do { reinit_init_done(adapter); + /* clear any failovers we got in the previous pass + * since we are reinitializing the CRQ + */ + adapter->failover_pending = false; + rc = init_crq_queue(adapter); if (rc) { dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", From fd98693cb0721317f27341951593712c580c36a1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 24 Feb 2022 22:23:58 -0800 Subject: [PATCH 456/773] ibmvnic: Allow queueing resets during probe We currently don't allow queuing resets when adapter is in VNIC_PROBING state - instead we throw away the reset and return EBUSY. The reasoning is probably that during ibmvnic_probe() the ibmvnic_adapter itself is being initialized so performing a reset during this time can lead us to accessing fields in the ibmvnic_adapter that are not fully initialized. A review of the code shows that all the adapter state neede to process a reset is initialized before registering the CRQ so that should no longer be a concern. Further the expectation is that if we do get a reset (transport event) during probe, the do..while() loop in ibmvnic_probe() will handle this by reinitializing the CRQ. While that is true to some extent, it is possible that the reset might occur _after_ the CRQ is registered and CRQ_INIT message was exchanged but _before_ the adapter state is set to VNIC_PROBED. As mentioned above, such a reset will be thrown away. While the client assumes that the adapter is functional, the vnic server will wait for the client to reinit the adapter. This disconnect between the two leaves the adapter down needing manual intervention. Because ibmvnic_probe() has other work to do after initializing the CRQ (such as registering the netdev at a minimum) and because the reset event can occur at any instant after the CRQ is initialized, there will always be a window between initializing the CRQ and considering the adapter ready for resets (ie state == PROBED). So rather than discarding resets during this window, allow queueing them - but only process them after the adapter is fully initialized. To do this, introduce a new completion state ->probe_done and have the reset worker thread wait on this before processing resets. This change brings up two new situations in or just after ibmvnic_probe(). First after one or more resets were queued, we encounter an error and decide to retry the initialization. At that point the queued resets are no longer relevant since we could be talking to a new vnic server. So we must purge/flush the queued resets before restarting the initialization. As a side note, since we are still in the probing stage and we have not registered the netdev, it will not be CHANGE_PARAM reset. Second this change opens up a potential race between the worker thread in __ibmvnic_reset(), the tasklet and the ibmvnic_open() due to the following sequence of events: 1. Register CRQ 2. Get transport event before CRQ_INIT completes. 3. Tasklet schedules reset: a) add rwi to list b) schedule_work() to start worker thread which runs and waits for ->probe_done. 4. ibmvnic_probe() decides to retry, purges rwi_list 5. Re-register crq and this time rest of probe succeeds - register netdev and complete(->probe_done). 6. Worker thread resumes in __ibmvnic_reset() from 3b. 7. Worker thread sets ->resetting bit 8. ibmvnic_open() comes in, notices ->resetting bit, sets state to IBMVNIC_OPEN and returns early expecting worker thread to finish the open. 9. Worker thread finds rwi_list empty and returns without opening the interface. If this happens, the ->ndo_open() call is effectively lost and the interface remains down. To address this, ensure that ->rwi_list is not empty before setting the ->resetting bit. See also comments in __ibmvnic_reset(). Fixes: 6a2fb0e99f9c ("ibmvnic: driver initialization for kdump/kexec") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 107 ++++++++++++++++++++++++++--- drivers/net/ethernet/ibm/ibmvnic.h | 1 + 2 files changed, 98 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 93580c68600a..b423e94956f1 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2618,23 +2618,82 @@ out: static void __ibmvnic_reset(struct work_struct *work) { struct ibmvnic_adapter *adapter; - bool saved_state = false; + unsigned int timeout = 5000; struct ibmvnic_rwi *tmprwi; + bool saved_state = false; struct ibmvnic_rwi *rwi; unsigned long flags; - u32 reset_state; + struct device *dev; + bool need_reset; int num_fails = 0; + u32 reset_state; int rc = 0; adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); + dev = &adapter->vdev->dev; - if (test_and_set_bit_lock(0, &adapter->resetting)) { + /* Wait for ibmvnic_probe() to complete. If probe is taking too long + * or if another reset is in progress, defer work for now. If probe + * eventually fails it will flush and terminate our work. + * + * Three possibilities here: + * 1. Adpater being removed - just return + * 2. Timed out on probe or another reset in progress - delay the work + * 3. Completed probe - perform any resets in queue + */ + if (adapter->state == VNIC_PROBING && + !wait_for_completion_timeout(&adapter->probe_done, timeout)) { + dev_err(dev, "Reset thread timed out on probe"); queue_delayed_work(system_long_wq, &adapter->ibmvnic_delayed_reset, IBMVNIC_RESET_DELAY); return; } + /* adapter is done with probe (i.e state is never VNIC_PROBING now) */ + if (adapter->state == VNIC_REMOVING) + return; + + /* ->rwi_list is stable now (no one else is removing entries) */ + + /* ibmvnic_probe() may have purged the reset queue after we were + * scheduled to process a reset so there maybe no resets to process. + * Before setting the ->resetting bit though, we have to make sure + * that there is infact a reset to process. Otherwise we may race + * with ibmvnic_open() and end up leaving the vnic down: + * + * __ibmvnic_reset() ibmvnic_open() + * ----------------- -------------- + * + * set ->resetting bit + * find ->resetting bit is set + * set ->state to IBMVNIC_OPEN (i.e + * assume reset will open device) + * return + * find reset queue empty + * return + * + * Neither performed vnic login/open and vnic stays down + * + * If we hold the lock and conditionally set the bit, either we + * or ibmvnic_open() will complete the open. + */ + need_reset = false; + spin_lock(&adapter->rwi_lock); + if (!list_empty(&adapter->rwi_list)) { + if (test_and_set_bit_lock(0, &adapter->resetting)) { + queue_delayed_work(system_long_wq, + &adapter->ibmvnic_delayed_reset, + IBMVNIC_RESET_DELAY); + } else { + need_reset = true; + } + } + spin_unlock(&adapter->rwi_lock); + + if (!need_reset) + return; + rwi = get_next_rwi(adapter); while (rwi) { spin_lock_irqsave(&adapter->state_lock, flags); @@ -2786,13 +2845,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, goto err; } - if (adapter->state == VNIC_PROBING) { - netdev_warn(netdev, "Adapter reset during probe\n"); - adapter->init_done_rc = -EAGAIN; - ret = EAGAIN; - goto err; - } - list_for_each_entry(tmp, &adapter->rwi_list, list) { if (tmp->reset_reason == reason) { netdev_dbg(netdev, "Skipping matching reset, reason=%s\n", @@ -5755,6 +5807,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) struct ibmvnic_adapter *adapter; struct net_device *netdev; unsigned char *mac_addr_p; + unsigned long flags; bool init_success; int rc; @@ -5799,6 +5852,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) spin_lock_init(&adapter->rwi_lock); spin_lock_init(&adapter->state_lock); mutex_init(&adapter->fw_lock); + init_completion(&adapter->probe_done); init_completion(&adapter->init_done); init_completion(&adapter->fw_done); init_completion(&adapter->reset_done); @@ -5816,6 +5870,26 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) */ adapter->failover_pending = false; + /* If we had already initialized CRQ, we may have one or + * more resets queued already. Discard those and release + * the CRQ before initializing the CRQ again. + */ + release_crq_queue(adapter); + + /* Since we are still in PROBING state, __ibmvnic_reset() + * will not access the ->rwi_list and since we released CRQ, + * we won't get _new_ transport events. But there maybe an + * ongoing ibmvnic_reset() call. So serialize access to + * rwi_list. If we win the race, ibvmnic_reset() could add + * a reset after we purged but thats ok - we just may end + * up with an extra reset (i.e similar to having two or more + * resets in the queue at once). + * CHECK. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); + flush_reset_queue(adapter); + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + rc = init_crq_queue(adapter); if (rc) { dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", @@ -5867,6 +5941,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } dev_info(&dev->dev, "ibmvnic registered\n"); + complete(&adapter->probe_done); + return 0; ibmvnic_register_fail: @@ -5881,6 +5957,17 @@ ibmvnic_stats_fail: ibmvnic_init_fail: release_sub_crqs(adapter, 1); release_crq_queue(adapter); + + /* cleanup worker thread after releasing CRQ so we don't get + * transport events (i.e new work items for the worker thread). + */ + adapter->state = VNIC_REMOVING; + complete(&adapter->probe_done); + flush_work(&adapter->ibmvnic_reset); + flush_delayed_work(&adapter->ibmvnic_delayed_reset); + + flush_reset_queue(adapter); + mutex_destroy(&adapter->fw_lock); free_netdev(netdev); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 4a7a56ff74ce..fa2d607a7b1b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -930,6 +930,7 @@ struct ibmvnic_adapter { struct ibmvnic_tx_pool *tx_pool; struct ibmvnic_tx_pool *tso_pool; + struct completion probe_done; struct completion init_done; int init_done_rc; From 767b9825ed1765894e569a3d698749d40d83762a Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 25 Feb 2022 04:37:27 -0800 Subject: [PATCH 457/773] net: chelsio: cxgb3: check the return value of pci_find_capability() The function pci_find_capability() in t3_prep_adapter() can fail, so its return value should be checked. Fixes: 4d22de3e6cc4 ("Add support for the latest 1G/10G Chelsio adapter, T3") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index da41eee2f25c..a06003bfa04b 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -3613,6 +3613,8 @@ int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai, MAC_STATS_ACCUM_SECS : (MAC_STATS_ACCUM_SECS * 10); adapter->params.pci.vpd_cap_addr = pci_find_capability(adapter->pdev, PCI_CAP_ID_VPD); + if (!adapter->params.pci.vpd_cap_addr) + return -ENODEV; ret = get_vpd_params(adapter, &adapter->params.vpd); if (ret < 0) return ret; From b3a34dc362c03215031b268fcc0b988e69490231 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Fri, 25 Feb 2022 11:15:16 +0100 Subject: [PATCH 458/773] net: sparx5: Fix add vlan when invalid operation Check if operation is valid before changing any settings in hardware. Otherwise it results in changes being made despite it not being a valid operation. Fixes: 78eab33bb68b ("net: sparx5: add vlan support") Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- .../ethernet/microchip/sparx5/sparx5_vlan.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c b/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c index 4ce490a25f33..8e56ffa1c4f7 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c @@ -58,16 +58,6 @@ int sparx5_vlan_vid_add(struct sparx5_port *port, u16 vid, bool pvid, struct sparx5 *sparx5 = port->sparx5; int ret; - /* Make the port a member of the VLAN */ - set_bit(port->portno, sparx5->vlan_mask[vid]); - ret = sparx5_vlant_set_mask(sparx5, vid); - if (ret) - return ret; - - /* Default ingress vlan classification */ - if (pvid) - port->pvid = vid; - /* Untagged egress vlan classification */ if (untagged && port->vid != vid) { if (port->vid) { @@ -79,6 +69,16 @@ int sparx5_vlan_vid_add(struct sparx5_port *port, u16 vid, bool pvid, port->vid = vid; } + /* Make the port a member of the VLAN */ + set_bit(port->portno, sparx5->vlan_mask[vid]); + ret = sparx5_vlant_set_mask(sparx5, vid); + if (ret) + return ret; + + /* Default ingress vlan classification */ + if (pvid) + port->pvid = vid; + sparx5_vlan_port_apply(sparx5, port); return 0; From 456f89e0928ab938122a40e9f094a6524cc158b4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Feb 2022 13:16:24 +0000 Subject: [PATCH 459/773] KVM: selftests: aarch64: Skip tests if we can't create a vgic-v3 The arch_timer and vgic_irq kselftests assume that they can create a vgic-v3, using the library function vgic_v3_setup() which aborts with a test failure if it is not possible to do so. Since vgic-v3 can only be instantiated on systems where the host has GICv3 this leads to false positives on older systems where that is not the case. Fix this by changing vgic_v3_setup() to return an error if the vgic can't be instantiated and have the callers skip if this happens. We could also exit flagging a skip in vgic_v3_setup() but this would prevent future test cases conditionally deciding which GIC to use or generally doing more complex output. Signed-off-by: Mark Brown Reviewed-by: Andrew Jones Tested-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220223131624.1830351-1-broonie@kernel.org --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 7 ++++++- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 4 ++++ tools/testing/selftests/kvm/lib/aarch64/vgic.c | 4 +++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 9ad38bd360a4..b08d30bf71c5 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -366,6 +366,7 @@ static struct kvm_vm *test_vm_create(void) { struct kvm_vm *vm; unsigned int i; + int ret; int nr_vcpus = test_args.nr_vcpus; vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); @@ -382,7 +383,11 @@ static struct kvm_vm *test_vm_create(void) ucall_init(vm, NULL); test_init_timer_irq(vm); - vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + ret = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + if (ret < 0) { + print_skip("Failed to create vgic-v3"); + exit(KSFT_SKIP); + } /* Make all the test's cmdline args visible to the guest */ sync_global_to_guest(vm, test_args); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index e6c7d7f8fbd1..7eca97799917 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -761,6 +761,10 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) gic_fd = vgic_v3_setup(vm, 1, nr_irqs, GICD_BASE_GPA, GICR_BASE_GPA); + if (gic_fd < 0) { + print_skip("Failed to create vgic-v3, skipping"); + exit(KSFT_SKIP); + } vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handlers[args.eoi_split][args.level_sensitive]); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index b3a0fca0d780..f5cd0c536d85 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -52,7 +52,9 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, nr_vcpus, nr_vcpus_created); /* Distributor setup */ - gic_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + if (_kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, + false, &gic_fd) != 0) + return -1; kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs, true); From 40cd58dbf121e1d0c18f1bd4dd10335ae45a28fc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 25 Feb 2022 00:29:40 -0800 Subject: [PATCH 460/773] x86/kvm: Don't use PV TLB/yield when mwait is advertised MWAIT is advertised in host is not overcommitted scenario, however, PV TLB/sched yield should be enabled in host overcommitted scenario. Let's add the MWAIT checking when enabling PV TLB/sched yield. Signed-off-by: Wanpeng Li Message-Id: <1645777780-2581-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f734e3b0cfec..491e1d9ca750 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -463,6 +463,7 @@ static bool pv_tlb_flush_supported(void) return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && (num_possible_cpus() != 1)); } @@ -477,6 +478,7 @@ static bool pv_sched_yield_supported(void) return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && (num_possible_cpus() != 1)); } From 3c51d0a6c761c2025c6db1ed4d3a7273167bf899 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 22 Feb 2022 01:02:03 -0800 Subject: [PATCH 461/773] x86/kvm: Don't waste memory if kvmclock is disabled Even if "no-kvmclock" is passed in cmdline parameter, the guest kernel still allocates hvclock_mem which is scaled by the number of vCPUs, let's check kvmclock enable in advance to avoid this memory waste. Signed-off-by: Wanpeng Li Message-Id: <1645520523-30814-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index a35cbf9107af..44ed677d401f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -239,6 +239,9 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { + if (!kvmclock) + return 0; + kvmclock_init_mem(); #ifdef CONFIG_X86_64 From 92e68cc558774de01024c18e8b35cdce4731c910 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 25 Feb 2022 00:46:00 -0800 Subject: [PATCH 462/773] x86/kvmclock: Fix Hyper-V Isolated VM's boot issue when vCPUs > 64 When Linux runs as an Isolated VM on Hyper-V, it supports AMD SEV-SNP but it's partially enlightened, i.e. cc_platform_has( CC_ATTR_GUEST_MEM_ENCRYPT) is true but sev_active() is false. Commit 4d96f9109109 per se is good, but with it now kvm_setup_vsyscall_timeinfo() -> kvmclock_init_mem() calls set_memory_decrypted(), and later gets stuck when trying to zere out the pages pointed by 'hvclock_mem', if Linux runs as an Isolated VM on Hyper-V. The cause is that here now the Linux VM should no longer access the original guest physical addrss (GPA); instead the VM should do memremap() and access the original GPA + ms_hyperv.shared_gpa_boundary: see the example code in drivers/hv/connection.c: vmbus_connect() or drivers/hv/ring_buffer.c: hv_ringbuffer_init(). If the VM tries to access the original GPA, it keepts getting injected a fault by Hyper-V and gets stuck there. Here the issue happens only when the VM has >=65 vCPUs, because the global static array hv_clock_boot[] can hold 64 "struct pvclock_vsyscall_time_info" (the sizeof of the struct is 64 bytes), so kvmclock_init_mem() only allocates memory in the case of vCPUs > 64. Since the 'hvclock_mem' pages are only useful when the kvm clock is supported by the underlying hypervisor, fix the issue by returning early when Linux VM runs on Hyper-V, which doesn't support kvm clock. Fixes: 4d96f9109109 ("x86/sev: Replace occurrences of sev_active() with cc_platform_has()") Tested-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Signed-off-by: Dexuan Cui Message-Id: <20220225084600.17817-1-decui@microsoft.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44ed677d401f..c5caa7311bd8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { - if (!kvmclock) + if (!kvm_para_available() || !kvmclock) return 0; kvmclock_init_mem(); From 9ee83635d872812f3920209c606c6ea9e412ffcc Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 9 Feb 2022 12:16:41 +0800 Subject: [PATCH 463/773] KVM: x86: Yield to IPI target vCPU only if it is busy When sending a call-function IPI-many to vCPUs, yield to the IPI target vCPU which is marked as preempted. but when emulating HLT, an idling vCPU will be voluntarily scheduled out and mark as preempted from the guest kernel perspective. yielding to idle vCPU is pointless and increase unnecessary vmexit, maybe miss the true preempted vCPU so yield to IPI target vCPU only if vCPU is busy and preempted Signed-off-by: Li RongQing Message-Id: <1644380201-29423-1-git-send-email-lirongqing@baidu.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 491e1d9ca750..d77481ecb0d5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -624,7 +624,7 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) /* Make sure other vCPUs get a chance to run if they need to. */ for_each_cpu(cpu, mask) { - if (vcpu_is_preempted(cpu)) { + if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); break; } From 0ac983f512033cb7b5e210c9589768ad25b1e36b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Feb 2022 08:32:28 -0600 Subject: [PATCH 464/773] ucounts: Fix systemd LimitNPROC with private users regression Long story short recursively enforcing RLIMIT_NPROC when it is not enforced on the process that creates a new user namespace, causes currently working code to fail. There is no reason to enforce RLIMIT_NPROC recursively when we don't enforce it normally so update the code to detect this case. I would like to simply use capable(CAP_SYS_RESOURCE) to detect when RLIMIT_NPROC is not enforced upon the caller. Unfortunately because RLIMIT_NPROC is charged and checked for enforcement based upon the real uid, using capable() which is euid based is inconsistent with reality. Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by testing for when the real uid would match the conditions when CAP_SYS_RESOURCE would be present if the real uid was the effective uid. Reported-by: Etienne Dechamps Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596 Link: https://lkml.kernel.org/r/e9589141-cfeb-90cd-2d0e-83a62787239a@edechamps.fr Link: https://lkml.kernel.org/r/87sfs8jmpz.fsf_-_@email.froward.int.ebiederm.org Cc: stable@vger.kernel.org Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6b2e3ca7ee99..5481ba44a8d6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->user_ns = user_ns; } +static unsigned long enforced_nproc_rlimit(void) +{ + unsigned long limit = RLIM_INFINITY; + + /* Is RLIMIT_NPROC currently enforced? */ + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || + (current_user_ns() != &init_user_ns)) + limit = rlimit(RLIMIT_NPROC); + + return limit; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); From c5048a7b2c23ab589f3476a783bd586b663eda5b Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 21 Feb 2022 22:59:35 +0000 Subject: [PATCH 465/773] can: rcar_canfd: rcar_canfd_channel_probe(): register the CAN device when fully ready Register the CAN device only when all the necessary initialization is completed. This patch makes sure all the data structures and locks are initialized before registering the CAN device. Link: https://lore.kernel.org/all/20220221225935.12300-1-prabhakar.mahadev-lad.rj@bp.renesas.com Reported-by: Pavel Machek Signed-off-by: Lad Prabhakar Reviewed-by: Pavel Machek Reviewed-by: Ulrich Hecht Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/rcar_canfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index b7dc1c32875f..acd74725831f 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1715,15 +1715,15 @@ static int rcar_canfd_channel_probe(struct rcar_canfd_global *gpriv, u32 ch, netif_napi_add(ndev, &priv->napi, rcar_canfd_rx_poll, RCANFD_NAPI_WEIGHT); + spin_lock_init(&priv->tx_lock); + devm_can_led_init(ndev); + gpriv->ch[priv->channel] = priv; err = register_candev(ndev); if (err) { dev_err(&pdev->dev, "register_candev() failed, error %d\n", err); goto fail_candev; } - spin_lock_init(&priv->tx_lock); - devm_can_led_init(ndev); - gpriv->ch[priv->channel] = priv; dev_info(&pdev->dev, "device registered (channel %u)\n", priv->channel); return 0; From 50e06ddceeea263f57fe92baa677c638ecd65bb6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Feb 2022 19:35:28 -0800 Subject: [PATCH 466/773] net: sxgbe: fix return value of __setup handler __setup() handlers should return 1 on success, i.e., the parameter has been handled. A return of 0 causes the "option=value" string to be added to init's environment strings, polluting it. Fixes: acc18c147b22 ("net: sxgbe: add EEE(Energy Efficient Ethernet) for Samsung sxgbe") Fixes: 1edb9ca69e8a ("net: sxgbe: add basic framework for Samsung 10Gb ethernet driver") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Cc: Siva Reddy Cc: Girish K S Cc: Byungho An Link: https://lore.kernel.org/r/20220224033528.24640-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index 32161a56726c..2881f5b2b5f4 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -2285,18 +2285,18 @@ static int __init sxgbe_cmdline_opt(char *str) char *opt; if (!str || !*str) - return -EINVAL; + return 1; while ((opt = strsep(&str, ",")) != NULL) { if (!strncmp(opt, "eee_timer:", 10)) { if (kstrtoint(opt + 10, 0, &eee_timer)) goto err; } } - return 0; + return 1; err: pr_err("%s: ERROR broken module parameter conversion\n", __func__); - return -EINVAL; + return 1; } __setup("sxgbeeth=", sxgbe_cmdline_opt); From e01b042e580f1fbf4fd8da467442451da00c7a90 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Feb 2022 19:35:36 -0800 Subject: [PATCH 467/773] net: stmmac: fix return value of __setup handler __setup() handlers should return 1 on success, i.e., the parameter has been handled. A return of 0 causes the "option=value" string to be added to init's environment strings, polluting it. Fixes: 47dd7a540b8a ("net: add support for STMicroelectronics Ethernet controllers.") Fixes: f3240e2811f0 ("stmmac: remove warning when compile as built-in (V2)") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Cc: Jose Abreu Link: https://lore.kernel.org/r/20220224033536.25056-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index cb9b6e08780c..422e3225f476 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7491,7 +7491,7 @@ static int __init stmmac_cmdline_opt(char *str) char *opt; if (!str || !*str) - return -EINVAL; + return 1; while ((opt = strsep(&str, ",")) != NULL) { if (!strncmp(opt, "debug:", 6)) { if (kstrtoint(opt + 6, 0, &debug)) @@ -7522,11 +7522,11 @@ static int __init stmmac_cmdline_opt(char *str) goto err; } } - return 0; + return 1; err: pr_err("%s: ERROR broken module parameter conversion", __func__); - return -EINVAL; + return 1; } __setup("stmmaceth=", stmmac_cmdline_opt); From 302e9edd54985f584cfc180098f3554774126969 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 23 Feb 2022 22:38:37 -0500 Subject: [PATCH 468/773] tracing: Have traceon and traceoff trigger honor the instance If a trigger is set on an event to disable or enable tracing within an instance, then tracing should be disabled or enabled in the instance and not at the top level, which is confusing to users. Link: https://lkml.kernel.org/r/20220223223837.14f94ec3@rorschach.local.home Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Tested-by: Daniel Bristot de Oliveira Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 52 +++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e0d50c9577f3..efe563140f27 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1295,6 +1295,16 @@ traceon_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_on(file->tr); + return; + } + if (tracing_is_on()) return; @@ -1306,8 +1316,15 @@ traceon_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + } else { + if (tracing_is_on()) + return; + } if (!data->count) return; @@ -1315,7 +1332,10 @@ traceon_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_on(); + if (file) + tracer_tracing_on(file->tr); + else + tracing_on(); } static void @@ -1323,6 +1343,16 @@ traceoff_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_off(file->tr); + return; + } + if (!tracing_is_on()) return; @@ -1334,8 +1364,15 @@ traceoff_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (!tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + } else { + if (!tracing_is_on()) + return; + } if (!data->count) return; @@ -1343,7 +1380,10 @@ traceoff_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_off(); + if (file) + tracer_tracing_off(file->tr); + else + tracing_off(); } static int From b61edd57740de5895f44f2ea417b164d9e1708bb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 18 Feb 2022 19:00:57 -0500 Subject: [PATCH 469/773] eprobes: Remove redundant event type information Currently, the event probes save the type of the event they are attached to when recording the event. For example: # echo 'e:switch sched/sched_switch prev_state=$prev_state prev_prio=$prev_prio next_pid=$next_pid next_prio=$next_prio' > dynamic_events # cat events/eprobes/switch/format name: switch ID: 1717 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned int __probe_type; offset:8; size:4; signed:0; field:u64 prev_state; offset:12; size:8; signed:0; field:u64 prev_prio; offset:20; size:8; signed:0; field:u64 next_pid; offset:28; size:8; signed:0; field:u64 next_prio; offset:36; size:8; signed:0; print fmt: "(%u) prev_state=0x%Lx prev_prio=0x%Lx next_pid=0x%Lx next_prio=0x%Lx", REC->__probe_type, REC->prev_state, REC->prev_prio, REC->next_pid, REC->next_prio The __probe_type adds 4 bytes to every event. One of the reasons for creating eprobes is to limit what is traced in an event to be able to limit what is written into the ring buffer. Having this redundant 4 bytes to every event takes away from this. The event that is recorded can be retrieved from the event probe itself, that is available when the trace is happening. For user space tools, it could simply read the dynamic_event file to find the event they are for. So there is really no reason to write this information into the ring buffer for every event. Link: https://lkml.kernel.org/r/20220218190057.2f5a19a8@gandalf.local.home Acked-by: Masami Hiramatsu Reviewed-by: Joel Fernandes Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 1 - kernel/trace/trace_eprobe.c | 16 +++++++--------- kernel/trace/trace_probe.c | 10 +++++----- kernel/trace/trace_probe.h | 1 - 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d038ddbf1bea..c5b09c31e077 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -136,7 +136,6 @@ struct kprobe_trace_entry_head { struct eprobe_trace_entry_head { struct trace_entry ent; - unsigned int type; }; struct kretprobe_trace_entry_head { diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 191db32dec46..541aa13581b9 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -242,7 +242,6 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) static int eprobe_event_define_fields(struct trace_event_call *event_call) { - int ret; struct eprobe_trace_entry_head field; struct trace_probe *tp; @@ -250,8 +249,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call) if (WARN_ON_ONCE(!tp)) return -ENOENT; - DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } @@ -270,7 +267,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags, struct trace_event_call *pevent; struct trace_event *probed_event; struct trace_seq *s = &iter->seq; + struct trace_eprobe *ep; struct trace_probe *tp; + unsigned int type; field = (struct eprobe_trace_entry_head *)iter->ent; tp = trace_probe_primary_from_call( @@ -278,15 +277,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags, if (WARN_ON_ONCE(!tp)) goto out; + ep = container_of(tp, struct trace_eprobe, tp); + type = ep->event->event.type; + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - probed_event = ftrace_find_event(field->type); + probed_event = ftrace_find_event(type); if (probed_event) { pevent = container_of(probed_event, struct trace_event_call, event); trace_seq_printf(s, "%s.%s", pevent->class->system, trace_event_name(pevent)); } else { - trace_seq_printf(s, "%u", field->type); + trace_seq_printf(s, "%u", type); } trace_seq_putc(s, ')'); @@ -498,10 +500,6 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); - if (edata->ep->event) - entry->type = edata->ep->event->event.type; - else - entry->type = 0; store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 73d90179b51b..80863c6508e5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -871,15 +871,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, switch (ptype) { case PROBE_PRINT_NORMAL: fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + arg = ", REC->" FIELD_STRING_IP; break; case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; case PROBE_PRINT_EVENT: - fmt = "(%u)"; - arg = "REC->" FIELD_STRING_TYPE; + fmt = ""; + arg = ""; break; default: WARN_ON_ONCE(1); @@ -903,7 +903,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, parg->type->fmt); } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg); for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 99e7a5df025e..92cc149af0fd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,7 +38,6 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" -#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ From bc82c38a6933aab308387d4aca47e0a05de7b553 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 11 Feb 2022 08:10:18 +0100 Subject: [PATCH 470/773] tracing: Uninline trace_trigger_soft_disabled() partly On a powerpc32 build with CONFIG_CC_OPTIMISE_FOR_SIZE, the inline keyword is not honored and trace_trigger_soft_disabled() appears approx 50 times in vmlinux. Adding -Winline to the build, the following message appears: ./include/linux/trace_events.h:712:1: error: inlining failed in call to 'trace_trigger_soft_disabled': call is unlikely and code size would grow [-Werror=inline] That function is rather big for an inlined function: c003df60 : c003df60: 94 21 ff f0 stwu r1,-16(r1) c003df64: 7c 08 02 a6 mflr r0 c003df68: 90 01 00 14 stw r0,20(r1) c003df6c: bf c1 00 08 stmw r30,8(r1) c003df70: 83 e3 00 24 lwz r31,36(r3) c003df74: 73 e9 01 00 andi. r9,r31,256 c003df78: 41 82 00 10 beq c003df88 c003df7c: 38 60 00 00 li r3,0 c003df80: 39 61 00 10 addi r11,r1,16 c003df84: 4b fd 60 ac b c0014030 <_rest32gpr_30_x> c003df88: 73 e9 00 80 andi. r9,r31,128 c003df8c: 7c 7e 1b 78 mr r30,r3 c003df90: 41 a2 00 14 beq c003dfa4 c003df94: 38 c0 00 00 li r6,0 c003df98: 38 a0 00 00 li r5,0 c003df9c: 38 80 00 00 li r4,0 c003dfa0: 48 05 c5 f1 bl c009a590 c003dfa4: 73 e9 00 40 andi. r9,r31,64 c003dfa8: 40 82 00 28 bne c003dfd0 c003dfac: 73 ff 02 00 andi. r31,r31,512 c003dfb0: 41 82 ff cc beq c003df7c c003dfb4: 80 01 00 14 lwz r0,20(r1) c003dfb8: 83 e1 00 0c lwz r31,12(r1) c003dfbc: 7f c3 f3 78 mr r3,r30 c003dfc0: 83 c1 00 08 lwz r30,8(r1) c003dfc4: 7c 08 03 a6 mtlr r0 c003dfc8: 38 21 00 10 addi r1,r1,16 c003dfcc: 48 05 6f 6c b c0094f38 c003dfd0: 38 60 00 01 li r3,1 c003dfd4: 4b ff ff ac b c003df80 However it is located in a hot path so inlining it is important. But forcing inlining of the entire function by using __always_inline leads to increasing the text size by approx 20 kbytes. Instead, split the fonction in two parts, one part with the likely fast path, flagged __always_inline, and a second part out of line. With this change, on a powerpc32 with CONFIG_CC_OPTIMISE_FOR_SIZE vmlinux text increases by only 1,4 kbytes, which is partly compensated by a decrease of vmlinux data by 7 kbytes. On ppc64_defconfig which has CONFIG_CC_OPTIMISE_FOR_SPEED, this change reduces vmlinux text by more than 30 kbytes. Link: https://lkml.kernel.org/r/69ce0986a52d026d381d612801d978aa4f977460.1644563295.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 22 ++++++++++++---------- kernel/trace/trace_events_trigger.c | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70c069aef02c..dcea51fb60e2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -699,6 +699,8 @@ event_triggers_post_call(struct trace_event_file *file, bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); +bool __trace_trigger_soft_disabled(struct trace_event_file *file); + /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test @@ -708,20 +710,20 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); * triggers that require testing the fields, it will return true, * otherwise false. */ -static inline bool +static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { - if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL, NULL); - if (eflags & EVENT_FILE_FL_SOFT_DISABLED) - return true; - if (eflags & EVENT_FILE_FL_PID_FILTER) - return trace_event_ignore_this_pid(file); - } - return false; + if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | + EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) + return false; + + return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index efe563140f27..7eb9d04f1c2e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -84,6 +84,20 @@ event_triggers_call(struct trace_event_file *file, } EXPORT_SYMBOL_GPL(event_triggers_call); +bool __trace_trigger_soft_disabled(struct trace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) + event_triggers_call(file, NULL, NULL, NULL); + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) + return true; + if (eflags & EVENT_FILE_FL_PID_FILTER) + return trace_event_ignore_this_pid(file); + return false; +} +EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled); + /** * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event From 7acf3a127bb7c65ff39099afd78960e77b2ca5de Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 14 Feb 2022 14:44:56 +0100 Subject: [PATCH 471/773] tracing: Ensure trace buffer is at least 4096 bytes large Booting the kernel with 'trace_buf_size=1' give a warning at boot during the ftrace selftests: [ 0.892809] Running postponed tracer tests: [ 0.892893] Testing tracer function: [ 0.901899] Callback from call_rcu_tasks_trace() invoked. [ 0.983829] Callback from call_rcu_tasks_rude() invoked. [ 1.072003] .. bad ring buffer .. corrupted trace buffer .. [ 1.091944] Callback from call_rcu_tasks() invoked. [ 1.097695] PASSED [ 1.097701] Testing dynamic ftrace: .. filter failed count=0 ..FAILED! [ 1.353474] ------------[ cut here ]------------ [ 1.353478] WARNING: CPU: 0 PID: 1 at kernel/trace/trace.c:1951 run_tracer_selftest+0x13c/0x1b0 Therefore enforce a minimum of 4096 bytes to make the selftest pass. Link: https://lkml.kernel.org/r/20220214134456.1751749-1-svens@linux.ibm.com Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7c2578efde26..3050892d1812 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1474,10 +1474,12 @@ static int __init set_buf_size(char *str) if (!str) return 0; buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); From ab2f993c01f261aa3eeb8842842ff38bff7806b6 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 14 Feb 2022 12:28:47 -0700 Subject: [PATCH 472/773] ftrace: Remove unused ftrace_startup_enable() stub When building with clang + CONFIG_DYNAMIC_FTRACE=n + W=1, there is a warning: kernel/trace/ftrace.c:7194:20: error: unused function 'ftrace_startup_enable' [-Werror,-Wunused-function] static inline void ftrace_startup_enable(int command) { } ^ 1 error generated. Clang warns on instances of static inline functions in .c files with W=1 after commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). The ftrace_startup_enable() stub has been unused since commit e1effa0144a1 ("ftrace: Annotate the ops operation on update"), where its use outside of the CONFIG_DYNAMIC_TRACE section was replaced by ftrace_startup_all(). Remove it to resolve the warning. Link: https://lkml.kernel.org/r/20220214192847.488166-1-nathan@kernel.org Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9feb197b2da..a4b462b6f944 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7191,7 +7191,6 @@ static int __init ftrace_nodyn_init(void) core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } -static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } # define ftrace_startup_sysctl() do { } while (0) From dd990352f01ee9a6c6eee152e5d11c021caccfe4 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 18 Feb 2022 16:17:38 +0100 Subject: [PATCH 473/773] tracing/osnoise: Make osnoise_main to sleep for microseconds osnoise's runtime and period are in the microseconds scale, but it is currently sleeping in the millisecond's scale. This behavior roots in the usage of hwlat as the skeleton for osnoise. Make osnoise to sleep in the microseconds scale. Also, move the sleep to a specialized function. Link: https://lkml.kernel.org/r/302aa6c7bdf2d131719b22901905e9da122a11b2.1645197336.git.bristot@kernel.org Cc: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 53 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 870a08da5b48..cfddb30e65ab 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1436,6 +1436,37 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +/* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(void) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + /* * osnoise_main - The osnoise detection kernel thread * @@ -1444,30 +1475,10 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - u64 interval; while (!kthread_should_stop()) { - run_osnoise(); - - mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; - mutex_unlock(&interface_lock); - - do_div(interval, USEC_PER_MSEC); - - /* - * differently from hwlat_detector, the osnoise tracer can run - * without a pause because preemption is on. - */ - if (interval < 1) { - /* Let synchronize_rcu_tasks() make progress */ - cond_resched_tasks_rcu_qs(); - continue; - } - - if (msleep_interruptible(interval)) - break; + osnoise_sleep(); } return 0; From fc2e6b3b132a907378f6af08356b105a4139c4fb Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:35:49 +0100 Subject: [PATCH 474/773] iavf: Rework mutexes for better synchronisation The driver used to crash in multiple spots when put to stress testing of the init, reset and remove paths. The user would experience call traces or hangs when creating, resetting, removing VFs. Depending on the machines, the call traces are happening in random spots, like reset restoring resources racing with driver remove. Make adapter->crit_lock mutex a mandatory lock for guarding the operations performed on all workqueues and functions dealing with resource allocation and disposal. Make __IAVF_REMOVE a final state of the driver respected by workqueues that shall not requeue, when they fail to obtain the crit_lock. Make the IRQ handler not to queue the new work for adminq_task when the __IAVF_REMOVE state is set. Fixes: 5ac49f3c2702 ("iavf: use mutexes for locking of critical sections") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 - drivers/net/ethernet/intel/iavf/iavf_main.c | 66 +++++++++++---------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 59806d1f7e79..44f83e06486d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -246,7 +246,6 @@ struct iavf_adapter { struct list_head mac_filter_list; struct mutex crit_lock; struct mutex client_lock; - struct mutex remove_lock; /* Lock to protect accesses to MAC and VLAN lists */ spinlock_t mac_vlan_list_lock; char misc_vector_name[IFNAMSIZ + 9]; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 8125b9120615..84ae96e912d7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -302,8 +302,9 @@ static irqreturn_t iavf_msix_aq(int irq, void *data) rd32(hw, IAVF_VFINT_ICR01); rd32(hw, IAVF_VFINT_ICR0_ENA1); - /* schedule work on the private workqueue */ - queue_work(iavf_wq, &adapter->adminq_task); + if (adapter->state != __IAVF_REMOVE) + /* schedule work on the private workqueue */ + queue_work(iavf_wq, &adapter->adminq_task); return IRQ_HANDLED; } @@ -2374,8 +2375,12 @@ static void iavf_watchdog_task(struct work_struct *work) struct iavf_hw *hw = &adapter->hw; u32 reg_val; - if (!mutex_trylock(&adapter->crit_lock)) + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state == __IAVF_REMOVE) + return; + goto restart_watchdog; + } if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); @@ -2601,13 +2606,13 @@ static void iavf_reset_task(struct work_struct *work) /* When device is being removed it doesn't make sense to run the reset * task, just return in such a case. */ - if (mutex_is_locked(&adapter->remove_lock)) - return; + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state != __IAVF_REMOVE) + queue_work(iavf_wq, &adapter->reset_task); - if (iavf_lock_timeout(&adapter->crit_lock, 200)) { - schedule_work(&adapter->reset_task); return; } + while (!mutex_trylock(&adapter->client_lock)) usleep_range(500, 1000); if (CLIENT_ENABLED(adapter)) { @@ -2826,13 +2831,19 @@ static void iavf_adminq_task(struct work_struct *work) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) goto out; + if (!mutex_trylock(&adapter->crit_lock)) { + if (adapter->state == __IAVF_REMOVE) + return; + + queue_work(iavf_wq, &adapter->adminq_task); + goto out; + } + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); if (!event.msg_buf) goto out; - if (iavf_lock_timeout(&adapter->crit_lock, 200)) - goto freedom; do { ret = iavf_clean_arq_element(hw, &event, &pending); v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high); @@ -3800,11 +3811,12 @@ static int iavf_close(struct net_device *netdev) struct iavf_adapter *adapter = netdev_priv(netdev); int status; - if (adapter->state <= __IAVF_DOWN_PENDING) - return 0; + mutex_lock(&adapter->crit_lock); - while (!mutex_trylock(&adapter->crit_lock)) - usleep_range(500, 1000); + if (adapter->state <= __IAVF_DOWN_PENDING) { + mutex_unlock(&adapter->crit_lock); + return 0; + } set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); if (CLIENT_ENABLED(adapter)) @@ -4431,7 +4443,6 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ mutex_init(&adapter->crit_lock); mutex_init(&adapter->client_lock); - mutex_init(&adapter->remove_lock); mutex_init(&hw->aq.asq_mutex); mutex_init(&hw->aq.arq_mutex); @@ -4556,11 +4567,7 @@ static void iavf_remove(struct pci_dev *pdev) struct iavf_cloud_filter *cf, *cftmp; struct iavf_hw *hw = &adapter->hw; int err; - /* Indicate we are in remove and not to run reset_task */ - mutex_lock(&adapter->remove_lock); - cancel_work_sync(&adapter->reset_task); - cancel_delayed_work_sync(&adapter->watchdog_task); - cancel_delayed_work_sync(&adapter->client_task); + if (adapter->netdev_registered) { unregister_netdev(netdev); adapter->netdev_registered = false; @@ -4572,6 +4579,10 @@ static void iavf_remove(struct pci_dev *pdev) err); } + mutex_lock(&adapter->crit_lock); + dev_info(&adapter->pdev->dev, "Remove device\n"); + iavf_change_state(adapter, __IAVF_REMOVE); + iavf_request_reset(adapter); msleep(50); /* If the FW isn't responding, kick it once, but only once. */ @@ -4579,18 +4590,19 @@ static void iavf_remove(struct pci_dev *pdev) iavf_request_reset(adapter); msleep(50); } - if (iavf_lock_timeout(&adapter->crit_lock, 5000)) - dev_warn(&adapter->pdev->dev, "failed to acquire crit_lock in %s\n", __FUNCTION__); - dev_info(&adapter->pdev->dev, "Removing device\n"); + iavf_misc_irq_disable(adapter); /* Shut down all the garbage mashers on the detention level */ - iavf_change_state(adapter, __IAVF_REMOVE); + cancel_work_sync(&adapter->reset_task); + cancel_delayed_work_sync(&adapter->watchdog_task); + cancel_work_sync(&adapter->adminq_task); + cancel_delayed_work_sync(&adapter->client_task); + adapter->aq_required = 0; adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; iavf_free_all_tx_resources(adapter); iavf_free_all_rx_resources(adapter); - iavf_misc_irq_disable(adapter); iavf_free_misc_irq(adapter); /* In case we enter iavf_remove from erroneous state, free traffic irqs @@ -4606,10 +4618,6 @@ static void iavf_remove(struct pci_dev *pdev) iavf_reset_interrupt_capability(adapter); iavf_free_q_vectors(adapter); - cancel_delayed_work_sync(&adapter->watchdog_task); - - cancel_work_sync(&adapter->adminq_task); - iavf_free_rss(adapter); if (hw->aq.asq.count) @@ -4621,8 +4629,6 @@ static void iavf_remove(struct pci_dev *pdev) mutex_destroy(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); mutex_destroy(&adapter->crit_lock); - mutex_unlock(&adapter->remove_lock); - mutex_destroy(&adapter->remove_lock); iounmap(hw->hw_addr); pci_release_regions(pdev); From 974578017fc1fdd06cea8afb9dfa32602e8529ed Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:36:56 +0100 Subject: [PATCH 475/773] iavf: Add waiting so the port is initialized in remove There exist races when port is being configured and remove is triggered. unregister_netdev is not and can't be called under crit_lock mutex since it is calling ndo_stop -> iavf_close which requires this lock. Depending on init state the netdev could be still unregistered so unregister_netdev never cleans up, when shortly after that the device could become registered. Make iavf_remove wait until port finishes initialization. All critical state changes are atomic (under crit_lock). Crashes that come from iavf_reset_interrupt_capability and iavf_free_traffic_irqs should now be solved in a graceful manner. Fixes: 605ca7c5c6707 ("iavf: Fix kernel BUG in free_msi_irqs") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 27 ++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 84ae96e912d7..5e71b38e9154 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -4558,7 +4558,6 @@ static int __maybe_unused iavf_resume(struct device *dev_d) static void iavf_remove(struct pci_dev *pdev) { struct iavf_adapter *adapter = iavf_pdev_to_adapter(pdev); - enum iavf_state_t prev_state = adapter->last_state; struct net_device *netdev = adapter->netdev; struct iavf_fdir_fltr *fdir, *fdirtmp; struct iavf_vlan_filter *vlf, *vlftmp; @@ -4568,6 +4567,22 @@ static void iavf_remove(struct pci_dev *pdev) struct iavf_hw *hw = &adapter->hw; int err; + /* Wait until port initialization is complete. + * There are flows where register/unregister netdev may race. + */ + while (1) { + mutex_lock(&adapter->crit_lock); + if (adapter->state == __IAVF_RUNNING || + adapter->state == __IAVF_DOWN) { + mutex_unlock(&adapter->crit_lock); + break; + } + + mutex_unlock(&adapter->crit_lock); + usleep_range(500, 1000); + } + cancel_delayed_work_sync(&adapter->watchdog_task); + if (adapter->netdev_registered) { unregister_netdev(netdev); adapter->netdev_registered = false; @@ -4605,16 +4620,6 @@ static void iavf_remove(struct pci_dev *pdev) iavf_free_all_rx_resources(adapter); iavf_free_misc_irq(adapter); - /* In case we enter iavf_remove from erroneous state, free traffic irqs - * here, so as to not cause a kernel crash, when calling - * iavf_reset_interrupt_capability. - */ - if ((adapter->last_state == __IAVF_RESETTING && - prev_state != __IAVF_DOWN) || - (adapter->last_state == __IAVF_RUNNING && - !(netdev->flags & IFF_UP))) - iavf_free_traffic_irqs(adapter); - iavf_reset_interrupt_capability(adapter); iavf_free_q_vectors(adapter); From 3ccd54ef44ebfa0792c5441b6d9c86618f3378d1 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:37:10 +0100 Subject: [PATCH 476/773] iavf: Fix init state closure on remove When init states of the adapter work, the errors like lack of communication with the PF might hop in. If such events occur the driver restores previous states in order to retry initialization in a proper way. When remove task kicks in, this situation could lead to races with unregistering the netdevice as well as resources cleanup. With the commit introducing the waiting in remove for init to complete, this problem turns into an endless waiting if init never recovers from errors. Introduce __IAVF_IN_REMOVE_TASK bit to indicate that the remove thread has started. Make __IAVF_COMM_FAILED adapter state respect the __IAVF_IN_REMOVE_TASK bit and set the __IAVF_INIT_FAILED state and return without any action instead of trying to recover. Make __IAVF_INIT_FAILED adapter state respect the __IAVF_IN_REMOVE_TASK bit and return without any further actions. Make the loop in the remove handler break when adapter has __IAVF_INIT_FAILED state set. Fixes: 898ef1cb1cb2 ("iavf: Combine init and watchdog state machines") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 4 ++++ drivers/net/ethernet/intel/iavf/iavf_main.c | 24 ++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 44f83e06486d..f259fd517b2c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -201,6 +201,10 @@ enum iavf_state_t { __IAVF_RUNNING, /* opened, working */ }; +enum iavf_critical_section_t { + __IAVF_IN_REMOVE_TASK, /* device being removed */ +}; + #define IAVF_CLOUD_FIELD_OMAC 0x01 #define IAVF_CLOUD_FIELD_IMAC 0x02 #define IAVF_CLOUD_FIELD_IVLAN 0x04 diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 5e71b38e9154..be51da978e7c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2424,6 +2424,15 @@ static void iavf_watchdog_task(struct work_struct *work) msecs_to_jiffies(1)); return; case __IAVF_INIT_FAILED: + if (test_bit(__IAVF_IN_REMOVE_TASK, + &adapter->crit_section)) { + /* Do not update the state and do not reschedule + * watchdog task, iavf_remove should handle this state + * as it can loop forever + */ + mutex_unlock(&adapter->crit_lock); + return; + } if (++adapter->aq_wait_count > IAVF_AQ_MAX_ERR) { dev_err(&adapter->pdev->dev, "Failed to communicate with PF; waiting before retry\n"); @@ -2440,6 +2449,17 @@ static void iavf_watchdog_task(struct work_struct *work) queue_delayed_work(iavf_wq, &adapter->watchdog_task, HZ); return; case __IAVF_COMM_FAILED: + if (test_bit(__IAVF_IN_REMOVE_TASK, + &adapter->crit_section)) { + /* Set state to __IAVF_INIT_FAILED and perform remove + * steps. Remove IAVF_FLAG_PF_COMMS_FAILED so the task + * doesn't bring the state back to __IAVF_COMM_FAILED. + */ + iavf_change_state(adapter, __IAVF_INIT_FAILED); + adapter->flags &= ~IAVF_FLAG_PF_COMMS_FAILED; + mutex_unlock(&adapter->crit_lock); + return; + } reg_val = rd32(hw, IAVF_VFGEN_RSTAT) & IAVF_VFGEN_RSTAT_VFR_STATE_MASK; if (reg_val == VIRTCHNL_VFR_VFACTIVE || @@ -4567,13 +4587,15 @@ static void iavf_remove(struct pci_dev *pdev) struct iavf_hw *hw = &adapter->hw; int err; + set_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section); /* Wait until port initialization is complete. * There are flows where register/unregister netdev may race. */ while (1) { mutex_lock(&adapter->crit_lock); if (adapter->state == __IAVF_RUNNING || - adapter->state == __IAVF_DOWN) { + adapter->state == __IAVF_DOWN || + adapter->state == __IAVF_INIT_FAILED) { mutex_unlock(&adapter->crit_lock); break; } From 0579fafd37fb7efe091f0e6c8ccf968864f40f3e Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:37:50 +0100 Subject: [PATCH 477/773] iavf: Fix locking for VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS iavf_virtchnl_completion is called under crit_lock but when the code for VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS is called, this lock is released in order to obtain rtnl_lock to avoid ABBA deadlock with unregister_netdev. Along with the new way iavf_remove behaves, there exist many risks related to the lock release and attmepts to regrab it. The driver faces crashes related to races between unregister_netdev and netdev_update_features. Yet another risk is that the driver could already obtain the crit_lock in order to destroy it and iavf_virtchnl_completion could crash or block forever. Make iavf_virtchnl_completion never relock crit_lock in it's call paths. Extract rtnl_lock locking logic to the driver for unregister_netdev in order to set the netdev_registered flag inside the lock. Introduce a new flag that will inform adminq_task to perform the code from VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS right after it finishes processing messages. Guard this code with remove flags so it's never called when the driver is in remove state. Fixes: 5951a2b9812d ("iavf: Fix VLAN feature flags after VFR") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 22 ++++++++++++++++- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 24 +------------------ 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index f259fd517b2c..89423947ee65 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -287,6 +287,7 @@ struct iavf_adapter { #define IAVF_FLAG_LEGACY_RX BIT(15) #define IAVF_FLAG_REINIT_ITR_NEEDED BIT(16) #define IAVF_FLAG_QUEUES_DISABLED BIT(17) +#define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) /* duplicates for common code */ #define IAVF_FLAG_DCB_ENABLED 0 /* flags for admin queue service task */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index be51da978e7c..67349d24dc90 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2879,6 +2879,24 @@ static void iavf_adminq_task(struct work_struct *work) } while (pending); mutex_unlock(&adapter->crit_lock); + if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES)) { + if (adapter->netdev_registered || + !test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) { + struct net_device *netdev = adapter->netdev; + + rtnl_lock(); + netdev_update_features(netdev); + rtnl_unlock(); + /* Request VLAN offload settings */ + if (VLAN_V2_ALLOWED(adapter)) + iavf_set_vlan_offload_features + (adapter, 0, netdev->features); + + iavf_set_queue_vlan_tag_loc(adapter); + } + + adapter->flags &= ~IAVF_FLAG_SETUP_NETDEV_FEATURES; + } if ((adapter->flags & (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED)) || adapter->state == __IAVF_RESETTING) @@ -4606,8 +4624,10 @@ static void iavf_remove(struct pci_dev *pdev) cancel_delayed_work_sync(&adapter->watchdog_task); if (adapter->netdev_registered) { - unregister_netdev(netdev); + rtnl_lock(); + unregister_netdevice(netdev); adapter->netdev_registered = false; + rtnl_unlock(); } if (CLIENT_ALLOWED(adapter)) { err = iavf_lan_del_device(adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 5ee1d118fd30..88844d68e150 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -2146,29 +2146,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, sizeof(adapter->vlan_v2_caps))); iavf_process_config(adapter); - - /* unlock crit_lock before acquiring rtnl_lock as other - * processes holding rtnl_lock could be waiting for the same - * crit_lock - */ - mutex_unlock(&adapter->crit_lock); - /* VLAN capabilities can change during VFR, so make sure to - * update the netdev features with the new capabilities - */ - rtnl_lock(); - netdev_update_features(netdev); - rtnl_unlock(); - if (iavf_lock_timeout(&adapter->crit_lock, 10000)) - dev_warn(&adapter->pdev->dev, "failed to acquire crit_lock in %s\n", - __FUNCTION__); - - /* Request VLAN offload settings */ - if (VLAN_V2_ALLOWED(adapter)) - iavf_set_vlan_offload_features(adapter, 0, - netdev->features); - - iavf_set_queue_vlan_tag_loc(adapter); - + adapter->flags |= IAVF_FLAG_SETUP_NETDEV_FEATURES; } break; case VIRTCHNL_OP_ENABLE_QUEUES: From a472eb5cbaebb5774672c565e024336c039e9128 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:01 +0100 Subject: [PATCH 478/773] iavf: Fix race in init state When iavf_init_version_check sends VIRTCHNL_OP_GET_VF_RESOURCES message, the driver will wait for the response after requeueing the watchdog task in iavf_init_get_resources call stack. The logic is implemented this way that iavf_init_get_resources has to be called in order to allocate adapter->vf_res. It is polling for the AQ response in iavf_get_vf_config function. Expect a call trace from kernel when adminq_task worker handles this message first. adapter->vf_res will be NULL in iavf_virtchnl_completion. Make the watchdog task not queue the adminq_task if the init process is not finished yet. Fixes: 898ef1cb1cb2 ("iavf: Combine init and watchdog state machines") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 67349d24dc90..36433d6504b7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2532,7 +2532,8 @@ static void iavf_watchdog_task(struct work_struct *work) schedule_delayed_work(&adapter->client_task, msecs_to_jiffies(5)); mutex_unlock(&adapter->crit_lock); restart_watchdog: - queue_work(iavf_wq, &adapter->adminq_task); + if (adapter->state >= __IAVF_DOWN) + queue_work(iavf_wq, &adapter->adminq_task); if (adapter->aq_required) queue_delayed_work(iavf_wq, &adapter->watchdog_task, msecs_to_jiffies(20)); From e85ff9c631e1bf109ce8428848dfc8e8b0041f48 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:31 +0100 Subject: [PATCH 479/773] iavf: Fix deadlock in iavf_reset_task There exists a missing mutex_unlock call on crit_lock in iavf_reset_task call path. Unlock the crit_lock before returning from reset task. Fixes: 5ac49f3c2702 ("iavf: use mutexes for locking of critical sections") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 36433d6504b7..da50ea3e44c2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2688,6 +2688,7 @@ static void iavf_reset_task(struct work_struct *work) reg_val); iavf_disable_vf(adapter); mutex_unlock(&adapter->client_lock); + mutex_unlock(&adapter->crit_lock); return; /* Do not attempt to reinit. It's dead, Jim. */ } From d2c0f45fcceb0995f208c441d9c9a453623f9ccf Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:43 +0100 Subject: [PATCH 480/773] iavf: Fix missing check for running netdev The driver was queueing reset_task regardless of the netdev state. Do not queue the reset task in iavf_change_mtu if netdev is not running. Fixes: fdd4044ffdc8 ("iavf: Remove timer for work triggering, use delaying work instead") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index da50ea3e44c2..be97ac251802 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3905,8 +3905,11 @@ static int iavf_change_mtu(struct net_device *netdev, int new_mtu) iavf_notify_client_l2_params(&adapter->vsi); adapter->flags |= IAVF_FLAG_SERVICE_CLIENT_REQUESTED; } - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(iavf_wq, &adapter->reset_task); + + if (netif_running(netdev)) { + adapter->flags |= IAVF_FLAG_RESET_NEEDED; + queue_work(iavf_wq, &adapter->reset_task); + } return 0; } From 14756b2ae265d526b8356e86729090b01778fdf6 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 23 Feb 2022 13:38:55 +0100 Subject: [PATCH 481/773] iavf: Fix __IAVF_RESETTING state usage The setup of __IAVF_RESETTING state in watchdog task had no effect and could lead to slow resets in the driver as the task for __IAVF_RESETTING state only requeues watchdog. Till now the __IAVF_RESETTING was interpreted by reset task as running state which could lead to errors with allocating and resources disposal. Make watchdog_task queue the reset task when it's necessary. Do not update the state to __IAVF_RESETTING so the reset task knows exactly what is the current state of the adapter. Fixes: 898ef1cb1cb2 ("iavf: Combine init and watchdog state machines") Signed-off-by: Slawomir Laba Signed-off-by: Phani Burra Signed-off-by: Jacob Keller Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index be97ac251802..dcf24264c7ea 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1137,8 +1137,7 @@ void iavf_down(struct iavf_adapter *adapter) rss->state = IAVF_ADV_RSS_DEL_REQUEST; spin_unlock_bh(&adapter->adv_rss_lock); - if (!(adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) && - adapter->state != __IAVF_RESETTING) { + if (!(adapter->flags & IAVF_FLAG_PF_COMMS_FAILED)) { /* cancel any current operation */ adapter->current_op = VIRTCHNL_OP_UNKNOWN; /* Schedule operations to close down the HW. Don't wait @@ -2385,11 +2384,12 @@ static void iavf_watchdog_task(struct work_struct *work) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); - if (adapter->flags & IAVF_FLAG_RESET_NEEDED && - adapter->state != __IAVF_RESETTING) { - iavf_change_state(adapter, __IAVF_RESETTING); + if (adapter->flags & IAVF_FLAG_RESET_NEEDED) { adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; + mutex_unlock(&adapter->crit_lock); + queue_work(iavf_wq, &adapter->reset_task); + return; } switch (adapter->state) { @@ -2697,8 +2697,7 @@ continue_reset: * ndo_open() returning, so we can't assume it means all our open * tasks have finished, since we're not holding the rtnl_lock here. */ - running = ((adapter->state == __IAVF_RUNNING) || - (adapter->state == __IAVF_RESETTING)); + running = adapter->state == __IAVF_RUNNING; if (running) { netdev->flags &= ~IFF_UP; From 22e9f71072fa605cbf033158db58e0790101928d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 23 Feb 2022 11:23:57 -0400 Subject: [PATCH 482/773] RDMA/cma: Do not change route.addr.src_addr outside state checks If the state is not idle then resolve_prepare_src() should immediately fail and no change to global state should happen. However, it unconditionally overwrites the src_addr trying to build a temporary any address. For instance if the state is already RDMA_CM_LISTEN then this will corrupt the src_addr and would cause the test in cma_cancel_operation(): if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) Which would manifest as this trace from syzkaller: BUG: KASAN: use-after-free in __list_add_valid+0x93/0xa0 lib/list_debug.c:26 Read of size 8 at addr ffff8881546491e0 by task syz-executor.1/32204 CPU: 1 PID: 32204 Comm: syz-executor.1 Not tainted 5.12.0-rc8-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:232 __kasan_report mm/kasan/report.c:399 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416 __list_add_valid+0x93/0xa0 lib/list_debug.c:26 __list_add include/linux/list.h:67 [inline] list_add_tail include/linux/list.h:100 [inline] cma_listen_on_all drivers/infiniband/core/cma.c:2557 [inline] rdma_listen+0x787/0xe00 drivers/infiniband/core/cma.c:3751 ucma_listen+0x16a/0x210 drivers/infiniband/core/ucma.c:1102 ucma_write+0x259/0x350 drivers/infiniband/core/ucma.c:1732 vfs_write+0x28e/0xa30 fs/read_write.c:603 ksys_write+0x1ee/0x250 fs/read_write.c:658 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae This is indicating that an rdma_id_private was destroyed without doing cma_cancel_listens(). Instead of trying to re-use the src_addr memory to indirectly create an any address derived from the dst build one explicitly on the stack and bind to that as any other normal flow would do. rdma_bind_addr() will copy it over the src_addr once it knows the state is valid. This is similar to commit bc0bdc5afaa7 ("RDMA/cma: Do not change route.addr.src_addr.ss_family") Link: https://lore.kernel.org/r/0-v2-e975c8fd9ef2+11e-syz_cma_srcaddr_jgg@nvidia.com Cc: stable@vger.kernel.org Fixes: 732d41c545bb ("RDMA/cma: Make the locking for automatic state transition more clear") Reported-by: syzbot+c94a3675a626f6333d74@syzkaller.appspotmail.com Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 38 +++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c447526288f4..50c53409ceb6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3370,22 +3370,30 @@ err: static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { - if (!src_addr || !src_addr->sa_family) { - src_addr = (struct sockaddr *) &id->route.addr.src_addr; - src_addr->sa_family = dst_addr->sa_family; - if (IS_ENABLED(CONFIG_IPV6) && - dst_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; - struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; - src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; - if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; - } else if (dst_addr->sa_family == AF_IB) { - ((struct sockaddr_ib *) src_addr)->sib_pkey = - ((struct sockaddr_ib *) dst_addr)->sib_pkey; - } + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr(id, src_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; } - return rdma_bind_addr(id, src_addr); + return rdma_bind_addr(id, (struct sockaddr *)&zero_sock); } /* From 851e99ebeec3f4a672bb5010cf1ece095acee447 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 25 Feb 2022 15:34:26 -0500 Subject: [PATCH 483/773] tracefs: Set the group ownership in apply_options() not parse_options() Al Viro brought it to my attention that the dentries may not be filled when the parse_options() is called, causing the call to set_gid() to possibly crash. It should only be called if parse_options() succeeds totally anyway. He suggested the logical place to do the update is in apply_options(). Link: https://lore.kernel.org/all/20220225165219.737025658@goodmis.org/ Link: https://lkml.kernel.org/r/20220225153426.1c4cab6b@gandalf.local.home Cc: stable@vger.kernel.org Acked-by: Al Viro Reported-by: Al Viro Fixes: 48b27b6b5191 ("tracefs: Set all files to the same group ownership as the mount option") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index bafc02bf8220..de7252715b12 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; - set_gid(tracefs_mount->mnt_root, gid); break; case Opt_mode: if (match_octal(&args[0], &option)) @@ -291,7 +290,9 @@ static int tracefs_apply_options(struct super_block *sb) inode->i_mode |= opts->mode; inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + + /* Set all the group ids to the mount option */ + set_gid(sb->s_root, opts->gid); return 0; } From c5229a0bd47814770c895e94fbc97ad21819abfe Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:06 +0000 Subject: [PATCH 484/773] tracing: Fix selftest config check for function graph start up test CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is required to test direct tramp. Link: https://lkml.kernel.org/r/bdc7e594e13b0891c1d61bc8d56c94b1890eaed7.1640017960.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index afd937a46496..abcadbe933bb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,9 +784,7 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; -#if defined(CONFIG_DYNAMIC_FTRACE) && \ - defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) -#define TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS noinline __noclone static void trace_direct_tramp(void) { } #endif @@ -849,7 +847,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); From dd48f316a1216fa10f9ba26852457794417d9bc6 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 18 Feb 2022 18:57:07 +0100 Subject: [PATCH 485/773] rtla/hist: Make -E the short version of --entries Currently, --entries uses -e as the short version in the hist mode of timerlat and osnoise tools. But as -e is already used to enable events on trace sessions by other tools, thus let's keep it available for the same usage for all rtla tools. Make -E the short version of --entries for hist mode on all tools. Note: rtla was merged in this merge window, so rtla was not released yet. Link: https://lkml.kernel.org/r/5dbf0cbe7364d3a05e708926b41a097c59a02b1e.1645206561.git.bristot@kernel.org Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- Documentation/tools/rtla/common_hist_options.rst | 2 +- Documentation/tools/rtla/rtla-osnoise-hist.rst | 2 +- tools/tracing/rtla/src/osnoise_hist.c | 10 +++++----- tools/tracing/rtla/src/timerlat_hist.c | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/tools/rtla/common_hist_options.rst b/Documentation/tools/rtla/common_hist_options.rst index 0266cd08a6c9..df53ff835bfb 100644 --- a/Documentation/tools/rtla/common_hist_options.rst +++ b/Documentation/tools/rtla/common_hist_options.rst @@ -2,7 +2,7 @@ Set the histogram bucket size (default *1*). -**-e**, **--entries** *N* +**-E**, **--entries** *N* Set the number of entries of the histogram (default 256). diff --git a/Documentation/tools/rtla/rtla-osnoise-hist.rst b/Documentation/tools/rtla/rtla-osnoise-hist.rst index 52298ddd8701..f2e79d22c4c4 100644 --- a/Documentation/tools/rtla/rtla-osnoise-hist.rst +++ b/Documentation/tools/rtla/rtla-osnoise-hist.rst @@ -36,7 +36,7 @@ default). The reason for reducing the runtime is to avoid starving the **rtla** tool. The tool is also set to run for *one minute*. The output histogram is set to group outputs in buckets of *10us* and *25* entries:: - [root@f34 ~/]# rtla osnoise hist -P F:1 -c 0-11 -r 900000 -d 1M -b 10 -e 25 + [root@f34 ~/]# rtla osnoise hist -P F:1 -c 0-11 -r 900000 -d 1M -b 10 -E 25 # RTLA osnoise histogram # Time unit is microseconds (us) # Duration: 0 00:01:00 diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index 1f0b7fce55cf..52c053cc1789 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -426,7 +426,7 @@ static void osnoise_hist_usage(char *usage) static const char * const msg[] = { "", " usage: rtla osnoise hist [-h] [-D] [-d s] [-p us] [-r us] [-s us] [-S us] [-t[=file]] \\", - " [-c cpu-list] [-P priority] [-b N] [-e N] [--no-header] [--no-summary] \\", + " [-c cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros]", "", " -h/--help: print this menu", @@ -439,7 +439,7 @@ static void osnoise_hist_usage(char *usage) " -D/--debug: print debug info", " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", " -b/--bucket-size N: set the histogram bucket size (default 1)", - " -e/--entries N: set the number of entries of the histogram (default 256)", + " -E/--entries N: set the number of entries of the histogram (default 256)", " --no-header: do not print header", " --no-summary: do not print summary", " --no-index: do not print index", @@ -486,7 +486,7 @@ static struct osnoise_hist_params while (1) { static struct option long_options[] = { {"bucket-size", required_argument, 0, 'b'}, - {"entries", required_argument, 0, 'e'}, + {"entries", required_argument, 0, 'E'}, {"cpus", required_argument, 0, 'c'}, {"debug", no_argument, 0, 'D'}, {"duration", required_argument, 0, 'd'}, @@ -507,7 +507,7 @@ static struct osnoise_hist_params /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "c:b:d:e:Dhp:P:r:s:S:t::0123", + c = getopt_long(argc, argv, "c:b:d:E:Dhp:P:r:s:S:t::0123", long_options, &option_index); /* detect the end of the options. */ @@ -534,7 +534,7 @@ static struct osnoise_hist_params if (!params->duration) osnoise_hist_usage("Invalid -D duration\n"); break; - case 'e': + case 'E': params->entries = get_llong_from_str(optarg); if ((params->entries < 10) || (params->entries > 9999999)) osnoise_hist_usage("Entries must be > 10 and < 9999999\n"); diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 436a799f9adf..237e1735afa7 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -429,7 +429,7 @@ static void timerlat_hist_usage(char *usage) char *msg[] = { "", " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] [-t[=file]] \\", - " [-c cpu-list] [-P priority] [-e N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", + " [-c cpu-list] [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros]", "", " -h/--help: print this menu", @@ -443,7 +443,7 @@ static void timerlat_hist_usage(char *usage) " -T/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", " -n/--nano: display data in nanoseconds", " -b/--bucket-size N: set the histogram bucket size (default 1)", - " -e/--entries N: set the number of entries of the histogram (default 256)", + " -E/--entries N: set the number of entries of the histogram (default 256)", " --no-irq: ignore IRQ latencies", " --no-thread: ignore thread latencies", " --no-header: do not print header", @@ -494,7 +494,7 @@ static struct timerlat_hist_params {"cpus", required_argument, 0, 'c'}, {"bucket-size", required_argument, 0, 'b'}, {"debug", no_argument, 0, 'D'}, - {"entries", required_argument, 0, 'e'}, + {"entries", required_argument, 0, 'E'}, {"duration", required_argument, 0, 'd'}, {"help", no_argument, 0, 'h'}, {"irq", required_argument, 0, 'i'}, @@ -516,7 +516,7 @@ static struct timerlat_hist_params /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "c:b:d:e:Dhi:np:P:s:t::T:012345", + c = getopt_long(argc, argv, "c:b:d:E:Dhi:np:P:s:t::T:012345", long_options, &option_index); /* detect the end of the options. */ @@ -543,7 +543,7 @@ static struct timerlat_hist_params if (!params->duration) timerlat_hist_usage("Invalid -D duration\n"); break; - case 'e': + case 'E': params->entries = get_llong_from_str(optarg); if ((params->entries < 10) || (params->entries > 9999999)) timerlat_hist_usage("Entries must be > 10 and < 9999999\n"); From 316f710172461c501f9b73f3b2fc7ce8aa5b84a5 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 18 Feb 2022 18:57:08 +0100 Subject: [PATCH 486/773] rtla/osnoise: Free params at the exit The variable that stores the parsed command line arguments are not being free()d at the rtla osnoise top exit path. Free params variable before exiting. Link: https://lkml.kernel.org/r/0be31d8259c7c53b98a39769d60cfeecd8421785.1645206561.git.bristot@kernel.org Fixes: 1eceb2fc2ca5 ("rtla/osnoise: Add osnoise top mode") Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/osnoise_top.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index c67dc28ef716..7af769b9c0de 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -573,6 +573,7 @@ out_top: osnoise_free_top(tool->data); osnoise_destroy_tool(record); osnoise_destroy_tool(tool); + free(params); out_exit: exit(return_value); } From 90f59ee41abf587ad4675a70434136c8707fdf4b Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 18 Feb 2022 18:57:09 +0100 Subject: [PATCH 487/773] rtla/osnoise: Fix error message when failing to enable trace instance When a trace instance creation fails, tools are printing: Could not enable -> osnoiser <- tracer for tracing Print the actual (and correct) name of the tracer it fails to enable. Link: https://lkml.kernel.org/r/53ef0582605af91eca14b19dba9fc9febb95d4f9.1645206561.git.bristot@kernel.org Fixes: b1696371d865 ("rtla: Helper functions for rtla") Cc: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c index 5648f9252e58..e60f1862bad0 100644 --- a/tools/tracing/rtla/src/osnoise.c +++ b/tools/tracing/rtla/src/osnoise.c @@ -810,7 +810,7 @@ struct osnoise_tool *osnoise_init_trace_tool(char *tracer) retval = enable_tracer_by_name(trace->trace.inst, tracer); if (retval) { - err_msg("Could not enable osnoiser tracer for tracing\n"); + err_msg("Could not enable %s tracer for tracing\n", tracer); goto out_err; } From bbcf7b0e2e4b8376ef4b401777f2852302c745e3 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 25 Feb 2022 19:10:53 -0800 Subject: [PATCH 488/773] MAINTAINERS: add sysctl-next git tree Add a git tree for sysctls as there's been quite a bit of work lately to remove all the syctls out of kernel/sysctl.c and move to their respective places, so coordination has been needed to avoid conflicts. This tree will also help soak these changes on linux-next prior to getting to Linus. Link: https://lkml.kernel.org/r/20220218182736.3694508-1-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2524b75763cd..71498bb8a142 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15567,6 +15567,7 @@ M: Iurii Zaikin L: linux-kernel@vger.kernel.org L: linux-fsdevel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git sysctl-next F: fs/proc/proc_sysctl.c F: include/linux/sysctl.h F: kernel/sysctl-test.c From db110a99d3367936058727ff4798e3a39c707969 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 25 Feb 2022 19:10:56 -0800 Subject: [PATCH 489/773] mm/hugetlb: fix kernel crash with hugetlb mremap This fixes the below crash: kernel BUG at include/linux/mm.h:2373! cpu 0x5d: Vector: 700 (Program Check) at [c00000003c6e76e0] pc: c000000000581a54: pmd_to_page+0x54/0x80 lr: c00000000058d184: move_hugetlb_page_tables+0x4e4/0x5b0 sp: c00000003c6e7980 msr: 9000000000029033 current = 0xc00000003bd8d980 paca = 0xc000200fff610100 irqmask: 0x03 irq_happened: 0x01 pid = 9349, comm = hugepage-mremap kernel BUG at include/linux/mm.h:2373! move_hugetlb_page_tables+0x4e4/0x5b0 (link register) move_hugetlb_page_tables+0x22c/0x5b0 (unreliable) move_page_tables+0xdbc/0x1010 move_vma+0x254/0x5f0 sys_mremap+0x7c0/0x900 system_call_exception+0x160/0x2c0 the kernel can't use huge_pte_offset before it set the pte entry because a page table lookup check for huge PTE bit in the page table to differentiate between a huge pte entry and a pointer to pte page. A huge_pte_alloc won't mark the page table entry huge and hence kernel should not use huge_pte_offset after a huge_pte_alloc. Link: https://lkml.kernel.org/r/20220211063221.99293-1-aneesh.kumar@linux.ibm.com Fixes: 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Aneesh Kumar K.V Reviewed-by: Mike Kravetz Reviewed-by: Mina Almasry Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d09..e57650a9404f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4851,14 +4851,13 @@ again: } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pte_t *src_pte) + unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; - pte_t *dst_pte, pte; spinlock_t *src_ptl, *dst_ptl; + pte_t pte; - dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h)); dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); @@ -4917,7 +4916,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (!dst_pte) break; - move_huge_pte(vma, old_addr, new_addr, src_pte); + move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); } flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); From 70effdc3756c924f4a2b6af1ec4e2e92e18e1b45 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 25 Feb 2022 19:10:59 -0800 Subject: [PATCH 490/773] kasan: test: prevent cache merging in kmem_cache_double_destroy With HW_TAGS KASAN and kasan.stacktrace=off, the cache created in the kmem_cache_double_destroy() test might get merged with an existing one. Thus, the first kmem_cache_destroy() call won't actually destroy it but will only decrease the refcount. This causes the test to fail. Provide an empty constructor for the created cache to prevent the cache from getting merged. Link: https://lkml.kernel.org/r/b597bd434c49591d8af00ee3993a42c609dc9a59.1644346040.git.andreyknvl@google.com Fixes: f98f966cd750 ("kasan: test: add test case for double-kmem_cache_destroy()") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 26a5c9007653..3b413f8c8a71 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -869,11 +869,14 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void empty_cache_ctor(void *object) { } + static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; - cache = kmem_cache_create("test_cache", 200, 0, 0, NULL); + /* Provide a constructor to prevent cache merging. */ + cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); kmem_cache_destroy(cache); KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); From e79ce9832316e09529b212a21278d68240ccbf1f Mon Sep 17 00:00:00 2001 From: Liu Yuntao Date: Fri, 25 Feb 2022 19:11:02 -0800 Subject: [PATCH 491/773] hugetlbfs: fix a truncation issue in hugepages parameter When we specify a large number for node in hugepages parameter, it may be parsed to another number due to truncation in this statement: node = tmp; For example, add following parameter in command line: hugepagesz=1G hugepages=4294967297:5 and kernel will allocate 5 hugepages for node 1 instead of ignoring it. I move the validation check earlier to fix this issue, and slightly simplifies the condition here. Link: https://lkml.kernel.org/r/20220209134018.8242-1-liuyuntao10@huawei.com Fixes: b5389086ad7be0 ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Signed-off-by: Liu Yuntao Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e57650a9404f..f294db835f4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4159,10 +4159,10 @@ static int __init hugepages_setup(char *s) pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 0; } + if (tmp >= nr_online_nodes) + goto invalid; node = tmp; p += count + 1; - if (node < 0 || node >= nr_online_nodes) - goto invalid; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; From f798a1d4f94de9510e060d37b9b47721065a957c Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 25 Feb 2022 19:11:05 -0800 Subject: [PATCH 492/773] mm: fix use-after-free bug when mm->mmap is reused after being freed oom reaping (__oom_reap_task_mm) relies on a 2 way synchronization with exit_mmap. First it relies on the mmap_lock to exclude from unlock path[1], page tables tear down (free_pgtables) and vma destruction. This alone is not sufficient because mm->mmap is never reset. For historical reasons[2] the lock is taken there is also MMF_OOM_SKIP set for oom victims before. The oom reaper only ever looks at oom victims so the whole scheme works properly but process_mrelease can opearate on any task (with fatal signals pending) which doesn't really imply oom victims. That means that the MMF_OOM_SKIP part of the synchronization doesn't work and it can see a task after the whole address space has been demolished and traverse an already released mm->mmap list. This leads to use after free as properly caught up by KASAN report. Fix the issue by reseting mm->mmap so that MMF_OOM_SKIP synchronization is not needed anymore. The MMF_OOM_SKIP is not removed from exit_mmap yet but it acts mostly as an optimization now. [1] 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") [2] 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") [mhocko@suse.com: changelog rewrite] Link: https://lore.kernel.org/all/00000000000072ef2c05d7f81950@google.com/ Link: https://lkml.kernel.org/r/20220215201922.1908156-1-surenb@google.com Fixes: 64591e8605d6 ("mm: protect free_pgtables with mmap_lock write lock in exit_mmap") Signed-off-by: Suren Baghdasaryan Reported-by: syzbot+2ccf63a4bd07cf39cab0@syzkaller.appspotmail.com Suggested-by: Michal Hocko Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Rik van Riel Cc: Minchan Kim Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Andy Lutomirski Cc: Christian Brauner Cc: Florian Weimer Cc: Jan Engelhardt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 1e8fdb0b51ed..d445c1b9d606 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3186,6 +3186,7 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); cond_resched(); } + mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } From f39c58008dee7ab5fc94c3f1995a21e886801df0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 25 Feb 2022 19:11:08 -0800 Subject: [PATCH 493/773] selftest/vm: fix map_fixed_noreplace test failure On the latest RHEL the test fails due to executable mapped at 256MB address # ./map_fixed_noreplace mmap() @ 0x10000000-0x10050000 p=0xffffffffffffffff result=File exists 10000000-10010000 r-xp 00000000 fd:04 34905657 /root/rpmbuild/BUILD/kernel-5.14.0-56.el9/linux-5.14.0-56.el9.ppc64le/tools/testing/selftests/vm/map_fixed_noreplace 10010000-10020000 r--p 00000000 fd:04 34905657 /root/rpmbuild/BUILD/kernel-5.14.0-56.el9/linux-5.14.0-56.el9.ppc64le/tools/testing/selftests/vm/map_fixed_noreplace 10020000-10030000 rw-p 00010000 fd:04 34905657 /root/rpmbuild/BUILD/kernel-5.14.0-56.el9/linux-5.14.0-56.el9.ppc64le/tools/testing/selftests/vm/map_fixed_noreplace 10029b90000-10029bc0000 rw-p 00000000 00:00 0 [heap] 7fffbb510000-7fffbb750000 r-xp 00000000 fd:04 24534 /usr/lib64/libc.so.6 7fffbb750000-7fffbb760000 r--p 00230000 fd:04 24534 /usr/lib64/libc.so.6 7fffbb760000-7fffbb770000 rw-p 00240000 fd:04 24534 /usr/lib64/libc.so.6 7fffbb780000-7fffbb7a0000 r--p 00000000 00:00 0 [vvar] 7fffbb7a0000-7fffbb7b0000 r-xp 00000000 00:00 0 [vdso] 7fffbb7b0000-7fffbb800000 r-xp 00000000 fd:04 24514 /usr/lib64/ld64.so.2 7fffbb800000-7fffbb810000 r--p 00040000 fd:04 24514 /usr/lib64/ld64.so.2 7fffbb810000-7fffbb820000 rw-p 00050000 fd:04 24514 /usr/lib64/ld64.so.2 7fffd93f0000-7fffd9420000 rw-p 00000000 00:00 0 [stack] Error: couldn't map the space we need for the test Fix this by finding a free address using mmap instead of hardcoding BASE_ADDRESS. Link: https://lkml.kernel.org/r/20220217083417.373823-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Jann Horn Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../selftests/vm/map_fixed_noreplace.c | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c index d91bde511268..eed44322d1a6 100644 --- a/tools/testing/selftests/vm/map_fixed_noreplace.c +++ b/tools/testing/selftests/vm/map_fixed_noreplace.c @@ -17,9 +17,6 @@ #define MAP_FIXED_NOREPLACE 0x100000 #endif -#define BASE_ADDRESS (256ul * 1024 * 1024) - - static void dump_maps(void) { char cmd[32]; @@ -28,18 +25,46 @@ static void dump_maps(void) system(cmd); } +static unsigned long find_base_addr(unsigned long size) +{ + void *addr; + unsigned long flags; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; + addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + if (addr == MAP_FAILED) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + + if (munmap(addr, size) != 0) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + return (unsigned long)addr; +} + int main(void) { + unsigned long base_addr; unsigned long flags, addr, size, page_size; char *p; page_size = sysconf(_SC_PAGE_SIZE); + //let's find a base addr that is free before we start the tests + size = 5 * page_size; + base_addr = find_base_addr(size); + if (!base_addr) { + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; // Check we can map all the areas we need below errno = 0; - addr = BASE_ADDRESS; + addr = base_addr; size = 5 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); @@ -60,7 +85,7 @@ int main(void) printf("unmap() successful\n"); errno = 0; - addr = BASE_ADDRESS + page_size; + addr = base_addr + page_size; size = 3 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -80,7 +105,7 @@ int main(void) * +4 | free | new */ errno = 0; - addr = BASE_ADDRESS; + addr = base_addr; size = 5 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -101,7 +126,7 @@ int main(void) * +4 | free | */ errno = 0; - addr = BASE_ADDRESS + (2 * page_size); + addr = base_addr + (2 * page_size); size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -121,7 +146,7 @@ int main(void) * +4 | free | new */ errno = 0; - addr = BASE_ADDRESS + (3 * page_size); + addr = base_addr + (3 * page_size); size = 2 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -141,7 +166,7 @@ int main(void) * +4 | free | */ errno = 0; - addr = BASE_ADDRESS; + addr = base_addr; size = 2 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -161,7 +186,7 @@ int main(void) * +4 | free | */ errno = 0; - addr = BASE_ADDRESS; + addr = base_addr; size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -181,7 +206,7 @@ int main(void) * +4 | free | new */ errno = 0; - addr = BASE_ADDRESS + (4 * page_size); + addr = base_addr + (4 * page_size); size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); @@ -192,7 +217,7 @@ int main(void) return 1; } - addr = BASE_ADDRESS; + addr = base_addr; size = 5 * page_size; if (munmap((void *)addr, size) != 0) { dump_maps(); From 7d547dcf97f275e9f507c8099e168ed682fe1257 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 25 Feb 2022 19:11:11 -0800 Subject: [PATCH 494/773] MAINTAINERS: add Roman as a memcg co-maintainer Add myself as a memcg co-maintainer. My primary focus over last few years was the kernel memory accounting stack, but I do work on some other parts of the memory controller as well. Link: https://lkml.kernel.org/r/20220221233951.659048-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 71498bb8a142..c513c4a91c83 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4914,6 +4914,7 @@ CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko M: Vladimir Davydov +M: Roman Gushchin L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained From 0a972e72e2f9630b10bf6b7b5e08312e87eb6854 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 25 Feb 2022 19:11:14 -0800 Subject: [PATCH 495/773] MAINTAINERS: remove Vladimir from memcg maintainers Link: https://lkml.kernel.org/r/4ad1f8da49d7b71c84a0c15bd5347f5ce704e730.1645608825.git.vdavydov.dev@gmail.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c513c4a91c83..ade902f0271a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4913,7 +4913,6 @@ F: kernel/cgroup/cpuset.c CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko -M: Vladimir Davydov M: Roman Gushchin L: cgroups@vger.kernel.org L: linux-mm@kvack.org From bb9d5454992322a0b793c655e1d860a54a38a3d3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 25 Feb 2022 19:11:17 -0800 Subject: [PATCH 496/773] MAINTAINERS: add Shakeel as a memcg co-maintainer I have been contributing and reviewing to the memcg codebase for last couple of years. So, making it official. Link: https://lkml.kernel.org/r/20220224060148.4092228-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index ade902f0271a..ae42c9e2437b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4914,6 +4914,7 @@ CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko M: Roman Gushchin +M: Shakeel Butt L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained From 7b0112f3432915fd6ac68ddd61bc4bcd1ac2505d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 25 Feb 2022 19:11:20 -0800 Subject: [PATCH 497/773] MAINTAINERS, SLAB: add Roman as reviewer, git tree The slab code has an overlap with kmem accounting, where Roman has done a lot of work recently and it would be useful to make sure he's CC'd on patches that potentially affect it. Thus add him as a reviewer for the SLAB subsystem. Also while at it, add the link to slab git tree. Link: https://lkml.kernel.org/r/20220222103104.13241-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ae42c9e2437b..1ba1e4af2cbc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17762,8 +17762,10 @@ M: David Rientjes M: Joonsoo Kim M: Andrew Morton M: Vlastimil Babka +R: Roman Gushchin L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git F: include/linux/sl?b*.h F: mm/sl?b* From 9502bdbf34e4ffe865d144fe4218eb64602a75bd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 25 Feb 2022 19:11:23 -0800 Subject: [PATCH 498/773] mailmap: update Roman Gushchin's email I'm moving to a @linux.dev account. Map my old addresses. Link: https://lkml.kernel.org/r/20220221200006.416377-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.mailmap b/.mailmap index 8cd44b0c6579..10ee1103c823 100644 --- a/.mailmap +++ b/.mailmap @@ -333,6 +333,9 @@ Rémi Denis-Courmont Ricardo Ribalda Ricardo Ribalda Ricardo Ribalda Delgado Ricardo Ribalda +Roman Gushchin +Roman Gushchin +Roman Gushchin Ross Zwisler Rudolf Marek Rui Saraiva From fda153c89af344d21df281009a9d046cf587ea0f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 25 Feb 2022 19:11:26 -0800 Subject: [PATCH 499/773] selftests/memfd: clean up mapping in mfd_fail_write Running the memfd script ./run_hugetlbfs_test.sh will often end in error as follows: memfd-hugetlb: CREATE memfd-hugetlb: BASIC memfd-hugetlb: SEAL-WRITE memfd-hugetlb: SEAL-FUTURE-WRITE memfd-hugetlb: SEAL-SHRINK fallocate(ALLOC) failed: No space left on device ./run_hugetlbfs_test.sh: line 60: 166855 Aborted (core dumped) ./memfd_test hugetlbfs opening: ./mnt/memfd fuse: DONE If no hugetlb pages have been preallocated, run_hugetlbfs_test.sh will allocate 'just enough' pages to run the test. In the SEAL-FUTURE-WRITE test the mfd_fail_write routine maps the file, but does not unmap. As a result, two hugetlb pages remain reserved for the mapping. When the fallocate call in the SEAL-SHRINK test attempts allocate all hugetlb pages, it is short by the two reserved pages. Fix by making sure to unmap in mfd_fail_write. Link: https://lkml.kernel.org/r/20220219004340.56478-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Joel Fernandes Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/memfd_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 192a2899bae8..94df2692e6e4 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -455,6 +455,7 @@ static void mfd_fail_write(int fd) printf("mmap()+mprotect() didn't fail as expected\n"); abort(); } + munmap(p, mfd_def_size); } /* verify PUNCH_HOLE fails */ From 7e57714cd0ad2d5bb90e50b5096a0e671dec1ef3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Feb 2022 14:36:33 -0800 Subject: [PATCH 500/773] Linux 5.17-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 289ce2be8032..daeb5c88b50b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Superb Owl # *DOCUMENTATION* From ba115adf61b36b8c167126425a62b0efc23f72c0 Mon Sep 17 00:00:00 2001 From: David Gow Date: Sun, 27 Feb 2022 21:00:10 -0800 Subject: [PATCH 501/773] Input: samsung-keypad - properly state IOMEM dependency Make the samsung-keypad driver explicitly depend on CONFIG_HAS_IOMEM, as it calls devm_ioremap(). This prevents compile errors in some configs (e.g, allyesconfig/randconfig under UML): /usr/bin/ld: drivers/input/keyboard/samsung-keypad.o: in function `samsung_keypad_probe': samsung-keypad.c:(.text+0xc60): undefined reference to `devm_ioremap' Signed-off-by: David Gow Acked-by: anton ivanov Link: https://lore.kernel.org/r/20220225041727.1902850-1-davidgow@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 0c607da9ee10..9417ee0b1eff 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -556,7 +556,7 @@ config KEYBOARD_PMIC8XXX config KEYBOARD_SAMSUNG tristate "Samsung keypad support" - depends on HAVE_CLK + depends on HAS_IOMEM && HAVE_CLK select INPUT_MATRIXKMAP help Say Y here if you want to use the keypad on your Samsung mobile From dcf0c838854c86e1f41fb1934aea906845d69782 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Fri, 28 Jan 2022 10:20:04 +0530 Subject: [PATCH 502/773] riscv/efi_stub: Fix get_boot_hartid_from_fdt() return value The get_boot_hartid_from_fdt() function currently returns U32_MAX for failure case which is not correct because U32_MAX is a valid hartid value. This patch fixes the issue by returning error code. Cc: Fixes: d7071743db31 ("RISC-V: Add EFI stub support.") Signed-off-by: Sunil V L Reviewed-by: Heinrich Schuchardt Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/riscv-stub.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c index 380e4e251399..9c460843442f 100644 --- a/drivers/firmware/efi/libstub/riscv-stub.c +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -25,7 +25,7 @@ typedef void __noreturn (*jump_kernel_func)(unsigned int, unsigned long); static u32 hartid; -static u32 get_boot_hartid_from_fdt(void) +static int get_boot_hartid_from_fdt(void) { const void *fdt; int chosen_node, len; @@ -33,23 +33,26 @@ static u32 get_boot_hartid_from_fdt(void) fdt = get_efi_config_table(DEVICE_TREE_GUID); if (!fdt) - return U32_MAX; + return -EINVAL; chosen_node = fdt_path_offset(fdt, "/chosen"); if (chosen_node < 0) - return U32_MAX; + return -EINVAL; prop = fdt_getprop((void *)fdt, chosen_node, "boot-hartid", &len); if (!prop || len != sizeof(u32)) - return U32_MAX; + return -EINVAL; - return fdt32_to_cpu(*prop); + hartid = fdt32_to_cpu(*prop); + return 0; } efi_status_t check_platform_features(void) { - hartid = get_boot_hartid_from_fdt(); - if (hartid == U32_MAX) { + int ret; + + ret = get_boot_hartid_from_fdt(); + if (ret) { efi_err("/chosen/boot-hartid missing or invalid!\n"); return EFI_UNSUPPORTED; } From 258dd902022cb10c83671176688074879517fd21 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 18 Feb 2022 19:05:59 +0100 Subject: [PATCH 503/773] efivars: Respect "block" flag in efivar_entry_set_safe() When the "block" flag is false, the old code would sometimes still call check_var_size(), which wrongly tells ->query_variable_store() that it can block. As far as I can tell, this can't really materialize as a bug at the moment, because ->query_variable_store only does something on X86 with generic EFI, and in that configuration we always take the efivar_entry_set_nonblocking() path. Fixes: ca0e30dcaa53 ("efi: Add nonblocking option to efi_query_variable_store()") Signed-off-by: Jann Horn Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220218180559.1432559-1-jannh@google.com --- drivers/firmware/efi/vars.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index abdc8a6a3963..cae590bd08f2 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -742,6 +742,7 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, { const struct efivar_operations *ops; efi_status_t status; + unsigned long varsize; if (!__efivars) return -EINVAL; @@ -764,15 +765,17 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, return efivar_entry_set_nonblocking(name, vendor, attributes, size, data); + varsize = size + ucs2_strsize(name, 1024); if (!block) { if (down_trylock(&efivars_lock)) return -EBUSY; + status = check_var_size_nonblocking(attributes, varsize); } else { if (down_interruptible(&efivars_lock)) return -EINTR; + status = check_var_size(attributes, varsize); } - status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); if (status != EFI_SUCCESS) { up(&efivars_lock); return -ENOSPC; From 9995b408f17ff8c7f11bc725c8aa225ba3a63b1c Mon Sep 17 00:00:00 2001 From: "j.nixdorf@avm.de" Date: Thu, 24 Feb 2022 10:06:49 +0100 Subject: [PATCH 504/773] net: ipv6: ensure we call ipv6_mc_down() at most once There are two reasons for addrconf_notify() to be called with NETDEV_DOWN: either the network device is actually going down, or IPv6 was disabled on the interface. If either of them stays down while the other is toggled, we repeatedly call the code for NETDEV_DOWN, including ipv6_mc_down(), while never calling the corresponding ipv6_mc_up() in between. This will cause a new entry in idev->mc_tomb to be allocated for each multicast group the interface is subscribed to, which in turn leaks one struct ifmcaddr6 per nontrivial multicast group the interface is subscribed to. The following reproducer will leak at least $n objects: ip addr add ff2e::4242/32 dev eth0 autojoin sysctl -w net.ipv6.conf.eth0.disable_ipv6=1 for i in $(seq 1 $n); do ip link set up eth0; ip link set down eth0 done Joining groups with IPV6_ADD_MEMBERSHIP (unprivileged) or setting the sysctl net.ipv6.conf.eth0.forwarding to 1 (=> subscribing to ff02::2) can also be used to create a nontrivial idev->mc_list, which will the leak objects with the right up-down-sequence. Based on both sources for NETDEV_DOWN events the interface IPv6 state should be considered: - not ready if the network interface is not ready OR IPv6 is disabled for it - ready if the network interface is ready AND IPv6 is enabled for it The functions ipv6_mc_up() and ipv6_down() should only be run when this state changes. Implement this by remembering when the IPv6 state is ready, and only run ipv6_mc_down() if it actually changed from ready to not ready. The other direction (not ready -> ready) already works correctly, as: - the interface notification triggered codepath for NETDEV_UP / NETDEV_CHANGE returns early if ipv6 is disabled, and - the disable_ipv6=0 triggered codepath skips fully initializing the interface as long as addrconf_link_ready(dev) returns false - calling ipv6_mc_up() repeatedly does not leak anything Fixes: 3ce62a84d53c ("ipv6: exit early in addrconf_notify() if IPv6 is disabled") Signed-off-by: Johannes Nixdorf Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8ab3e6e6fe..f908e2fd30b2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3732,6 +3732,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; bool keep_addr = false; + bool was_ready; int state, i; ASSERT_RTNL(); @@ -3797,7 +3798,10 @@ restart: addrconf_del_rs_timer(idev); - /* Step 2: clear flags for stateless addrconf */ + /* Step 2: clear flags for stateless addrconf, repeated down + * detection + */ + was_ready = idev->if_flags & IF_READY; if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); @@ -3871,7 +3875,7 @@ restart: if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - } else { + } else if (was_ready) { ipv6_mc_down(idev); } From 32568ae37596b529628ac09b875f4874e614f63f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Mon, 14 Feb 2022 15:05:07 -0500 Subject: [PATCH 505/773] arm64: dts: mt8183: jacuzzi: Fix bus properties in anx's DSI endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mt8183-kukui-jacuzzi has an anx7625 bridge connected to the output of its DSI host. However, after commit fd0310b6fe7d ("drm/bridge: anx7625: add MIPI DPI input feature"), a bus-type property started being required in the endpoint node by the driver to indicate whether it is DSI or DPI. Add the missing bus-type property and set it to 5 (V4L2_FWNODE_BUS_TYPE_PARALLEL) so that the driver has its input configured to DSI and the display pipeline can probe correctly. While at it, also set the data-lanes property that was also introduced in that same commit, so that we don't rely on the default value. Fixes: fd0310b6fe7d ("drm/bridge: anx7625: add MIPI DPI input feature") Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20220214200507.2500693-1-nfraprado@collabora.com Signed-off-by: Matthias Brugger --- arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi index 8f7bf33f607d..e8f133dc96b9 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi @@ -171,6 +171,8 @@ anx7625_in: endpoint { remote-endpoint = <&dsi_out>; + bus-type = <5>; + data-lanes = <0 1 2 3>; }; }; From 4d08b7b57ece83a1c31c633a7e4e27f121157f9c Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Fri, 25 Feb 2022 14:56:57 +0800 Subject: [PATCH 506/773] net/smc: Fix cleanup when register ULP fails This patch calls smc_ib_unregister_client() when tcp_register_ulp() fails, and make sure to clean it up. Fixes: d7cd421da9da ("net/smc: Introduce TCP ULP support") Signed-off-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 81984c1c0e78..284befa90967 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3087,12 +3087,14 @@ static int __init smc_init(void) rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_sock; + goto out_ib; } static_branch_enable(&tcp_have_smc); return 0; +out_ib: + smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: From 90d4025285748448809701a44cf466a3f5443eaa Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Fri, 25 Feb 2022 13:43:27 +0100 Subject: [PATCH 507/773] net: sparx5: Add #include to remove warning main.h uses NUM_TARGETS from main_regs.h, but the missing include never causes any errors because everywhere main.h is (currently) included, main_regs.h is included before. But since it is dependent on main_regs.h it should always be included. Signed-off-by: Casper Andersson Reviewed-by: Joacim Zetterling Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/sparx5/sparx5_main.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index a1acc9b461f2..d40e18ce3293 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -16,6 +16,8 @@ #include #include +#include "sparx5_main_regs.h" + /* Target chip type */ enum spx5_target_chiptype { SPX5_TARGET_CT_7546 = 0x7546, /* SparX-5-64 Enterprise */ From d4e26aaea7f82ba884dcb4acfe689406bc092dc3 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 25 Feb 2022 04:52:30 -0800 Subject: [PATCH 508/773] atm: firestream: check the return value of ioremap() in fs_init() The function ioremap() in fs_init() can fail, so its return value should be checked. Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/atm/firestream.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 3bc3c314a467..4f67404fe64c 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -1676,6 +1676,8 @@ static int fs_init(struct fs_dev *dev) dev->hw_base = pci_resource_start(pci_dev, 0); dev->base = ioremap(dev->hw_base, 0x1000); + if (!dev->base) + return 1; reset_chip (dev); From caef14b7530c065fb85d54492768fa48fdb5093e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 25 Feb 2022 14:15:30 -0600 Subject: [PATCH 509/773] net: ipa: fix a build dependency An IPA build problem arose in the linux-next tree the other day. The problem is that a recent commit adds a new dependency on some code, and the Kconfig file for IPA doesn't reflect that dependency. As a result, some configurations can fail to build (particularly when COMPILE_TEST is enabled). The recent patch adds calls to qmp_get(), qmp_put(), and qmp_send(), and those are built based on the QCOM_AOSS_QMP config option. If that symbol is not defined, stubs are defined, so we just need to ensure QCOM_AOSS_QMP is compatible with QCOM_IPA, or it's not defined. Reported-by: Randy Dunlap Fixes: 34a081761e4e3 ("net: ipa: request IPA register values be retained") Signed-off-by: Alex Elder Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: David S. Miller --- drivers/net/ipa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig index d037682fb7ad..e0164a55c1e6 100644 --- a/drivers/net/ipa/Kconfig +++ b/drivers/net/ipa/Kconfig @@ -3,6 +3,7 @@ config QCOM_IPA depends on NET && QCOM_SMEM depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST) + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select QCOM_MDT_LOADER if ARCH_QCOM select QCOM_SCM select QCOM_QMI_HELPERS From 1b279f6ad467535c3b8a66b4edefaca2cdd5bdc3 Mon Sep 17 00:00:00 2001 From: Vinay Belgaumkar Date: Wed, 16 Feb 2022 10:15:04 -0800 Subject: [PATCH 510/773] drm/i915/guc/slpc: Correct the param count for unset param SLPC unset param H2G only needs one parameter - the id of the param. Fixes: 025cb07bebfa ("drm/i915/guc/slpc: Cache platform frequency limits") Suggested-by: Umesh Nerlige Ramappa Signed-off-by: Vinay Belgaumkar Reviewed-by: Umesh Nerlige Ramappa Signed-off-by: Ramalingam C Link: https://patchwork.freedesktop.org/patch/msgid/20220216181504.7155-1-vinay.belgaumkar@intel.com (cherry picked from commit 9648f1c3739505557d94ff749a4f32192ea81fe3) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c index 13b27b8ff74e..ba21ace973da 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c @@ -110,7 +110,7 @@ static int guc_action_slpc_unset_param(struct intel_guc *guc, u8 id) { u32 request[] = { GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, - SLPC_EVENT(SLPC_EVENT_PARAMETER_UNSET, 2), + SLPC_EVENT(SLPC_EVENT_PARAMETER_UNSET, 1), id, }; From 08783aa7693f55619859f4f63f384abf17cb58c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 24 Feb 2022 15:21:42 +0200 Subject: [PATCH 511/773] drm/i915: s/JSP2/ICP2/ PCH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This JSP2 PCH actually seems to be some special Apple specific ICP variant rather than a JSP. Make it so. Or at least all the references to it seem to be some Apple ICL machines. Didn't manage to find these PCI IDs in any public chipset docs unfortunately. The only thing we're losing here with this JSP->ICP change is Wa_14011294188, but based on the HSD that isn't actually needed on any ICP based design (including JSP), only TGP based stuff (including MCC) really need it. The documented w/a just never made that distinction because Windows didn't want to differentiate between JSP and MCC (not sure how they handle hpd/ddc/etc. then though...). Cc: stable@vger.kernel.org Cc: Matt Roper Cc: Vivek Kasireddy Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4226 Fixes: 943682e3bd19 ("drm/i915: Introduce Jasper Lake PCH") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220224132142.12927-1-ville.syrjala@linux.intel.com Acked-by: Vivek Kasireddy Tested-by: Tomas Bzatek (cherry picked from commit 53581504a8e216d435f114a4f2596ad0dfd902fc) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_pch.c | 2 +- drivers/gpu/drm/i915/intel_pch.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pch.c b/drivers/gpu/drm/i915/intel_pch.c index da8f82c2342f..fc8a68f3a2ed 100644 --- a/drivers/gpu/drm/i915/intel_pch.c +++ b/drivers/gpu/drm/i915/intel_pch.c @@ -108,6 +108,7 @@ intel_pch_type(const struct drm_i915_private *dev_priv, unsigned short id) /* Comet Lake V PCH is based on KBP, which is SPT compatible */ return PCH_SPT; case INTEL_PCH_ICP_DEVICE_ID_TYPE: + case INTEL_PCH_ICP2_DEVICE_ID_TYPE: drm_dbg_kms(&dev_priv->drm, "Found Ice Lake PCH\n"); drm_WARN_ON(&dev_priv->drm, !IS_ICELAKE(dev_priv)); return PCH_ICP; @@ -123,7 +124,6 @@ intel_pch_type(const struct drm_i915_private *dev_priv, unsigned short id) !IS_GEN9_BC(dev_priv)); return PCH_TGP; case INTEL_PCH_JSP_DEVICE_ID_TYPE: - case INTEL_PCH_JSP2_DEVICE_ID_TYPE: drm_dbg_kms(&dev_priv->drm, "Found Jasper Lake PCH\n"); drm_WARN_ON(&dev_priv->drm, !IS_JSL_EHL(dev_priv)); return PCH_JSP; diff --git a/drivers/gpu/drm/i915/intel_pch.h b/drivers/gpu/drm/i915/intel_pch.h index 6bff77521094..4ba0f1967cca 100644 --- a/drivers/gpu/drm/i915/intel_pch.h +++ b/drivers/gpu/drm/i915/intel_pch.h @@ -50,11 +50,11 @@ enum intel_pch { #define INTEL_PCH_CMP2_DEVICE_ID_TYPE 0x0680 #define INTEL_PCH_CMP_V_DEVICE_ID_TYPE 0xA380 #define INTEL_PCH_ICP_DEVICE_ID_TYPE 0x3480 +#define INTEL_PCH_ICP2_DEVICE_ID_TYPE 0x3880 #define INTEL_PCH_MCC_DEVICE_ID_TYPE 0x4B00 #define INTEL_PCH_TGP_DEVICE_ID_TYPE 0xA080 #define INTEL_PCH_TGP2_DEVICE_ID_TYPE 0x4380 #define INTEL_PCH_JSP_DEVICE_ID_TYPE 0x4D80 -#define INTEL_PCH_JSP2_DEVICE_ID_TYPE 0x3880 #define INTEL_PCH_ADP_DEVICE_ID_TYPE 0x7A80 #define INTEL_PCH_ADP2_DEVICE_ID_TYPE 0x5180 #define INTEL_PCH_ADP3_DEVICE_ID_TYPE 0x7A00 From f0d2f15362f02444c5d7ffd5a5eb03e4aa54b685 Mon Sep 17 00:00:00 2001 From: Rong Chen Date: Wed, 16 Feb 2022 20:42:39 +0800 Subject: [PATCH 512/773] mmc: meson: Fix usage of meson_mmc_post_req() Currently meson_mmc_post_req() is called in meson_mmc_request() right after meson_mmc_start_cmd(). This could lead to DMA unmapping before the request is actually finished. To fix, don't call meson_mmc_post_req() until meson_mmc_request_done(). Signed-off-by: Rong Chen Reviewed-by: Kevin Hilman Fixes: 79ed05e329c3 ("mmc: meson-gx: add support for descriptor chain mode") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220216124239.4007667-1-rong.chen@amlogic.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 8f36536cb1b6..58ab9d90bc8b 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -173,6 +173,8 @@ struct meson_host { int irq; bool vqmmc_enabled; + bool needs_pre_post_req; + }; #define CMD_CFG_LENGTH_MASK GENMASK(8, 0) @@ -663,6 +665,8 @@ static void meson_mmc_request_done(struct mmc_host *mmc, struct meson_host *host = mmc_priv(mmc); host->cmd = NULL; + if (host->needs_pre_post_req) + meson_mmc_post_req(mmc, mrq, 0); mmc_request_done(host->mmc, mrq); } @@ -880,7 +884,7 @@ static int meson_mmc_validate_dram_access(struct mmc_host *mmc, struct mmc_data static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) { struct meson_host *host = mmc_priv(mmc); - bool needs_pre_post_req = mrq->data && + host->needs_pre_post_req = mrq->data && !(mrq->data->host_cookie & SD_EMMC_PRE_REQ_DONE); /* @@ -896,22 +900,19 @@ static void meson_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq) } } - if (needs_pre_post_req) { + if (host->needs_pre_post_req) { meson_mmc_get_transfer_mode(mmc, mrq); if (!meson_mmc_desc_chain_mode(mrq->data)) - needs_pre_post_req = false; + host->needs_pre_post_req = false; } - if (needs_pre_post_req) + if (host->needs_pre_post_req) meson_mmc_pre_req(mmc, mrq); /* Stop execution */ writel(0, host->regs + SD_EMMC_START); meson_mmc_start_cmd(mmc, mrq->sbc ?: mrq->cmd); - - if (needs_pre_post_req) - meson_mmc_post_req(mmc, mrq, 0); } static void meson_mmc_read_resp(struct mmc_host *mmc, struct mmc_command *cmd) From b00833768e170a31af09268f7ab96aecfcca9623 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Mon, 21 Feb 2022 13:33:48 +0800 Subject: [PATCH 513/773] iommu/vt-d: Fix double list_add when enabling VMD in scalable mode When enabling VMD and IOMMU scalable mode, the following kernel panic call trace/kernel log is shown in Eagle Stream platform (Sapphire Rapids CPU) during booting: pci 0000:59:00.5: Adding to iommu group 42 ... vmd 0000:59:00.5: PCI host bridge to bus 10000:80 pci 10000:80:01.0: [8086:352a] type 01 class 0x060400 pci 10000:80:01.0: reg 0x10: [mem 0x00000000-0x0001ffff 64bit] pci 10000:80:01.0: enabling Extended Tags pci 10000:80:01.0: PME# supported from D0 D3hot D3cold pci 10000:80:01.0: DMAR: Setup RID2PASID failed pci 10000:80:01.0: Failed to add to iommu group 42: -16 pci 10000:80:03.0: [8086:352b] type 01 class 0x060400 pci 10000:80:03.0: reg 0x10: [mem 0x00000000-0x0001ffff 64bit] pci 10000:80:03.0: enabling Extended Tags pci 10000:80:03.0: PME# supported from D0 D3hot D3cold ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:29! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 7 Comm: kworker/0:1 Not tainted 5.17.0-rc3+ #7 Hardware name: Lenovo ThinkSystem SR650V3/SB27A86647, BIOS ESE101Y-1.00 01/13/2022 Workqueue: events work_for_cpu_fn RIP: 0010:__list_add_valid.cold+0x26/0x3f Code: 9a 4a ab ff 4c 89 c1 48 c7 c7 40 0c d9 9e e8 b9 b1 fe ff 0f 0b 48 89 f2 4c 89 c1 48 89 fe 48 c7 c7 f0 0c d9 9e e8 a2 b1 fe ff <0f> 0b 48 89 d1 4c 89 c6 4c 89 ca 48 c7 c7 98 0c d9 9e e8 8b b1 fe RSP: 0000:ff5ad434865b3a40 EFLAGS: 00010246 RAX: 0000000000000058 RBX: ff4d61160b74b880 RCX: ff4d61255e1fffa8 RDX: 0000000000000000 RSI: 00000000fffeffff RDI: ffffffff9fd34f20 RBP: ff4d611d8e245c00 R08: 0000000000000000 R09: ff5ad434865b3888 R10: ff5ad434865b3880 R11: ff4d61257fdc6fe8 R12: ff4d61160b74b8a0 R13: ff4d61160b74b8a0 R14: ff4d611d8e245c10 R15: ff4d611d8001ba70 FS: 0000000000000000(0000) GS:ff4d611d5ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ff4d611fa1401000 CR3: 0000000aa0210001 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: intel_pasid_alloc_table+0x9c/0x1d0 dmar_insert_one_dev_info+0x423/0x540 ? device_to_iommu+0x12d/0x2f0 intel_iommu_attach_device+0x116/0x290 __iommu_attach_device+0x1a/0x90 iommu_group_add_device+0x190/0x2c0 __iommu_probe_device+0x13e/0x250 iommu_probe_device+0x24/0x150 iommu_bus_notifier+0x69/0x90 blocking_notifier_call_chain+0x5a/0x80 device_add+0x3db/0x7b0 ? arch_memremap_can_ram_remap+0x19/0x50 ? memremap+0x75/0x140 pci_device_add+0x193/0x1d0 pci_scan_single_device+0xb9/0xf0 pci_scan_slot+0x4c/0x110 pci_scan_child_bus_extend+0x3a/0x290 vmd_enable_domain.constprop.0+0x63e/0x820 vmd_probe+0x163/0x190 local_pci_probe+0x42/0x80 work_for_cpu_fn+0x13/0x20 process_one_work+0x1e2/0x3b0 worker_thread+0x1c4/0x3a0 ? rescuer_thread+0x370/0x370 kthread+0xc7/0xf0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Modules linked in: ---[ end trace 0000000000000000 ]--- ... Kernel panic - not syncing: Fatal exception Kernel Offset: 0x1ca00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- The following 'lspci' output shows devices '10000:80:*' are subdevices of the VMD device 0000:59:00.5: $ lspci ... 0000:59:00.5 RAID bus controller: Intel Corporation Volume Management Device NVMe RAID Controller (rev 20) ... 10000:80:01.0 PCI bridge: Intel Corporation Device 352a (rev 03) 10000:80:03.0 PCI bridge: Intel Corporation Device 352b (rev 03) 10000:80:05.0 PCI bridge: Intel Corporation Device 352c (rev 03) 10000:80:07.0 PCI bridge: Intel Corporation Device 352d (rev 03) 10000:81:00.0 Non-Volatile memory controller: Intel Corporation NVMe Datacenter SSD [3DNAND, Beta Rock Controller] 10000:82:00.0 Non-Volatile memory controller: Intel Corporation NVMe Datacenter SSD [3DNAND, Beta Rock Controller] The symptom 'list_add double add' is caused by the following failure message: pci 10000:80:01.0: DMAR: Setup RID2PASID failed pci 10000:80:01.0: Failed to add to iommu group 42: -16 pci 10000:80:03.0: [8086:352b] type 01 class 0x060400 Device 10000:80:01.0 is the subdevice of the VMD device 0000:59:00.5, so invoking intel_pasid_alloc_table() gets the pasid_table of the VMD device 0000:59:00.5. Here is call path: intel_pasid_alloc_table pci_for_each_dma_alias get_alias_pasid_table search_pasid_table pci_real_dma_dev() in pci_for_each_dma_alias() gets the real dma device which is the VMD device 0000:59:00.5. However, pte of the VMD device 0000:59:00.5 has been configured during this message "pci 0000:59:00.5: Adding to iommu group 42". So, the status -EBUSY is returned when configuring pasid entry for device 10000:80:01.0. It then invokes dmar_remove_one_dev_info() to release 'struct device_domain_info *' from iommu_devinfo_cache. But, the pasid table is not released because of the following statement in __dmar_remove_one_dev_info(): if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { ... intel_pasid_free_table(info->dev); } The subsequent dmar_insert_one_dev_info() operation of device 10000:80:03.0 allocates 'struct device_domain_info *' from iommu_devinfo_cache. The allocated address is the same address that is released previously for device 10000:80:01.0. Finally, invoking device_attach_pasid_table() causes the issue. `git bisect` points to the offending commit 474dd1c65064 ("iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries"), which releases the pasid table if the device is not the subdevice by checking the returned status of dev_is_real_dma_subdevice(). Reverting the offending commit can work around the issue. The solution is to prevent from allocating pasid table if those devices are subdevices of the VMD device. Fixes: 474dd1c65064 ("iommu/vt-d: Fix clearing real DMA device's scalable-mode context entries") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Adrian Huang Link: https://lore.kernel.org/r/20220216091307.703-1-adrianhuang0701@gmail.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20220221053348.262724-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 92fea3fbbb11..5b196cfe9ed2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2738,7 +2738,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, spin_unlock_irqrestore(&device_domain_lock, flags); /* PASID table is mandatory for a PCI device in scalable mode. */ - if (dev && dev_is_pci(dev) && sm_supported(iommu)) { + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); if (ret) { dev_err(dev, "PASID table allocation failed\n"); From 9826e393e4a8c3df474e7f9eacd3087266f74005 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 7 Jan 2022 08:09:11 +0000 Subject: [PATCH 514/773] iommu/tegra-smmu: Fix missing put_device() call in tegra_smmu_find The reference taken by 'of_find_device_by_node()' must be released when not needed anymore. Add the corresponding 'put_device()' in the error handling path. Fixes: 765a9d1d02b2 ("iommu/tegra-smmu: Fix mc errors on tegra124-nyan") Signed-off-by: Miaoqian Lin Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20220107080915.12686-1-linmq006@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/tegra-smmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e900e3c46903..2561ce8a2ce8 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -808,8 +808,10 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np) return NULL; mc = platform_get_drvdata(pdev); - if (!mc) + if (!mc) { + put_device(&pdev->dev); return NULL; + } return mc->smmu; } From 30939293262eb433c960c4532a0d59c4073b2b84 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 28 Feb 2022 11:43:54 +0800 Subject: [PATCH 515/773] blktrace: fix use after free for struct blk_trace When tracing the whole disk, 'dropped' and 'msg' will be created under 'q->debugfs_dir' and 'bt->dir' is NULL, thus blk_trace_free() won't remove those files. What's worse, the following UAF can be triggered because of accessing stale 'dropped' and 'msg': ================================================================== BUG: KASAN: use-after-free in blk_dropped_read+0x89/0x100 Read of size 4 at addr ffff88816912f3d8 by task blktrace/1188 CPU: 27 PID: 1188 Comm: blktrace Not tainted 5.17.0-rc4-next-20220217+ #469 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 Call Trace: dump_stack_lvl+0x34/0x44 print_address_description.constprop.0.cold+0xab/0x381 ? blk_dropped_read+0x89/0x100 ? blk_dropped_read+0x89/0x100 kasan_report.cold+0x83/0xdf ? blk_dropped_read+0x89/0x100 kasan_check_range+0x140/0x1b0 blk_dropped_read+0x89/0x100 ? blk_create_buf_file_callback+0x20/0x20 ? kmem_cache_free+0xa1/0x500 ? do_sys_openat2+0x258/0x460 full_proxy_read+0x8f/0xc0 vfs_read+0xc6/0x260 ksys_read+0xb9/0x150 ? vfs_write+0x3d0/0x3d0 ? fpregs_assert_state_consistent+0x55/0x60 ? exit_to_user_mode_prepare+0x39/0x1e0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fbc080d92fd Code: ce 20 00 00 75 10 b8 00 00 00 00 0f 05 48 3d 01 f0 ff ff 73 31 c3 48 83 1 RSP: 002b:00007fbb95ff9cb0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007fbb95ff9dc0 RCX: 00007fbc080d92fd RDX: 0000000000000100 RSI: 00007fbb95ff9cc0 RDI: 0000000000000045 RBP: 0000000000000045 R08: 0000000000406299 R09: 00000000fffffffd R10: 000000000153afa0 R11: 0000000000000293 R12: 00007fbb780008c0 R13: 00007fbb78000938 R14: 0000000000608b30 R15: 00007fbb780029c8 Allocated by task 1050: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 do_blk_trace_setup+0xcb/0x410 __blk_trace_setup+0xac/0x130 blk_trace_ioctl+0xe9/0x1c0 blkdev_ioctl+0xf1/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 1050: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x103/0x180 kfree+0x9a/0x4c0 __blk_trace_remove+0x53/0x70 blk_trace_ioctl+0x199/0x1c0 blkdev_common_ioctl+0x5e9/0xb30 blkdev_ioctl+0x1a5/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88816912f380 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 88 bytes inside of 96-byte region [ffff88816912f380, ffff88816912f3e0) The buggy address belongs to the page: page:000000009a1b4e7c refcount:1 mapcount:0 mapping:0000000000000000 index:0x0f flags: 0x17ffffc0000200(slab|node=0|zone=2|lastcpupid=0x1fffff) raw: 0017ffffc0000200 ffffea00044f1100 dead000000000002 ffff88810004c780 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88816912f280: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f300: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc >ffff88816912f380: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff88816912f400: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f480: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ================================================================== Fixes: c0ea57608b69 ("blktrace: remove debugfs file dentries from struct blk_trace") Signed-off-by: Yu Kuai Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220228034354.4047385-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index af68a67179b4..21dea90eaa93 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -310,10 +310,20 @@ record_it: local_irq_restore(flags); } -static void blk_trace_free(struct blk_trace *bt) +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); - debugfs_remove(bt->dir); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. + */ + if (!bt->dir) { + debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); + debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + } else { + debugfs_remove(bt->dir); + } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -335,10 +345,10 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); put_probe_ref(); } @@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q) return -EINVAL; if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); + blk_trace_cleanup(q, bt); return 0; } @@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: if (ret) - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q) put_probe_ref(); synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); return 0; } @@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q, return 0; free_bt: - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } From 7b83299e5b9385943a857d59e15cba270df20d7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 23 Feb 2022 20:46:35 +0100 Subject: [PATCH 516/773] ARM: 9182/1: mmu: fix returns from early_param() and __setup() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit early_param() handlers should return 0 on success. __setup() handlers should return 1 on success, i.e., the parameter has been handled. A return of 0 would cause the "option=value" string to be added to init's environment strings, polluting it. ../arch/arm/mm/mmu.c: In function 'test_early_cachepolicy': ../arch/arm/mm/mmu.c:215:1: error: no return statement in function returning non-void [-Werror=return-type] ../arch/arm/mm/mmu.c: In function 'test_noalign_setup': ../arch/arm/mm/mmu.c:221:1: error: no return statement in function returning non-void [-Werror=return-type] Fixes: b849a60e0903 ("ARM: make cr_alignment read-only #ifndef CONFIG_CPU_CP15") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Cc: Uwe Kleine-König Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Signed-off-by: Russell King (Oracle) --- arch/arm/mm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 274e4f73fd33..5e2be37a198e 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -212,12 +212,14 @@ early_param("ecc", early_ecc); static int __init early_cachepolicy(char *p) { pr_warn("cachepolicy kernel parameter not supported without cp15\n"); + return 0; } early_param("cachepolicy", early_cachepolicy); static int __init noalign_setup(char *__unused) { pr_warn("noalign kernel parameter not supported without cp15\n"); + return 1; } __setup("noalign", noalign_setup); From fda2635466cd26ad237e1bc5d3f6a60f97ad09b6 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Wed, 16 Feb 2022 14:31:35 +0100 Subject: [PATCH 517/773] igc: igc_read_phy_reg_gpy: drop premature return igc_read_phy_reg_gpy checks the return value from igc_read_phy_reg_mdic and if it's not 0, returns immediately. By doing this, it leaves the HW semaphore in the acquired state. Drop this premature return statement, the function returns after releasing the semaphore immediately anyway. Fixes: 5586838fe9ce ("igc: Add code for PHY support") Signed-off-by: Corinna Vinschen Acked-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_phy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index 5cad31c3c7b0..df91d07ce82a 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -779,8 +779,6 @@ s32 igc_read_phy_reg_gpy(struct igc_hw *hw, u32 offset, u16 *data) if (ret_val) return ret_val; ret_val = igc_read_phy_reg_mdic(hw, offset, data); - if (ret_val) - return ret_val; hw->phy.ops.release(hw); } else { ret_val = igc_read_xmdio_reg(hw, (u16)offset, dev_addr, From c4208653a327a09da1e9e7b10299709b6d9b17bf Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Sun, 20 Feb 2022 09:29:15 +0200 Subject: [PATCH 518/773] igc: igc_write_phy_reg_gpy: drop premature return Similar to "igc_read_phy_reg_gpy: drop premature return" patch. igc_write_phy_reg_gpy checks the return value from igc_write_phy_reg_mdic and if it's not 0, returns immediately. By doing this, it leaves the HW semaphore in the acquired state. Drop this premature return statement, the function returns after releasing the semaphore immediately anyway. Fixes: 5586838fe9ce ("igc: Add code for PHY support") Suggested-by: Dima Ruinskiy Reported-by: Corinna Vinschen Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_phy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c index df91d07ce82a..40dbf4b43234 100644 --- a/drivers/net/ethernet/intel/igc/igc_phy.c +++ b/drivers/net/ethernet/intel/igc/igc_phy.c @@ -746,8 +746,6 @@ s32 igc_write_phy_reg_gpy(struct igc_hw *hw, u32 offset, u16 data) if (ret_val) return ret_val; ret_val = igc_write_phy_reg_mdic(hw, offset, data); - if (ret_val) - return ret_val; hw->phy.ops.release(hw); } else { ret_val = igc_write_xmdio_reg(hw, (u16)offset, dev_addr, From 244d00b5dd4755f8df892c86cab35fb2cfd4f14b Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 28 Feb 2022 11:23:15 -0600 Subject: [PATCH 519/773] x86/speculation: Use generic retpoline by default on AMD AMD retpoline may be susceptible to speculation. The speculation execution window for an incorrect indirect branch prediction using LFENCE/JMP sequence may potentially be large enough to allow exploitation using Spectre V2. By default, don't use retpoline,lfence on AMD. Instead, use the generic retpoline. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0a4267c63d3b..a36bfe2c2480 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -941,15 +941,6 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_NONE; } - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("LFENCE not serializing, switching to generic retpoline\n"); - return SPECTRE_V2_RETPOLINE; - } - return SPECTRE_V2_LFENCE; - } - return SPECTRE_V2_RETPOLINE; } From e9b6013a7ce31535b04b02ba99babefe8a8599fa Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 28 Feb 2022 11:23:16 -0600 Subject: [PATCH 520/773] x86/speculation: Update link to AMD speculation whitepaper Update the link to the "Software Techniques for Managing Speculation on AMD Processors" whitepaper. Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov --- Documentation/admin-guide/hw-vuln/spectre.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 79051f4dac45..9e9556826450 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -60,8 +60,8 @@ privileged data touched during the speculative execution. Spectre variant 1 attacks take advantage of speculative execution of conditional branches, while Spectre variant 2 attacks use speculative execution of indirect branches to leak privileged memory. -See :ref:`[1] ` :ref:`[5] ` :ref:`[7] ` -:ref:`[10] ` :ref:`[11] `. +See :ref:`[1] ` :ref:`[5] ` :ref:`[6] ` +:ref:`[7] ` :ref:`[10] ` :ref:`[11] `. Spectre variant 1 (Bounds Check Bypass) --------------------------------------- @@ -697,7 +697,7 @@ AMD white papers: .. _spec_ref6: -[6] `Software techniques for managing speculation on AMD processors `_. +[6] `Software techniques for managing speculation on AMD processors `_. ARM white papers: From 26d3474348293dc752c55fe6d41282199f73714c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 22 Feb 2022 14:18:43 -0800 Subject: [PATCH 521/773] drm/bridge: ti-sn65dsi86: Properly undo autosuspend The PM Runtime docs say: Drivers in ->remove() callback should undo the runtime PM changes done in ->probe(). Usually this means calling pm_runtime_disable(), pm_runtime_dont_use_autosuspend() etc. We weren't doing that for autosuspend. Let's do it. Fixes: 9bede63127c6 ("drm/bridge: ti-sn65dsi86: Use pm_runtime autosuspend") Signed-off-by: Douglas Anderson Reviewed-by: Linus Walleij Link: https://patchwork.freedesktop.org/patch/msgid/20220222141838.1.If784ba19e875e8ded4ec4931601ce6d255845245@changeid --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index dab8f76618f3..68d8415e6c28 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -1802,6 +1802,7 @@ static inline void ti_sn_gpio_unregister(void) {} static void ti_sn65dsi86_runtime_disable(void *data) { + pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } @@ -1861,11 +1862,11 @@ static int ti_sn65dsi86_probe(struct i2c_client *client, "failed to get reference clock\n"); pm_runtime_enable(dev); + pm_runtime_set_autosuspend_delay(pdata->dev, 500); + pm_runtime_use_autosuspend(pdata->dev); ret = devm_add_action_or_reset(dev, ti_sn65dsi86_runtime_disable, dev); if (ret) return ret; - pm_runtime_set_autosuspend_delay(pdata->dev, 500); - pm_runtime_use_autosuspend(pdata->dev); ti_sn65dsi86_debugfs_init(pdata); From cb1852783f790feae845006d062acb9e0a5d4304 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Mon, 24 Jan 2022 16:24:37 +0000 Subject: [PATCH 522/773] drm/arm: arm hdlcd select DRM_GEM_CMA_HELPER Without DRM_GEM_CMA_HELPER HDLCD won't build. This needs to be there too. Fixes: 09717af7d13d ("drm: Remove CONFIG_DRM_KMS_CMA_HELPER option") Reviewed-by: Steven Price Signed-off-by: Carsten Haitzler Acked-by: Liviu Dudau Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220124162437.2470344-1-carsten.haitzler@foss.arm.com --- drivers/gpu/drm/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/arm/Kconfig b/drivers/gpu/drm/arm/Kconfig index 58a242871b28..6e3f1d600541 100644 --- a/drivers/gpu/drm/arm/Kconfig +++ b/drivers/gpu/drm/arm/Kconfig @@ -6,6 +6,7 @@ config DRM_HDLCD depends on DRM && OF && (ARM || ARM64 || COMPILE_TEST) depends on COMMON_CLK select DRM_KMS_HELPER + select DRM_GEM_CMA_HELPER help Choose this option if you have an ARM High Definition Colour LCD controller. From 56763f12b0f02706576a088e85ef856deacc98a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 27 Feb 2022 10:01:41 -0800 Subject: [PATCH 523/773] netfilter: fix use-after-free in __nf_register_net_hook() We must not dereference @new_hooks after nf_hook_mutex has been released, because other threads might have freed our allocated hooks already. BUG: KASAN: use-after-free in nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] BUG: KASAN: use-after-free in hooks_validate net/netfilter/core.c:171 [inline] BUG: KASAN: use-after-free in __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 Read of size 2 at addr ffff88801c1a8000 by task syz-executor237/4430 CPU: 1 PID: 4430 Comm: syz-executor237 Not tainted 5.17.0-rc5-syzkaller-00306-g2293be58d6a1 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 nf_hook_entries_get_hook_ops include/linux/netfilter.h:130 [inline] hooks_validate net/netfilter/core.c:171 [inline] __nf_register_net_hook+0x77a/0x820 net/netfilter/core.c:438 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 ipv6_setsockopt+0x122/0x180 net/ipv6/ipv6_sockglue.c:1024 rawv6_setsockopt+0xd3/0x6a0 net/ipv6/raw.c:1084 __sys_setsockopt+0x2db/0x610 net/socket.c:2180 __do_sys_setsockopt net/socket.c:2191 [inline] __se_sys_setsockopt net/socket.c:2188 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2188 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f65a1ace7d9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 71 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f65a1a7f308 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f65a1ace7d9 RDX: 0000000000000040 RSI: 0000000000000029 RDI: 0000000000000003 RBP: 00007f65a1b574c8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 00007f65a1b55130 R13: 00007f65a1b574c0 R14: 00007f65a1b24090 R15: 0000000000022000 The buggy address belongs to the page: page:ffffea0000706a00 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1c1a8 flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000000 ffffea0001c1b108 ffffea000046dd08 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as freed page last allocated via order 2, migratetype Unmovable, gfp_mask 0x52dc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_ZERO), pid 4430, ts 1061781545818, free_ts 1061791488993 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 __alloc_pages_node include/linux/gfp.h:572 [inline] alloc_pages_node include/linux/gfp.h:595 [inline] kmalloc_large_node+0x62/0x130 mm/slub.c:4438 __kmalloc_node+0x35a/0x4a0 mm/slub.c:4454 kmalloc_node include/linux/slab.h:604 [inline] kvmalloc_node+0x97/0x100 mm/util.c:580 kvmalloc include/linux/slab.h:731 [inline] kvzalloc include/linux/slab.h:739 [inline] allocate_hook_entries_size net/netfilter/core.c:61 [inline] nf_hook_entries_grow+0x140/0x780 net/netfilter/core.c:128 __nf_register_net_hook+0x144/0x820 net/netfilter/core.c:429 nf_register_net_hook+0x114/0x170 net/netfilter/core.c:571 nf_register_net_hooks+0x59/0xc0 net/netfilter/core.c:587 nf_synproxy_ipv6_init+0x85/0xe0 net/netfilter/nf_synproxy_core.c:1218 synproxy_tg6_check+0x30d/0x560 net/ipv6/netfilter/ip6t_SYNPROXY.c:81 xt_check_target+0x26c/0x9e0 net/netfilter/x_tables.c:1038 check_target net/ipv6/netfilter/ip6_tables.c:530 [inline] find_check_entry.constprop.0+0x7f1/0x9e0 net/ipv6/netfilter/ip6_tables.c:573 translate_table+0xc8b/0x1750 net/ipv6/netfilter/ip6_tables.c:735 do_replace net/ipv6/netfilter/ip6_tables.c:1153 [inline] do_ip6t_set_ctl+0x56e/0xb90 net/ipv6/netfilter/ip6_tables.c:1639 nf_setsockopt+0x83/0xe0 net/netfilter/nf_sockopt.c:101 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 kvfree+0x42/0x50 mm/util.c:613 rcu_do_batch kernel/rcu/tree.c:2527 [inline] rcu_core+0x7b1/0x1820 kernel/rcu/tree.c:2778 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Memory state around the buggy address: ffff88801c1a7f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a7f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88801c1a8000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff88801c1a8080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88801c1a8100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 2420b79f8c18 ("netfilter: debug: check for sorted array") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 354cb472f386..8a77a3fd69bc 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -428,14 +428,15 @@ static int __nf_register_net_hook(struct net *net, int pf, p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); - if (!IS_ERR(new_hooks)) + if (!IS_ERR(new_hooks)) { + hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); + } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); - hooks_validate(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); From 17a8f31bba7bac8cce4bd12bab50697da96e7710 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 04:18:05 +0100 Subject: [PATCH 524/773] netfilter: egress: silence egress hook lockdep splats Netfilter assumes its called with rcu_read_lock held, but in egress hook case it may be called with BH readlock. This triggers lockdep splat. In order to avoid to change all rcu_dereference() to rcu_dereference_check(..., rcu_read_lock_bh_held()), wrap nf_hook_slow with read lock/unlock pair. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index b4dd96e4dc8d..e6487a691136 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -101,7 +101,11 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, dev, NULL, NULL, dev_net(dev), NULL); + + /* nf assumes rcu_read_lock, not just read_lock_bh */ + rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); + rcu_read_unlock(); if (ret == 1) { return skb; From 1866aa0d0d6492bc2f8d22d0df49abaccf50cddd Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Tue, 25 Jan 2022 19:31:23 +0200 Subject: [PATCH 525/773] e1000e: Fix possible HW unit hang after an s0ix exit Disable the OEM bit/Gig Disable/restart AN impact and disable the PHY LAN connected device (LCD) reset during power management flows. This fixes possible HW unit hangs on the s0ix exit on some corporate ADL platforms. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214821 Fixes: 3e55d231716e ("e1000e: Add handshake with the CSME to support S0ix") Suggested-by: Dima Ruinskiy Suggested-by: Nir Efrati Signed-off-by: Sasha Neftin Tested-by: Kai-Heng Feng Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 1 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++++ drivers/net/ethernet/intel/e1000e/ich8lan.h | 1 + drivers/net/ethernet/intel/e1000e/netdev.c | 26 +++++++++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index bcf680e83811..13382df2f2ef 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -630,6 +630,7 @@ struct e1000_phy_info { bool disable_polarity_correction; bool is_mdix; bool polarity_correction; + bool reset_disable; bool speed_downgraded; bool autoneg_wait_to_complete; }; diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index c908c84b86d2..e298da712758 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -2050,6 +2050,10 @@ static s32 e1000_check_reset_block_ich8lan(struct e1000_hw *hw) bool blocked = false; int i = 0; + /* Check the PHY (LCD) reset flag */ + if (hw->phy.reset_disable) + return true; + while ((blocked = !(er32(FWSM) & E1000_ICH_FWSM_RSPCIPHY)) && (i++ < 30)) usleep_range(10000, 11000); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index 2504b11c3169..638a3ddd7ada 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -271,6 +271,7 @@ #define I217_CGFREG_ENABLE_MTA_RESET 0x0002 #define I217_MEMPWR PHY_REG(772, 26) #define I217_MEMPWR_DISABLE_SMB_RELEASE 0x0010 +#define I217_MEMPWR_MOEM 0x1000 /* Receive Address Initial CRC Calculation */ #define E1000_PCH_RAICC(_n) (0x05F50 + ((_n) * 4)) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index a42aeb555f34..c5bdef3ffe26 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6987,8 +6987,21 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); + struct e1000_hw *hw = &adapter->hw; + u16 phy_data; int rc; + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { + /* Mask OEM Bits / Gig Disable / Restart AN (772_26[12] = 1) */ + e1e_rphy(hw, I217_MEMPWR, &phy_data); + phy_data |= I217_MEMPWR_MOEM; + e1e_wphy(hw, I217_MEMPWR, phy_data); + + /* Disable LCD reset */ + hw->phy.reset_disable = true; + } + e1000e_flush_lpic(pdev); e1000e_pm_freeze(dev); @@ -7010,6 +7023,8 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); + struct e1000_hw *hw = &adapter->hw; + u16 phy_data; int rc; /* Introduce S0ix implementation */ @@ -7020,6 +7035,17 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) if (rc) return rc; + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID && + hw->mac.type >= e1000_pch_adp) { + /* Unmask OEM Bits / Gig Disable / Restart AN 772_26[12] = 0 */ + e1e_rphy(hw, I217_MEMPWR, &phy_data); + phy_data &= ~I217_MEMPWR_MOEM; + e1e_wphy(hw, I217_MEMPWR, phy_data); + + /* Enable LCD reset */ + hw->phy.reset_disable = false; + } + return e1000e_pm_thaw(dev); } From ffd24fa2fcc76ecb2e61e7a4ef8588177bcb42a6 Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 3 Feb 2022 14:21:49 +0200 Subject: [PATCH 526/773] e1000e: Correct NVM checksum verification flow Update MAC type check e1000_pch_tgp because for e1000_pch_cnp, NVM checksum update is still possible. Emit a more detailed warning message. Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=1191663 Fixes: 4051f68318ca ("e1000e: Do not take care about recovery NVM checksum") Reported-by: Thomas Bogendoerfer Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index e298da712758..d60e2016d03c 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -4140,9 +4140,9 @@ static s32 e1000_validate_nvm_checksum_ich8lan(struct e1000_hw *hw) return ret_val; if (!(data & valid_csum_mask)) { - e_dbg("NVM Checksum Invalid\n"); + e_dbg("NVM Checksum valid bit not set\n"); - if (hw->mac.type < e1000_pch_cnp) { + if (hw->mac.type < e1000_pch_tgp) { data |= valid_csum_mask; ret_val = e1000_write_nvm(hw, word, 1, &data); if (ret_val) From 7795686d573de0438bba6b2b344e6b203223c889 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 26 Jan 2022 12:02:04 +0100 Subject: [PATCH 527/773] pinctrl-sunxi: sunxi_pinctrl_gpio_direction_in/output: use correct offset The commit that sets the direction directly without calling pinctrl_gpio_direction(), forgot to add chip->base to the offset when calling sunxi_pmx_gpio_set_direction(). This caused failures for various Allwinner boards which have two GPIO blocks. Signed-off-by: Hans Verkuil Reported-by: 5kft <5kft@5kft.org> Suggested-by: 5kft <5kft@5kft.org> Reported-by: Corentin Labbe Fixes: 8df89a7cbc63 (pinctrl-sunxi: don't call pinctrl_gpio_direction()) Tested-by: Corentin Labbe Tested-by: Jernej Skrabec Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/0f536cd8-01db-5d16-2cec-ec6d19409a49@xs4all.nl Signed-off-by: Guenter Roeck [Picked from linux-next to pinctrl fixes] Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c index 80d6750c74a6..061323eab8b1 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c @@ -837,7 +837,8 @@ static int sunxi_pinctrl_gpio_direction_input(struct gpio_chip *chip, { struct sunxi_pinctrl *pctl = gpiochip_get_data(chip); - return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, offset, true); + return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, + chip->base + offset, true); } static int sunxi_pinctrl_gpio_get(struct gpio_chip *chip, unsigned offset) @@ -890,7 +891,8 @@ static int sunxi_pinctrl_gpio_direction_output(struct gpio_chip *chip, struct sunxi_pinctrl *pctl = gpiochip_get_data(chip); sunxi_pinctrl_gpio_set(chip, offset, value); - return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, offset, false); + return sunxi_pmx_gpio_set_direction(pctl->pctl_dev, NULL, + chip->base + offset, false); } static int sunxi_pinctrl_gpio_of_xlate(struct gpio_chip *gc, From bac129dbc6560dfeb634c03f0c08b78024e71915 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 15 Feb 2022 22:00:36 -0600 Subject: [PATCH 528/773] pinctrl: sunxi: Use unique lockdep classes for IRQs This driver, like several others, uses a chained IRQ for each GPIO bank, and forwards .irq_set_wake to the GPIO bank's upstream IRQ. As a result, a call to irq_set_irq_wake() needs to lock both the upstream and downstream irq_desc's. Lockdep considers this to be a possible deadlock when the irq_desc's share lockdep classes, which they do by default: ============================================ WARNING: possible recursive locking detected 5.17.0-rc3-00394-gc849047c2473 #1 Not tainted -------------------------------------------- init/307 is trying to acquire lock: c2dfe27c (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0x58/0xa0 but task is already holding lock: c3c0ac7c (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0x58/0xa0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&irq_desc_lock_class); lock(&irq_desc_lock_class); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by init/307: #0: c1f29f18 (system_transition_mutex){+.+.}-{3:3}, at: __do_sys_reboot+0x90/0x23c #1: c20f7760 (&dev->mutex){....}-{3:3}, at: device_shutdown+0xf4/0x224 #2: c2e804d8 (&dev->mutex){....}-{3:3}, at: device_shutdown+0x104/0x224 #3: c3c0ac7c (&irq_desc_lock_class){-.-.}-{2:2}, at: __irq_get_desc_lock+0x58/0xa0 stack backtrace: CPU: 0 PID: 307 Comm: init Not tainted 5.17.0-rc3-00394-gc849047c2473 #1 Hardware name: Allwinner sun8i Family unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x90 dump_stack_lvl from __lock_acquire+0x1680/0x31a0 __lock_acquire from lock_acquire+0x148/0x3dc lock_acquire from _raw_spin_lock_irqsave+0x50/0x6c _raw_spin_lock_irqsave from __irq_get_desc_lock+0x58/0xa0 __irq_get_desc_lock from irq_set_irq_wake+0x2c/0x19c irq_set_irq_wake from irq_set_irq_wake+0x13c/0x19c [tail call from sunxi_pinctrl_irq_set_wake] irq_set_irq_wake from gpio_keys_suspend+0x80/0x1a4 gpio_keys_suspend from gpio_keys_shutdown+0x10/0x2c gpio_keys_shutdown from device_shutdown+0x180/0x224 device_shutdown from __do_sys_reboot+0x134/0x23c __do_sys_reboot from ret_fast_syscall+0x0/0x1c However, this can never deadlock because the upstream and downstream IRQs are never the same (nor do they even involve the same irqchip). Silence this erroneous lockdep splat by applying what appears to be the usual fix of moving the GPIO IRQs to separate lockdep classes. Fixes: a59c99d9eaf9 ("pinctrl: sunxi: Forward calls to irq_set_irq_wake") Reported-by: Guenter Roeck Signed-off-by: Samuel Holland Reviewed-by: Jernej Skrabec Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20220216040037.22730-1-samuel@sholland.org Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c index 061323eab8b1..1f401377ff60 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c @@ -36,6 +36,13 @@ #include "../core.h" #include "pinctrl-sunxi.h" +/* + * These lock classes tell lockdep that GPIO IRQs are in a different + * category than their parents, so it won't report false recursion. + */ +static struct lock_class_key sunxi_pinctrl_irq_lock_class; +static struct lock_class_key sunxi_pinctrl_irq_request_class; + static struct irq_chip sunxi_pinctrl_edge_irq_chip; static struct irq_chip sunxi_pinctrl_level_irq_chip; @@ -1557,6 +1564,8 @@ int sunxi_pinctrl_init_with_variant(struct platform_device *pdev, for (i = 0; i < (pctl->desc->irq_banks * IRQ_PER_BANK); i++) { int irqno = irq_create_mapping(pctl->domain, i); + irq_set_lockdep_class(irqno, &sunxi_pinctrl_irq_lock_class, + &sunxi_pinctrl_irq_request_class); irq_set_chip_and_handler(irqno, &sunxi_pinctrl_edge_irq_chip, handle_edge_irq); irq_set_chip_data(irqno, pctl); From d176708ffc20332d1c730098d2b111e0b77ece82 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 22:52:31 -0800 Subject: [PATCH 529/773] Input: goodix - use the new soc_intel_is_byt() helper Use the new soc_intel_is_byt() helper from linux/platform_data/x86/soc.h. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220131143539.109142-5-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index a3bfc7a41679..3647147fd818 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -805,21 +806,6 @@ static int goodix_reset(struct goodix_ts_data *ts) } #ifdef ACPI_GPIO_SUPPORT -#include -#include - -static const struct x86_cpu_id baytrail_cpu_ids[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, X86_FEATURE_ANY, }, - {} -}; - -static inline bool is_byt(void) -{ - const struct x86_cpu_id *id = x86_match_cpu(baytrail_cpu_ids); - - return !!id; -} - static const struct acpi_gpio_params first_gpio = { 0, 0, false }; static const struct acpi_gpio_params second_gpio = { 1, 0, false }; @@ -903,7 +889,7 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) dev_info(dev, "Using ACPI INTI and INTO methods for IRQ pin access\n"); ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_METHOD; gpio_mapping = acpi_goodix_reset_only_gpios; - } else if (is_byt() && ts->gpio_count == 2 && ts->gpio_int_idx == -1) { + } else if (soc_intel_is_byt() && ts->gpio_count == 2 && ts->gpio_int_idx == -1) { dev_info(dev, "No ACPI GpioInt resource, assuming that the GPIO order is reset, int\n"); ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO; gpio_mapping = acpi_goodix_int_last_gpios; From d982992669733dd75520000c6057d8ee0725a363 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 22:53:12 -0800 Subject: [PATCH 530/773] Input: goodix - workaround Cherry Trail devices with a bogus ACPI Interrupt() resource ACPI/x86 devices with a Cherry Trail SoC should have a GpioInt + a regular GPIO ACPI resource in their ACPI tables. Some CHT devices have a bug, where the also is bogus interrupt resource (likely copied from a previous Bay Trail based generation of the device). The i2c-core-acpi code will assign the bogus, non-working, interrupt resource to client->irq. Add a workaround to fix this up. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=2043960 Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220228111613.363336-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index 3647147fd818..752e8ba4fecb 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -864,7 +864,7 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) const struct acpi_gpio_mapping *gpio_mapping = NULL; struct device *dev = &ts->client->dev; LIST_HEAD(resources); - int ret; + int irq, ret; ts->gpio_count = 0; ts->gpio_int_idx = -1; @@ -877,6 +877,20 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) acpi_dev_free_resource_list(&resources); + /* + * CHT devices should have a GpioInt + a regular GPIO ACPI resource. + * Some CHT devices have a bug (where the also is bogus Interrupt + * resource copied from a previous BYT based generation). i2c-core-acpi + * will use the non-working Interrupt resource, fix this up. + */ + if (soc_intel_is_cht() && ts->gpio_count == 2 && ts->gpio_int_idx != -1) { + irq = acpi_dev_gpio_irq_get(ACPI_COMPANION(dev), 0); + if (irq > 0 && irq != ts->client->irq) { + dev_warn(dev, "Overriding IRQ %d -> %d\n", ts->client->irq, irq); + ts->client->irq = irq; + } + } + if (ts->gpio_count == 2 && ts->gpio_int_idx == 0) { ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO; gpio_mapping = acpi_goodix_int_first_gpios; From c432cd598a185afefba1ac3b0ee226f222f71341 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Fri, 28 Jan 2022 15:20:56 +0100 Subject: [PATCH 531/773] soc: mediatek: mt8192-mmsys: Fix dither to dsi0 path's input sel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit d687e056a18f ("soc: mediatek: mmsys: Add mt8192 mmsys routing table"), the mmsys routing table for mt8192 was introduced but the input selector for DITHER->DSI0 has no value assigned to it. This means that we are clearing bit 0 instead of setting it, blocking communication between these two blocks; due to that, any display that is connected to DSI0 will not work, as no data will go through. The effect of that issue is that, during bootup, the DRM will block for some time, while atomically waiting for a vblank that never happens; later, the situation doesn't get better, leaving the display in a non-functional state. To fix this issue, fix the route entry in the table by assigning the dither input selector to MT8192_DISP_DSI0_SEL_IN. Fixes: d687e056a18f ("soc: mediatek: mmsys: Add mt8192 mmsys routing table") Signed-off-by: AngeloGioacchino Del Regno Tested-by: Alyssa Rosenzweig Reviewed-by: Nícolas F. R. A. Prado Link: https://lore.kernel.org/r/20220128142056.359900-1-angelogioacchino.delregno@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mt8192-mmsys.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/mediatek/mt8192-mmsys.h b/drivers/soc/mediatek/mt8192-mmsys.h index 6f0a57044a7b..6aae0b12b6ff 100644 --- a/drivers/soc/mediatek/mt8192-mmsys.h +++ b/drivers/soc/mediatek/mt8192-mmsys.h @@ -53,7 +53,8 @@ static const struct mtk_mmsys_routes mmsys_mt8192_routing_table[] = { MT8192_AAL0_SEL_IN_CCORR0 }, { DDP_COMPONENT_DITHER, DDP_COMPONENT_DSI0, - MT8192_DISP_DSI0_SEL_IN, MT8192_DSI0_SEL_IN_DITHER0 + MT8192_DISP_DSI0_SEL_IN, MT8192_DSI0_SEL_IN_DITHER0, + MT8192_DSI0_SEL_IN_DITHER0 }, { DDP_COMPONENT_RDMA0, DDP_COMPONENT_COLOR0, MT8192_DISP_RDMA0_SOUT_SEL, MT8192_RDMA0_SOUT_COLOR0, From 5d8965704fe5662e2e4a7e4424a2cbe53e182670 Mon Sep 17 00:00:00 2001 From: Ilya Lipnitskiy Date: Mon, 28 Feb 2022 17:15:07 -0800 Subject: [PATCH 532/773] MIPS: ralink: mt7621: use bitwise NOT instead of logical It was the intention to reverse the bits, not make them all zero by using logical NOT operator. Fixes: cc19db8b312a ("MIPS: ralink: mt7621: do memory detection on KSEG1") Suggested-by: Chuanhong Guo Signed-off-by: Ilya Lipnitskiy Reviewed-by: Sergio Paracuellos Signed-off-by: Thomas Bogendoerfer --- arch/mips/ralink/mt7621.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index 12c8808e0dea..fb0565bc34fd 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -69,7 +69,7 @@ static bool __init mt7621_addr_wraparound_test(phys_addr_t size) __raw_writel(MT7621_MEM_TEST_PATTERN, dm); if (__raw_readl(dm) != __raw_readl(dm + size)) return false; - __raw_writel(!MT7621_MEM_TEST_PATTERN, dm); + __raw_writel(~MT7621_MEM_TEST_PATTERN, dm); return __raw_readl(dm) == __raw_readl(dm + size); } From 50bb467c9e76743fbc8441d29113cdad62dbc4fe Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 18 Feb 2022 09:38:58 +0000 Subject: [PATCH 533/773] rfkill: define rfill_soft_blocked() if !RFKILL If CONFIG_RFKILL is not set, the Intel WiFi driver will not build the iw_mvm driver part due to the missing rfill_soft_blocked() call. Adding a inline declaration of rfill_soft_blocked() if CONFIG_RFKILL=n fixes the following error: drivers/net/wireless/intel/iwlwifi/mvm/mvm.h: In function 'iwl_mvm_mei_set_sw_rfkill_state': drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:2215:38: error: implicit declaration of function 'rfkill_soft_blocked'; did you mean 'rfkill_blocked'? [-Werror=implicit-function-declaration] 2215 | mvm->hw_registered ? rfkill_soft_blocked(mvm->hw->wiphy->rfkill) : false; | ^~~~~~~~~~~~~~~~~~~ | rfkill_blocked Signed-off-by: Ben Dooks Reported-by: Neill Whillans Fixes: 5bc9a9dd7535 ("rfkill: allow to get the software rfkill state") Link: https://lore.kernel.org/r/20220218093858.1245677-1-ben.dooks@codethink.co.uk Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index c35f3962dc4f..373003ace639 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -308,6 +308,11 @@ static inline bool rfkill_blocked(struct rfkill *rfkill) return false; } +static inline bool rfkill_soft_blocked(struct rfkill *rfkill) +{ + return false; +} + static inline enum rfkill_type rfkill_find_type(const char *name) { return RFKILL_TYPE_ALL; From 1db5fcbba2631277b78d7f8aff99c9607d29f6d8 Mon Sep 17 00:00:00 2001 From: Golan Ben Ami Date: Tue, 1 Mar 2022 09:29:26 +0200 Subject: [PATCH 534/773] iwlwifi: don't advertise TWT support Some APs misbehave when TWT is used and cause our firmware to crash. We don't know a reasonable way to detect and work around this problem in the FW yet. To prevent these crashes, disable TWT in the driver by stopping to advertise TWT support. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215523 Signed-off-by: Golan Ben Ami [reworded the commit message] Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20220301072926.153969-1-luca@coelho.fi Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 3 +-- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index dd58c8f9aa11..04addf964d83 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -553,8 +553,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = { .has_he = true, .he_cap_elem = { .mac_cap_info[0] = - IEEE80211_HE_MAC_CAP0_HTC_HE | - IEEE80211_HE_MAC_CAP0_TWT_REQ, + IEEE80211_HE_MAC_CAP0_HTC_HE, .mac_cap_info[1] = IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 4ac599f6ad22..709a3df57b10 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -226,7 +226,6 @@ static const u8 he_if_types_ext_capa_sta[] = { [0] = WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING, [2] = WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT, [7] = WLAN_EXT_CAPA8_OPMODE_NOTIF, - [9] = WLAN_EXT_CAPA10_TWT_REQUESTER_SUPPORT, }; static const struct wiphy_iftype_ext_capab he_iftypes_ext_capa[] = { From 5a6248c0a22352f09ea041665d3bd3e18f6f872c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 22 Feb 2022 19:06:30 -0800 Subject: [PATCH 535/773] iwlwifi: mvm: check debugfs_dir ptr before use When "debugfs=off" is used on the kernel command line, iwiwifi's mvm module uses an invalid/unchecked debugfs_dir pointer and causes a BUG: BUG: kernel NULL pointer dereference, address: 000000000000004f #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 1 PID: 503 Comm: modprobe Tainted: G W 5.17.0-rc5 #7 Hardware name: Dell Inc. Inspiron 15 5510/076F7Y, BIOS 2.4.1 11/05/2021 RIP: 0010:iwl_mvm_dbgfs_register+0x692/0x700 [iwlmvm] Code: 69 a0 be 80 01 00 00 48 c7 c7 50 73 6a a0 e8 95 cf ee e0 48 8b 83 b0 1e 00 00 48 c7 c2 54 73 6a a0 be 64 00 00 00 48 8d 7d 8c <48> 8b 48 50 e8 15 22 07 e1 48 8b 43 28 48 8d 55 8c 48 c7 c7 5f 73 RSP: 0018:ffffc90000a0ba68 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffff88817d6e3328 RCX: ffff88817d6e3328 RDX: ffffffffa06a7354 RSI: 0000000000000064 RDI: ffffc90000a0ba6c RBP: ffffc90000a0bae0 R08: ffffffff824e4880 R09: ffffffffa069d620 R10: ffffc90000a0ba00 R11: ffffffffffffffff R12: 0000000000000000 R13: ffffc90000a0bb28 R14: ffff88817d6e3328 R15: ffff88817d6e3320 FS: 00007f64dd92d740(0000) GS:ffff88847f640000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000004f CR3: 000000016fc79001 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: ? iwl_mvm_mac_setup_register+0xbdc/0xda0 [iwlmvm] iwl_mvm_start_post_nvm+0x71/0x100 [iwlmvm] iwl_op_mode_mvm_start+0xab8/0xb30 [iwlmvm] _iwl_op_mode_start+0x6f/0xd0 [iwlwifi] iwl_opmode_register+0x6a/0xe0 [iwlwifi] ? 0xffffffffa0231000 iwl_mvm_init+0x35/0x1000 [iwlmvm] ? 0xffffffffa0231000 do_one_initcall+0x5a/0x1b0 ? kmem_cache_alloc+0x1e5/0x2f0 ? do_init_module+0x1e/0x220 do_init_module+0x48/0x220 load_module+0x2602/0x2bc0 ? __kernel_read+0x145/0x2e0 ? kernel_read_file+0x229/0x290 __do_sys_finit_module+0xc5/0x130 ? __do_sys_finit_module+0xc5/0x130 __x64_sys_finit_module+0x13/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f64dda564dd Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 1b 29 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffdba393f88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f64dda564dd RDX: 0000000000000000 RSI: 00005575399e2ab2 RDI: 0000000000000001 RBP: 000055753a91c5e0 R08: 0000000000000000 R09: 0000000000000002 R10: 0000000000000001 R11: 0000000000000246 R12: 00005575399e2ab2 R13: 000055753a91ceb0 R14: 0000000000000000 R15: 000055753a923018 Modules linked in: btintel(+) btmtk bluetooth vfat snd_hda_codec_hdmi fat snd_hda_codec_realtek snd_hda_codec_generic iwlmvm(+) snd_sof_pci_intel_tgl mac80211 snd_sof_intel_hda_common soundwire_intel soundwire_generic_allocation soundwire_cadence soundwire_bus snd_sof_intel_hda snd_sof_pci snd_sof snd_sof_xtensa_dsp snd_soc_hdac_hda snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core btrfs snd_compress snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec raid6_pq iwlwifi snd_hda_core snd_pcm snd_timer snd soundcore cfg80211 intel_ish_ipc(+) thunderbolt rfkill intel_ishtp ucsi_acpi wmi i2c_hid_acpi i2c_hid evdev CR2: 000000000000004f ---[ end trace 0000000000000000 ]--- Check the debugfs_dir pointer for an error before using it. Fixes: 8c082a99edb9 ("iwlwifi: mvm: simplify iwl_mvm_dbgfs_register") Signed-off-by: Randy Dunlap Cc: Luca Coelho Cc: linux-wireless@vger.kernel.org Cc: Kalle Valo Cc: Greg Kroah-Hartman Cc: Emmanuel Grumbach Cc: stable Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220223030630.23241-1-rdunlap@infradead.org [change to make both conditional] Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c index 63432c24eb59..445c94adb076 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c @@ -5,6 +5,7 @@ * Copyright (C) 2016-2017 Intel Deutschland GmbH */ #include +#include #include #include @@ -1857,7 +1858,6 @@ void iwl_mvm_sta_add_debugfs(struct ieee80211_hw *hw, void iwl_mvm_dbgfs_register(struct iwl_mvm *mvm) { struct dentry *bcast_dir __maybe_unused; - char buf[100]; spin_lock_init(&mvm->drv_stats_lock); @@ -1939,6 +1939,11 @@ void iwl_mvm_dbgfs_register(struct iwl_mvm *mvm) * Create a symlink with mac80211. It will be removed when mac80211 * exists (before the opmode exists which removes the target.) */ - snprintf(buf, 100, "../../%pd2", mvm->debugfs_dir->d_parent); - debugfs_create_symlink("iwlwifi", mvm->hw->wiphy->debugfsdir, buf); + if (!IS_ERR(mvm->debugfs_dir)) { + char buf[100]; + + snprintf(buf, 100, "../../%pd2", mvm->debugfs_dir->d_parent); + debugfs_create_symlink("iwlwifi", mvm->hw->wiphy->debugfsdir, + buf); + } } From 6ad27f522cb3b210476daf63ce6ddb6568c0508b Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 1 Mar 2022 18:00:20 +0800 Subject: [PATCH 536/773] nl80211: Handle nla_memdup failures in handle_nan_filter As there's potential for failure of the nla_memdup(), check the return value. Fixes: a442b761b24b ("cfg80211: add add_nan_func / del_nan_func") Signed-off-by: Jiasheng Jiang Link: https://lore.kernel.org/r/20220301100020.3801187-1-jiasheng@iscas.ac.cn Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 578bff9c378b..b1909ce2b739 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13411,6 +13411,9 @@ static int handle_nan_filter(struct nlattr *attr_filter, i = 0; nla_for_each_nested(attr, attr_filter, rem) { filter[i].filter = nla_memdup(attr, GFP_KERNEL); + if (!filter[i].filter) + goto err; + filter[i].len = nla_len(attr); i++; } @@ -13423,6 +13426,15 @@ static int handle_nan_filter(struct nlattr *attr_filter, } return 0; + +err: + i = 0; + nla_for_each_nested(attr, attr_filter, rem) { + kfree(filter[i].filter); + i++; + } + kfree(filter); + return -ENOMEM; } static int nl80211_nan_add_func(struct sk_buff *skb, From 94d9864cc86f572f881db9b842a78e9d075493ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Feb 2022 10:39:34 +0100 Subject: [PATCH 537/773] mac80211: treat some SAE auth steps as final When we get anti-clogging token required (added by the commit mentioned below), or the other status codes added by the later commit 4e56cde15f7d ("mac80211: Handle special status codes in SAE commit") we currently just pretend (towards the internal state machine of authentication) that we didn't receive anything. This has the undesirable consequence of retransmitting the prior frame, which is not expected, because the timer is still armed. If we just disarm the timer at that point, it would result in the undesirable side effect of being in this state indefinitely if userspace crashes, or so. So to fix this, reset the timer and set a new auth_data->waiting in order to have no more retransmissions, but to have the data destroyed when the timer actually fires, which will only happen if userspace didn't continue (i.e. crashed or abandoned it.) Fixes: a4055e74a2ff ("mac80211: Don't destroy auth data in case of anti-clogging") Reported-by: Jouni Malinen Link: https://lore.kernel.org/r/20220224103932.75964e1d7932.Ia487f91556f29daae734bf61f8181404642e1eec@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 330ea62231fa..e87bccaab561 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -376,7 +376,7 @@ struct ieee80211_mgd_auth_data { u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; - bool done; + bool done, waiting; bool peer_confirmed; bool timeout_started; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e5ccf17618ab..744842c4513b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -37,6 +37,7 @@ #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) +#define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) @@ -3011,8 +3012,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || - status_code == WLAN_STATUS_SAE_PK)))) + status_code == WLAN_STATUS_SAE_PK)))) { + /* waiting for userspace now */ + ifmgd->auth_data->waiting = true; + ifmgd->auth_data->timeout = + jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; + ifmgd->auth_data->timeout_started = true; + run_again(sdata, ifmgd->auth_data->timeout); goto notify_driver; + } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); @@ -4603,10 +4611,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { - if (ifmgd->auth_data->done) { + if (ifmgd->auth_data->done || ifmgd->auth_data->waiting) { /* - * ok ... we waited for assoc but userspace didn't, - * so let's just kill the auth data + * ok ... we waited for assoc or continuation but + * userspace didn't do it, so kill the auth data */ ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_auth(sdata)) { From 747670fd9a2d1b7774030dba65ca022ba442ce71 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 Feb 2022 14:02:41 +0100 Subject: [PATCH 538/773] netfilter: nf_queue: don't assume sk is full socket There is no guarantee that state->sk refers to a full socket. If refcount transitions to 0, sock_put calls sk_free which then ends up with garbage fields. I'd like to thank Oleksandr Natalenko and Jiri Benc for considerable debug work and pointing out state->sk oddities. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Tested-by: Oleksandr Natalenko Signed-off-by: Florian Westphal --- net/netfilter/nf_queue.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 6d12afabfe8a..5ab0680db445 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,15 @@ void nf_unregister_queue_handler(void) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_sock_put(struct sock *sk) +{ +#ifdef CONFIG_INET + sock_gen_put(sk); +#else + sock_put(sk); +#endif +} + static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -54,7 +63,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(state->in); dev_put(state->out); if (state->sk) - sock_put(state->sk); + nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_put(entry->physin); From 2e78855d311c401083df9776aa450d32d716e83e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 Feb 2022 12:01:23 +0100 Subject: [PATCH 539/773] selftests: netfilter: add nfqueue TCP_NEW_SYN_RECV socket race test causes: BUG: KASAN: slab-out-of-bounds in sk_free+0x25/0x80 Write of size 4 at addr ffff888106df0284 by task nf-queue/1459 sk_free+0x25/0x80 nf_queue_entry_release_refs+0x143/0x1a0 nf_reinject+0x233/0x770 ... without 'netfilter: nf_queue: don't assume sk is full socket'. Signed-off-by: Florian Westphal --- tools/testing/selftests/netfilter/.gitignore | 1 + tools/testing/selftests/netfilter/Makefile | 2 +- .../selftests/netfilter/connect_close.c | 136 ++++++++++++++++++ .../testing/selftests/netfilter/nft_queue.sh | 19 +++ 4 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/netfilter/connect_close.c diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/netfilter/.gitignore index 8448f74adfec..4cb887b57413 100644 --- a/tools/testing/selftests/netfilter/.gitignore +++ b/tools/testing/selftests/netfilter/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only nf-queue +connect_close diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index e4f845dd942b..7e81c9a7fff9 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -9,6 +9,6 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ conntrack_vrf.sh nft_synproxy.sh LDLIBS = -lmnl -TEST_GEN_FILES = nf-queue +TEST_GEN_FILES = nf-queue connect_close include ../lib.mk diff --git a/tools/testing/selftests/netfilter/connect_close.c b/tools/testing/selftests/netfilter/connect_close.c new file mode 100644 index 000000000000..1c3b0add54c4 --- /dev/null +++ b/tools/testing/selftests/netfilter/connect_close.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PORT 12345 +#define RUNTIME 10 + +static struct { + unsigned int timeout; + unsigned int port; +} opts = { + .timeout = RUNTIME, + .port = PORT, +}; + +static void handler(int sig) +{ + _exit(sig == SIGALRM ? 0 : 1); +} + +static void set_timeout(void) +{ + struct sigaction action = { + .sa_handler = handler, + }; + + sigaction(SIGALRM, &action, NULL); + + alarm(opts.timeout); +} + +static void do_connect(const struct sockaddr_in *dst) +{ + int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s >= 0) + fcntl(s, F_SETFL, O_NONBLOCK); + + connect(s, (struct sockaddr *)dst, sizeof(*dst)); + close(s); +} + +static void do_accept(const struct sockaddr_in *src) +{ + int c, one = 1, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + if (s < 0) + return; + + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + + bind(s, (struct sockaddr *)src, sizeof(*src)); + + listen(s, 16); + + c = accept(s, NULL, NULL); + if (c >= 0) + close(c); + + close(s); +} + +static int accept_loop(void) +{ + struct sockaddr_in src = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &src.sin_addr); + + set_timeout(); + + for (;;) + do_accept(&src); + + return 1; +} + +static int connect_loop(void) +{ + struct sockaddr_in dst = { + .sin_family = AF_INET, + .sin_port = htons(opts.port), + }; + + inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); + + set_timeout(); + + for (;;) + do_connect(&dst); + + return 1; +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "t:p:")) != -1) { + switch (c) { + case 't': + opts.timeout = atoi(optarg); + break; + case 'p': + opts.port = atoi(optarg); + break; + } + } +} + +int main(int argc, char *argv[]) +{ + pid_t p; + + parse_opts(argc, argv); + + p = fork(); + if (p < 0) + return 111; + + if (p > 0) + return accept_loop(); + + return connect_loop(); +} diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh index 7d27f1f3bc01..e12729753351 100755 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ b/tools/testing/selftests/netfilter/nft_queue.sh @@ -113,6 +113,7 @@ table inet $name { chain output { type filter hook output priority $prio; policy accept; tcp dport 12345 queue num 3 + tcp sport 23456 queue num 3 jump nfq } chain post { @@ -296,6 +297,23 @@ test_tcp_localhost() wait 2>/dev/null } +test_tcp_localhost_connectclose() +{ + tmpfile=$(mktemp) || exit 1 + + ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout & + + ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & + local nfqpid=$! + + sleep 1 + rm -f "$tmpfile" + + wait $rpid + [ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close" + wait 2>/dev/null +} + test_tcp_localhost_requeue() { ip netns exec ${nsrouter} nft -f /dev/stdin < Date: Mon, 28 Feb 2022 06:22:22 +0100 Subject: [PATCH 540/773] netfilter: nf_queue: fix possible use-after-free Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 2 +- net/netfilter/nf_queue.c | 13 +++++++++---- net/netfilter/nfnetlink_queue.c | 12 +++++++++--- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 9eed51e920e8..980daa6e1e3a 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5ab0680db445..e39549c55945 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) } /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + dev_hold(state->in); dev_hold(state->out); - if (state->sk) - sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -196,7 +198,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, __nf_queue_entry_init_physdevs(entry); - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ea2d9c2a44cf..64a6acb6aeae 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -710,9 +710,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) From 3b836da4081fa585cf6c392f62557496f2cb0efe Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 1 Mar 2022 00:46:19 +0100 Subject: [PATCH 541/773] netfilter: nf_queue: handle socket prefetch In case someone combines bpf socket assign and nf_queue, then we will queue an skb who references a struct sock that did not have its reference count incremented. As we leave rcu protection, there is no guarantee that skb->sk is still valid. For refcount-less skb->sk case, try to increment the reference count and then override the destructor. In case of failure we have two choices: orphan the skb and 'delete' preselect or let nf_queue() drop the packet. Do the latter, it should not happen during normal operation. Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Acked-by: Joe Stringer Signed-off-by: Florian Westphal --- net/netfilter/nf_queue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index e39549c55945..63d1516816b1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -180,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, break; } + if (skb_sk_is_prefetched(skb)) { + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) { + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -ENOTCONN; + + /* drop refcount on skb_orphan */ + skb->destructor = sock_edemux; + } + } + entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; From 4ff2980b6bd2aa6b4ded3ce3b7c0ccfab29980af Mon Sep 17 00:00:00 2001 From: Lina Wang Date: Sat, 26 Feb 2022 15:48:01 +0800 Subject: [PATCH 542/773] xfrm: fix tunnel model fragmentation behavior in tunnel mode, if outer interface(ipv4) is less, it is easily to let inner IPV6 mtu be less than 1280. If so, a Packet Too Big ICMPV6 message is received. When send again, packets are fragmentized with 1280, they are still rejected with ICMPV6(Packet Too Big) by xfrmi_xmit2(). According to RFC4213 Section3.2.2: if (IPv4 path MTU - 20) is less than 1280 if packet is larger than 1280 bytes Send ICMPv6 "packet too big" with MTU=1280 Drop packet else Encapsulate but do not set the Don't Fragment flag in the IPv4 header. The resulting IPv4 packet might be fragmented by the IPv4 layer on the encapsulator or by some router along the IPv4 path. endif else if packet is larger than (IPv4 path MTU - 20) Send ICMPv6 "packet too big" with MTU = (IPv4 path MTU - 20). Drop packet. else Encapsulate and set the Don't Fragment flag in the IPv4 header. endif endif Packets should be fragmentized with ipv4 outer interface, so change it. After it is fragemtized with ipv4, there will be double fragmenation. No.48 & No.51 are ipv6 fragment packets, No.48 is double fragmentized, then tunneled with IPv4(No.49& No.50), which obey spec. And received peer cannot decrypt it rightly. 48 2002::10 2002::11 1296(length) IPv6 fragment (off=0 more=y ident=0xa20da5bc nxt=50) 49 0x0000 (0) 2002::10 2002::11 1304 IPv6 fragment (off=0 more=y ident=0x7448042c nxt=44) 50 0x0000 (0) 2002::10 2002::11 200 ESP (SPI=0x00035000) 51 2002::10 2002::11 180 Echo (ping) request 52 0x56dc 2002::10 2002::11 248 IPv6 fragment (off=1232 more=n ident=0xa20da5bc nxt=50) xfrm6_noneed_fragment has fixed above issues. Finally, it acted like below: 1 0x6206 192.168.1.138 192.168.1.1 1316 Fragmented IP protocol (proto=Encap Security Payload 50, off=0, ID=6206) [Reassembled in #2] 2 0x6206 2002::10 2002::11 88 IPv6 fragment (off=0 more=y ident=0x1f440778 nxt=50) 3 0x0000 2002::10 2002::11 248 ICMPv6 Echo (ping) request Signed-off-by: Lina Wang Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 16 ++++++++++++++++ net/xfrm/xfrm_interface.c | 5 ++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index d0d280077721..ad07904642ca 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -45,6 +45,19 @@ static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buf return xfrm_output(sk, skb); } +static int xfrm6_noneed_fragment(struct sk_buff *skb) +{ + struct frag_hdr *fh; + u8 prevhdr = ipv6_hdr(skb)->nexthdr; + + if (prevhdr != NEXTHDR_FRAGMENT) + return 0; + fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr)); + if (fh->nexthdr == NEXTHDR_ESP || fh->nexthdr == NEXTHDR_AUTH) + return 1; + return 0; +} + static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -73,6 +86,9 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; + } else if (toobig && xfrm6_noneed_fragment(skb)) { + skb->ignore_df = 1; + goto skip_frag; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 4e3c62d1ad9e..1e8b26eecb3f 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -304,7 +304,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + if (skb->len > 1280) + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; From ea49432d184a6a09f84461604b7711a4e9f5ec9c Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Tue, 1 Mar 2022 19:43:49 +0900 Subject: [PATCH 543/773] ARM: mstar: Select HAVE_ARM_ARCH_TIMER The mstar SoCs have an arch timer but HAVE_ARM_ARCH_TIMER wasn't selected. If MSC313E_TIMER isn't selected then the kernel gets stuck at boot because there are no timers available. Signed-off-by: Daniel Palmer Link: https://lore.kernel.org/r/20220301104349.3040422-1-daniel@0x0f.com' Signed-off-by: Arnd Bergmann --- arch/arm/mach-mstar/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-mstar/Kconfig b/arch/arm/mach-mstar/Kconfig index cd300eeedc20..0bf4d312bcfd 100644 --- a/arch/arm/mach-mstar/Kconfig +++ b/arch/arm/mach-mstar/Kconfig @@ -3,6 +3,7 @@ menuconfig ARCH_MSTARV7 depends on ARCH_MULTI_V7 select ARM_GIC select ARM_HEAVY_MB + select HAVE_ARM_ARCH_TIMER select MST_IRQ select MSTAR_MSC313_MPLL help From a12f76345e026f1b300a0d17c56f020b6949b093 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 21 Feb 2022 15:55:12 +0100 Subject: [PATCH 544/773] cfg80211: fix CONFIG_CFG80211_EXTRA_REGDB_KEYDIR typo The kbuild change here accidentally removed not only the unquoting, but also the last character of the variable name. Fix that. Fixes: 129ab0d2d9f3 ("kbuild: do not quote string values in include/config/auto.conf") Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20220221155512.1d25895f7c5f.I50fa3d4189fcab90a2896fe8cae215035dae9508@changeid Signed-off-by: Johannes Berg --- net/wireless/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1e9be50469ce..527ae669f6f7 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -33,7 +33,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \ ) > $@ -$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDI) \ +$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR)/*.x509) @$(kecho) " GEN $@" $(Q)(set -e; \ From cc71d37fd1f11e0495b1cf580909ebea37eaa886 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 25 Feb 2022 17:18:58 -0800 Subject: [PATCH 545/773] HID: vivaldi: fix sysfs attributes leak The driver creates the top row map sysfs attribute in input_configured() method; unfortunately we do not have a callback that is executed when HID interface is unbound, thus we are leaking these sysfs attributes, for example when device is disconnected. To fix it let's switch to managed version of adding sysfs attributes which will ensure that they are destroyed when the driver is unbound. Fixes: 14c9c014babe ("HID: add vivaldi HID driver") Signed-off-by: Dmitry Torokhov Tested-by: Stephen Boyd Reviewed-by: Stephen Boyd Signed-off-by: Jiri Kosina --- drivers/hid/hid-vivaldi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-vivaldi.c b/drivers/hid/hid-vivaldi.c index efa6140915f4..42ceb2058a09 100644 --- a/drivers/hid/hid-vivaldi.c +++ b/drivers/hid/hid-vivaldi.c @@ -144,7 +144,7 @@ out: static int vivaldi_input_configured(struct hid_device *hdev, struct hid_input *hidinput) { - return sysfs_create_group(&hdev->dev.kobj, &input_attribute_group); + return devm_device_add_group(&hdev->dev, &input_attribute_group); } static const struct hid_device_id vivaldi_table[] = { From fe23b6bbeac40de957724b90a88d46fb336e29a9 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Thu, 24 Feb 2022 19:41:10 -0800 Subject: [PATCH 546/773] HID: nintendo: check the return value of alloc_workqueue() The function alloc_workqueue() in nintendo_hid_probe() can fail, but there is no check of its return value. To fix this bug, its return value should be checked with new error handling code. Fixes: c4eae84feff3e ("HID: nintendo: add rumble support") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-nintendo.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c index b6a9a0f3966e..2204de889739 100644 --- a/drivers/hid/hid-nintendo.c +++ b/drivers/hid/hid-nintendo.c @@ -2128,6 +2128,10 @@ static int nintendo_hid_probe(struct hid_device *hdev, spin_lock_init(&ctlr->lock); ctlr->rumble_queue = alloc_workqueue("hid-nintendo-rumble_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0); + if (!ctlr->rumble_queue) { + ret = -ENOMEM; + goto err; + } INIT_WORK(&ctlr->rumble_worker, joycon_rumble_worker); ret = hid_parse(hdev); From 5838a14832d447990827d85e90afe17e6fb9c175 Mon Sep 17 00:00:00 2001 From: Nicolas Cavallari Date: Mon, 28 Feb 2022 12:03:51 +0100 Subject: [PATCH 547/773] thermal: core: Fix TZ_GET_TRIP NULL pointer dereference Do not call get_trip_hyst() from thermal_genl_cmd_tz_get_trip() if the thermal zone does not define one. Fixes: 1ce50e7d408e ("thermal: core: genetlink support for events/cmd/sampling") Signed-off-by: Nicolas Cavallari Cc: 5.10+ # 5.10+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index a16dd4d5d710..73e68cce292e 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -419,11 +419,12 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p) for (i = 0; i < tz->trips; i++) { enum thermal_trip_type type; - int temp, hyst; + int temp, hyst = 0; tz->ops->get_trip_type(tz, i, &type); tz->ops->get_trip_temp(tz, i, &temp); - tz->ops->get_trip_hyst(tz, i, &hyst); + if (tz->ops->get_trip_hyst) + tz->ops->get_trip_hyst(tz, i, &hyst); if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, i) || nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, type) || From 439a8468242b313486e69b8cc3b45ddcfa898fbf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Feb 2022 10:59:12 -0800 Subject: [PATCH 548/773] binfmt_elf: Avoid total_mapping_size for ET_EXEC Partially revert commit 5f501d555653 ("binfmt_elf: reintroduce using MAP_FIXED_NOREPLACE"), which applied the ET_DYN "total_mapping_size" logic also to ET_EXEC. At least ia64 has ET_EXEC PT_LOAD segments that are not virtual-address contiguous (but _are_ file-offset contiguous). This would result in a giant mapping attempting to cover the entire span, including the virtual address range hole, and well beyond the size of the ELF file itself, causing the kernel to refuse to load it. For example: $ readelf -lW /usr/bin/gcc ... Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz ... ... LOAD 0x000000 0x4000000000000000 0x4000000000000000 0x00b5a0 0x00b5a0 ... LOAD 0x00b5a0 0x600000000000b5a0 0x600000000000b5a0 0x0005ac 0x000710 ... ... ^^^^^^^^ ^^^^^^^^^^^^^^^^^^ ^^^^^^^^ ^^^^^^^^ File offset range : 0x000000-0x00bb4c 0x00bb4c bytes Virtual address range : 0x4000000000000000-0x600000000000bcb0 0x200000000000bcb0 bytes Remove the total_mapping_size logic for ET_EXEC, which reduces the ET_EXEC MAP_FIXED_NOREPLACE coverage to only the first PT_LOAD (better than nothing), and retains it for ET_DYN. Ironically, this is the reverse of the problem that originally caused problems with MAP_FIXED_NOREPLACE: overlapping PT_LOAD segments. Future work could restore full coverage if load_elf_binary() were to perform mappings in a separate phase from the loading (where it could resolve both overlaps and holes). Cc: Eric Biederman Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Reported-by: matoro Fixes: 5f501d555653 ("binfmt_elf: reintroduce using MAP_FIXED_NOREPLACE") Link: https://lore.kernel.org/r/a3edd529-c42d-3b09-135c-7e98a15b150f@leemhuis.info Tested-by: matoro Link: https://lore.kernel.org/lkml/ce8af9c13bcea9230c7689f3c1e0e2cd@matoro.tk Tested-By: John Paul Adrian Glaubitz Link: https://lore.kernel.org/lkml/49182d0d-708b-4029-da5f-bc18603440a6@physik.fu-berlin.de Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 605017eb9349..c36b1ec357fb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1135,14 +1135,25 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); - } - /* - * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the - * initial mapping is performed.) - */ - if (!load_addr_set) { + /* + * Calculate the entire size of the ELF mapping + * (total_size), used for the initial mapping, + * due to load_addr_set which is set to true later + * once the initial mapping is performed. + * + * Note that this is only sensible when the LOAD + * segments are contiguous (or overlapping). If + * used for LOADs that are far apart, this would + * cause the holes between LOADs to be mapped, + * running the risk of having the mapping fail, + * as it would be larger than the ELF file itself. + * + * As a result, only ET_DYN does this, since + * some ET_EXEC (e.g. ia64) may have large virtual + * memory holes between LOADs. + * + */ total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { From 62929726ef0ec72cbbe9440c5d125d4278b99894 Mon Sep 17 00:00:00 2001 From: Manasi Navare Date: Thu, 24 Feb 2022 17:30:54 -0800 Subject: [PATCH 549/773] drm/vrr: Set VRR capable prop only if it is attached to connector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VRR capable property is not attached by default to the connector It is attached only if VRR is supported. So if the driver tries to call drm core set prop function without it being attached that causes NULL dereference. Cc: Jani Nikula Cc: Ville Syrjälä Cc: dri-devel@lists.freedesktop.org Signed-off-by: Manasi Navare Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220225013055.9282-1-manasi.d.navare@intel.com --- drivers/gpu/drm/drm_connector.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index a50c82bc2b2f..76a8c707c34b 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -2330,6 +2330,9 @@ EXPORT_SYMBOL(drm_connector_atomic_hdr_metadata_equal); void drm_connector_set_vrr_capable_property( struct drm_connector *connector, bool capable) { + if (!connector->vrr_capable_property) + return; + drm_object_property_set_value(&connector->base, connector->vrr_capable_property, capable); From 6b4b54c7ca347bcb4aa7a3cc01aa16e84ac7fbe4 Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Wed, 9 Feb 2022 11:25:09 +0100 Subject: [PATCH 550/773] s390/setup: preserve memory at OLDMEM_BASE and OLDMEM_SIZE We need to preserve the values at OLDMEM_BASE and OLDMEM_SIZE which are used by zgetdump in case when kdump crashes. In that case zgetdump will attempt to read OLDMEM_BASE and OLDMEM_SIZE in order to find out where the memory range [0 - OLDMEM_SIZE] belonging to the production kernel is. Fixes: f1a546947431 ("s390/setup: don't reserve memory that occupied decompressor's head") Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Alexander Egorenkov Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f2c25d113e7b..05327be3a982 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -800,6 +800,8 @@ static void __init check_initrd(void) static void __init reserve_kernel(void) { memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); memblock_reserve(__amode31_base, __eamode31 - __samode31); memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); memblock_reserve(__pa(_stext), _end - _stext); From 9fa881f7e3c74ce6626d166bca9397e5d925937f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 23 Feb 2022 13:02:59 +0100 Subject: [PATCH 551/773] s390/ftrace: fix ftrace_caller/ftrace_regs_caller generation ftrace_caller was used for both ftrace_caller and ftrace_regs_caller, which means that the target address of the hotpatch trampoline was never updated. With commit 894979689d3a ("s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations") a separate ftrace_regs_caller entry point was implemeted, however it was forgotten to implement the necessary changes for ftrace_modify_call and ftrace_make_call, where the branch target has to be modified accordingly. Therefore add the missing code now. Fixes: 894979689d3a ("s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations") Reviewed-by: Sven Schnelle Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ftrace.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 21d62d8b6b9a..5026d218066d 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -159,9 +159,38 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return 0; } +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) +{ + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; +} + int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); return 0; } @@ -188,6 +217,12 @@ static void brcl_enable(void *brcl) int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); brcl_enable((void *)rec->ip); return 0; } From 1389f17937a03fe4ec71b094e1aa6530a901963e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 22 Feb 2022 14:53:47 +0100 Subject: [PATCH 552/773] s390/ftrace: fix arch_ftrace_get_regs implementation arch_ftrace_get_regs is supposed to return a struct pt_regs pointer only if the pt_regs structure contains all register contents, which means it must have been populated when created via ftrace_regs_caller. If it was populated via ftrace_caller the contents are not complete (the psw mask part is missing), and therefore a NULL pointer needs be returned. The current code incorrectly always returns a struct pt_regs pointer. Fix this by adding another pt_regs flag which indicates if the contents are complete, and fix arch_ftrace_get_regs accordingly. Fixes: 894979689d3a ("s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations") Reported-by: Christophe Leroy Reported-by: Naveen N. Rao Reviewed-by: Sven Schnelle Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ftrace.h | 10 ++++++---- arch/s390/include/asm/ptrace.h | 2 ++ arch/s390/kernel/ftrace.c | 2 +- arch/s390/kernel/mcount.S | 9 +++++++++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 267f70f4393f..6f80ec9c04be 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -47,15 +47,17 @@ struct ftrace_regs { static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { - return &fregs->regs; + struct pt_regs *regs = &fregs->regs; + + if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) + return regs; + return NULL; } static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, unsigned long ip) { - struct pt_regs *regs = arch_ftrace_get_regs(fregs); - - regs->psw.addr = ip; + fregs->regs.psw.addr = ip; } /* diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 4ffa8e7f0ed3..ddb70fb13fbc 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -15,11 +15,13 @@ #define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ #define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ #define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */ #define _PIF_SYSCALL BIT(PIF_SYSCALL) #define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) #define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) #define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5026d218066d..89c0870d5679 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -326,7 +326,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) + if (!regs || unlikely(!p) || kprobe_disabled(p)) goto out; if (kprobe_running()) { diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 39bcc0e39a10..a24177dcd12a 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -27,6 +27,7 @@ ENDPROC(ftrace_stub) #define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) #define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) #define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) +#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS) #ifdef __PACK_STACK /* allocate just enough for r14, r15 and backchain */ #define TRACED_FUNC_FRAME_SIZE 24 @@ -57,6 +58,14 @@ ENDPROC(ftrace_stub) .if \allregs == 1 stg %r14,(STACK_PTREGS_PSW)(%r15) stosm (STACK_PTREGS_PSW)(%r15),0 +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS +#else + lghi %r14,_PIF_FTRACE_FULL_REGS + stg %r14,STACK_PTREGS_FLAGS(%r15) +#endif + .else + xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15) .endif lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address From c194dad21025dfd043210912653baab823bdff67 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 24 Feb 2022 22:03:29 +0100 Subject: [PATCH 553/773] s390/extable: fix exception table sorting s390 has a swap_ex_entry_fixup function, however it is not being used since common code expects a swap_ex_entry_fixup define. If it is not defined the default implementation will be used. So fix this by adding a proper define. However also the implementation of the function must be fixed, since a NULL value for handler has a special meaning and must not be adjusted. Luckily all of this doesn't fix a real bug currently: the main extable is correctly sorted during build time, and for runtime sorting there is currently no case where the handler field is not NULL. Fixes: 05a68e892e89 ("s390/kernel: expand exception table logic to allow new handling options") Acked-by: Ilya Leoshkevich Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/extable.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index 16dc57dd90b3..8511f0e59290 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -69,8 +69,13 @@ static inline void swap_ex_entry_fixup(struct exception_table_entry *a, { a->fixup = b->fixup + delta; b->fixup = tmp.fixup - delta; - a->handler = b->handler + delta; - b->handler = tmp.handler - delta; + a->handler = b->handler; + if (a->handler) + a->handler += delta; + b->handler = tmp.handler; + if (b->handler) + b->handler -= delta; } +#define swap_ex_entry_fixup swap_ex_entry_fixup #endif From db6140e5e35a48405e669353bd54042c1d4c3841 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Mon, 28 Feb 2022 11:23:49 +0200 Subject: [PATCH 554/773] net/sched: act_ct: Fix flow table lookup failure with no originating ifindex After cited commit optimizted hw insertion, flow table entries are populated with ifindex information which was intended to only be used for HW offload. This tuple ifindex is hashed in the flow table key, so it must be filled for lookup to be successful. But tuple ifindex is only relevant for the netfilter flowtables (nft), so it's not filled in act_ct flow table lookup, resulting in lookup failure, and no SW offload and no offload teardown for TCP connection FIN/RST packets. To fix this, add new tc ifindex field to tuple, which will only be used for offloading, not for lookup, as it will not be part of the tuple hash. Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +++++- net/netfilter/nf_flow_table_offload.c | 6 +++++- net/sched/act_ct.c | 13 +++++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a3647fadf1cc..bd59e950f4d6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -96,6 +96,7 @@ enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, + FLOW_OFFLOAD_XMIT_TC, }; #define NF_FLOW_TABLE_ENCAP_MAX 2 @@ -127,7 +128,7 @@ struct flow_offload_tuple { struct { } __hash; u8 dir:2, - xmit_type:2, + xmit_type:3, encap_num:2, in_vlan_ingress:2; u16 mtu; @@ -142,6 +143,9 @@ struct flow_offload_tuple { u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; + struct { + u32 iifidx; + } tc; }; }; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b561e0a44a45..fc4265acd9c4 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -110,7 +110,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match, nf_flow_rule_lwt_match(match, tun_info); } - key->meta.ingress_ifindex = tuple->iifidx; + if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC) + key->meta.ingress_ifindex = tuple->tc.iifidx; + else + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 33e70d60f0bf..ec19f625863a 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -361,6 +361,13 @@ static void tcf_ct_flow_table_put(struct tcf_ct_params *params) } } +static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, + struct nf_conn_act_ct_ext *act_ct_ext, u8 dir) +{ + entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC; + entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir]; +} + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, bool tcp) @@ -385,10 +392,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { - entry->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_ORIGINAL]; - entry->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = - act_ct_ext->ifindex[IP_CT_DIR_REPLY]; + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); + tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); } err = flow_offload_add(&ct_ft->nf_ft, entry); From a0e897d1b36793fe0ab899f2fe93dff25c82f418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 17 Jan 2022 19:20:06 +0100 Subject: [PATCH 555/773] arm64: dts: armada-3720-turris-mox: Add missing ethernet0 alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit U-Boot uses ethernet* aliases for setting MAC addresses. Therefore define also alias for ethernet0. Fixes: 7109d817db2e ("arm64: dts: marvell: add DTS for Turris Mox") Signed-off-by: Pali Rohár Signed-off-by: Gregory CLEMENT Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index 04da07ae4420..1eddf31d8bd8 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -18,6 +18,7 @@ aliases { spi0 = &spi0; + ethernet0 = ð0; ethernet1 = ð1; mmc0 = &sdhci0; mmc1 = &sdhci1; From fc7f750dc9d102c1ed7bbe4591f991e770c99033 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 28 Feb 2022 10:43:31 +0300 Subject: [PATCH 556/773] staging: gdm724x: fix use after free in gdm_lte_rx() The netif_rx_ni() function frees the skb so we can't dereference it to save the skb->len. Fixes: 61e121047645 ("staging: gdm7240: adding LTE USB driver") Cc: stable Reported-by: kernel test robot Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220228074331.GA13685@kili Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gdm724x/gdm_lte.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gdm724x/gdm_lte.c b/drivers/staging/gdm724x/gdm_lte.c index 493ed4821515..0d8d8fed283d 100644 --- a/drivers/staging/gdm724x/gdm_lte.c +++ b/drivers/staging/gdm724x/gdm_lte.c @@ -76,14 +76,15 @@ static void tx_complete(void *arg) static int gdm_lte_rx(struct sk_buff *skb, struct nic *nic, int nic_type) { - int ret; + int ret, len; + len = skb->len + ETH_HLEN; ret = netif_rx_ni(skb); if (ret == NET_RX_DROP) { nic->stats.rx_dropped++; } else { nic->stats.rx_packets++; - nic->stats.rx_bytes += skb->len + ETH_HLEN; + nic->stats.rx_bytes += len; } return 0; From 275f3f64870245b06188f24bdf917e55a813d294 Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Tue, 1 Mar 2022 14:34:57 -0800 Subject: [PATCH 557/773] Bluetooth: Fix not checking MGMT cmd pending queue A number of places in the MGMT handlers we examine the command queue for other commands (in progress but not yet complete) that will interact with the process being performed. However, not all commands go into the queue if one of: 1. There is no negative side effect of consecutive or redundent commands 2. The command is entirely perform "inline". This change examines each "pending command" check, and if it is not needed, deletes the check. Of the remaining pending command checks, we make sure that the command is in the pending queue by using the mgmt_pending_add/mgmt_pending_remove pair rather than the mgmt_pending_new/mgmt_pending_free pair. Link: https://lore.kernel.org/linux-bluetooth/f648f2e11bb3c2974c32e605a85ac3a9fac944f1.camel@redhat.com/T/ Tested-by: Maxim Levitsky Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 99 ++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 37087cf7dc5a..533cf60673a3 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1218,7 +1218,13 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode *cp; + + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + return; + + cp = cmd->param; bt_dev_dbg(hdev, "err %d", err); @@ -1242,7 +1248,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) @@ -1281,7 +1287,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_POWERED, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1290,6 +1296,9 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd, mgmt_set_powered_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1383,6 +1392,10 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1402,7 +1415,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_dev_unlock(hdev); } @@ -1511,7 +1524,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1538,6 +1551,9 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd, mgmt_set_discoverable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1550,6 +1566,10 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + return; + hci_dev_lock(hdev); if (err) { @@ -1562,7 +1582,9 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_free(cmd); + if (cmd) + mgmt_pending_remove(cmd); + hci_dev_unlock(hdev); } @@ -1634,7 +1656,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -1654,6 +1676,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd, mgmt_set_connectable_complete); + if (err < 0) + mgmt_pending_remove(cmd); + failed: hci_dev_unlock(hdev); return err; @@ -1774,6 +1799,10 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) u8 enable = cp->val; bool changed; + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + return; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -3321,6 +3350,9 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); + if (cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + return; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3493,6 +3525,9 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); + if (cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -3759,13 +3794,6 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_WIDEBAND_SPEECH, - MGMT_STATUS_BUSY); - goto unlock; - } - if (hdev_is_powered(hdev) && !!cp->val != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) { @@ -5036,12 +5064,6 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { - err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_BUSY); - goto unlock; - } - cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0); if (!cmd) err = -ENOMEM; @@ -5261,11 +5283,16 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && + cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5327,7 +5354,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, else hdev->discovery.limited = false; - cmd = mgmt_pending_new(sk, op, hdev, data, len); + cmd = mgmt_pending_add(sk, op, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -5336,7 +5363,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5430,7 +5457,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } - cmd = mgmt_pending_new(sk, MGMT_OP_START_SERVICE_DISCOVERY, + cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; @@ -5463,7 +5490,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd, start_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto failed; } @@ -5495,11 +5522,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + if (cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + return; + bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -5535,7 +5565,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - cmd = mgmt_pending_new(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); + cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto unlock; @@ -5544,7 +5574,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd, stop_discovery_complete); if (err < 0) { - mgmt_pending_free(cmd); + mgmt_pending_remove(cmd); goto unlock; } @@ -7474,6 +7504,9 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; + if (cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) + return; + if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -7969,11 +8002,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) static bool adv_busy(struct hci_dev *hdev) { - return (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_PARAMS, hdev) || - pending_find(MGMT_OP_ADD_EXT_ADV_DATA, hdev)); + return pending_find(MGMT_OP_SET_LE, hdev); } static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance, @@ -8563,9 +8592,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || - pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || - pending_find(MGMT_OP_SET_LE, hdev)) { + if (pending_find(MGMT_OP_SET_LE, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; From 0b0e2ff10356e7e2ffd66ecdd6eee69a2f03449b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 28 Feb 2022 16:17:15 +0200 Subject: [PATCH 558/773] net: dsa: restore error path of dsa_tree_change_tag_proto When the DSA_NOTIFIER_TAG_PROTO returns an error, the user space process which initiated the protocol change exits the kernel processing while still holding the rtnl_mutex. So any other process attempting to lock the rtnl_mutex would deadlock after such event. The error handling of DSA_NOTIFIER_TAG_PROTO was inadvertently changed by the blamed commit, introducing this regression. We must still call rtnl_unlock(), and we must still call DSA_NOTIFIER_TAG_PROTO for the old protocol. The latter is due to the limiting design of notifier chains for cross-chip operations, which don't have a built-in error recovery mechanism - we should look into using notifier_call_chain_robust for that. Fixes: dc452a471dba ("net: dsa: introduce tagger-owned storage for private and shared data") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220228141715.146485-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dcad3100b164..209496996cdf 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1261,7 +1261,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - return err; + goto out_unlock; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) From 1d1898f65616c4601208963c3376c1d828cbf2c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 1 Mar 2022 22:29:04 -0500 Subject: [PATCH 559/773] tracing/histogram: Fix sorting on old "cpu" value When trying to add a histogram against an event with the "cpu" field, it was impossible due to "cpu" being a keyword to key off of the running CPU. So to fix this, it was changed to "common_cpu" to match the other generic fields (like "common_pid"). But since some scripts used "cpu" for keying off of the CPU (for events that did not have "cpu" as a field, which is most of them), a backward compatibility trick was added such that if "cpu" was used as a key, and the event did not have "cpu" as a field name, then it would fallback and switch over to "common_cpu". This fix has a couple of subtle bugs. One was that when switching over to "common_cpu", it did not change the field name, it just set a flag. But the code still found a "cpu" field. The "cpu" field is used for filtering and is returned when the event does not have a "cpu" field. This was found by: # cd /sys/kernel/tracing # echo hist:key=cpu,pid:sort=cpu > events/sched/sched_wakeup/trigger # cat events/sched/sched_wakeup/hist Which showed the histogram unsorted: { cpu: 19, pid: 1175 } hitcount: 1 { cpu: 6, pid: 239 } hitcount: 2 { cpu: 23, pid: 1186 } hitcount: 14 { cpu: 12, pid: 249 } hitcount: 2 { cpu: 3, pid: 994 } hitcount: 5 Instead of hard coding the "cpu" checks, take advantage of the fact that trace_event_field_field() returns a special field for "cpu" and "CPU" if the event does not have "cpu" as a field. This special field has the "filter_type" of "FILTER_CPU". Check that to test if the returned field is of the CPU type instead of doing the string compare. Also, fix the sorting bug by testing for the hist_field flag of HIST_FIELD_FL_CPU when setting up the sort routine. Otherwise it will use the special CPU field to know what compare routine to use, and since that special field does not have a size, it returns tracing_map_cmp_none. Cc: stable@vger.kernel.org Fixes: 1e3bac71c505 ("tracing/histogram: Rename "cpu" to "common_cpu"") Reported-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ada87bfb5bb8..dc7f733b4cb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2289,9 +2289,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, /* * For backward compatibility, if field_name * was "cpu", then we treat this the same as - * common_cpu. + * common_cpu. This also works for "CPU". */ - if (strcmp(field_name, "cpu") == 0) { + if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, @@ -4832,7 +4832,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; - else if (!field) + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) cmp_fn = tracing_map_cmp_num(hist_field->size, hist_field->is_signed); else if (is_string_field(field)) From 81a36d8ce554b82b0a08e2b95d0bd44fcbff339b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 23:39:38 -0800 Subject: [PATCH 560/773] Input: elan_i2c - move regulator_[en|dis]able() out of elan_[en|dis]able_power() elan_disable_power() is called conditionally on suspend, where as elan_enable_power() is always called on resume. This leads to an imbalance in the regulator's enable count. Move the regulator_[en|dis]able() calls out of elan_[en|dis]able_power() in preparation of fixing this. No functional changes intended. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220131135436.29638-1-hdegoede@redhat.com [dtor: consolidate elan_[en|dis]able() into elan_set_power()] Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 62 ++++++++++------------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 47af62c12267..185191b86fad 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -186,55 +186,21 @@ static int elan_get_fwinfo(u16 ic_type, u8 iap_version, u16 *validpage_count, return 0; } -static int elan_enable_power(struct elan_tp_data *data) +static int elan_set_power(struct elan_tp_data *data, bool on) { int repeat = ETP_RETRY_COUNT; int error; - error = regulator_enable(data->vcc); - if (error) { - dev_err(&data->client->dev, - "failed to enable regulator: %d\n", error); - return error; - } - do { - error = data->ops->power_control(data->client, true); + error = data->ops->power_control(data->client, on); if (error >= 0) return 0; msleep(30); } while (--repeat > 0); - dev_err(&data->client->dev, "failed to enable power: %d\n", error); - return error; -} - -static int elan_disable_power(struct elan_tp_data *data) -{ - int repeat = ETP_RETRY_COUNT; - int error; - - do { - error = data->ops->power_control(data->client, false); - if (!error) { - error = regulator_disable(data->vcc); - if (error) { - dev_err(&data->client->dev, - "failed to disable regulator: %d\n", - error); - /* Attempt to power the chip back up */ - data->ops->power_control(data->client, true); - break; - } - - return 0; - } - - msleep(30); - } while (--repeat > 0); - - dev_err(&data->client->dev, "failed to disable power: %d\n", error); + dev_err(&data->client->dev, "failed to set power %s: %d\n", + on ? "on" : "off", error); return error; } @@ -1399,9 +1365,19 @@ static int __maybe_unused elan_suspend(struct device *dev) /* Enable wake from IRQ */ data->irq_wake = (enable_irq_wake(client->irq) == 0); } else { - ret = elan_disable_power(data); + ret = elan_set_power(data, false); + if (ret) + goto err; + + ret = regulator_disable(data->vcc); + if (ret) { + dev_err(dev, "error %d disabling regulator\n", ret); + /* Attempt to power the chip back up */ + elan_set_power(data, true); + } } +err: mutex_unlock(&data->sysfs_mutex); return ret; } @@ -1417,7 +1393,13 @@ static int __maybe_unused elan_resume(struct device *dev) data->irq_wake = false; } - error = elan_enable_power(data); + error = regulator_enable(data->vcc); + if (error) { + dev_err(dev, "error %d enabling regulator\n", error); + goto err; + } + + error = elan_set_power(data, true); if (error) { dev_err(dev, "power up when resuming failed: %d\n", error); goto err; From 04b7762e37c95d9b965d16bb0e18dbd1fa2e2861 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 28 Feb 2022 23:39:50 -0800 Subject: [PATCH 561/773] Input: elan_i2c - fix regulator enable count imbalance after suspend/resume Before these changes elan_suspend() would only disable the regulator when device_may_wakeup() returns false; whereas elan_resume() would unconditionally enable it, leading to an enable count imbalance when device_may_wakeup() returns true. This triggers the "WARN_ON(regulator->enable_count)" in regulator_put() when the elan_i2c driver gets unbound, this happens e.g. with the hot-plugable dock with Elan I2C touchpad for the Asus TF103C 2-in-1. Fix this by making the regulator_enable() call also be conditional on device_may_wakeup() returning false. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220131135436.29638-2-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 185191b86fad..e1758d5ffe42 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1388,17 +1388,17 @@ static int __maybe_unused elan_resume(struct device *dev) struct elan_tp_data *data = i2c_get_clientdata(client); int error; - if (device_may_wakeup(dev) && data->irq_wake) { + if (!device_may_wakeup(dev)) { + error = regulator_enable(data->vcc); + if (error) { + dev_err(dev, "error %d enabling regulator\n", error); + goto err; + } + } else if (data->irq_wake) { disable_irq_wake(client->irq); data->irq_wake = false; } - error = regulator_enable(data->vcc); - if (error) { - dev_err(dev, "error %d enabling regulator\n", error); - goto err; - } - error = elan_set_power(data, true); if (error) { dev_err(dev, "power up when resuming failed: %d\n", error); From 690bb6fb64f5dc7437317153902573ecad67593d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 28 Feb 2022 00:01:24 +0100 Subject: [PATCH 562/773] batman-adv: Request iflink once in batadv-on-batadv check There is no need to call dev_get_iflink multiple times for the same net_device in batadv_is_on_batman_iface. And since some of the .ndo_get_iflink callbacks are dynamic (for example via RCUs like in vxcan_get_iflink), it could easily happen that the returned values are not stable. The pre-checks before __dev_get_by_index are then of course bogus. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 8a2b78f9c4b2..35aa1122043b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -149,22 +149,23 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; + int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; + iflink = dev_get_iflink(net_dev); + /* no more parents..stop recursion */ - if (dev_get_iflink(net_dev) == 0 || - dev_get_iflink(net_dev) == net_dev->ifindex) + if (iflink == 0 || iflink == net_dev->ifindex) return false; parent_net = batadv_getlink_net(net_dev, net); /* recurse over the parent device */ - parent_dev = __dev_get_by_index((struct net *)parent_net, - dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. */ if (!parent_dev) { pr_err("Cannot find parent device\n"); From 6116ba09423f7d140f0460be6a1644dceaad00da Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 28 Feb 2022 00:01:24 +0100 Subject: [PATCH 563/773] batman-adv: Request iflink once in batadv_get_real_netdevice There is no need to call dev_get_iflink multiple times for the same net_device in batadv_get_real_netdevice. And since some of the ndo_get_iflink callbacks are dynamic (for example via RCUs like in vxcan_get_iflink), it could easily happen that the returned values are not stable. The pre-checks before __dev_get_by_index are then of course bogus. Fixes: 5ed4a460a1d3 ("batman-adv: additional checks for virtual interfaces on top of WiFi") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 35aa1122043b..e2760cfce190 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -215,14 +215,16 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; - int ifindex; + int iflink; ASSERT_RTNL(); if (!netdev) return NULL; - if (netdev->ifindex == dev_get_iflink(netdev)) { + iflink = dev_get_iflink(netdev); + + if (netdev->ifindex == iflink) { dev_hold(netdev); return netdev; } @@ -232,9 +234,8 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) goto out; net = dev_net(hard_iface->soft_iface); - ifindex = dev_get_iflink(netdev); real_net = batadv_getlink_net(netdev, net); - real_netdev = dev_get_by_index(real_net, ifindex); + real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); From 6c1f41afc1dbe59d9d3c8bb0d80b749c119aa334 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 27 Feb 2022 23:23:49 +0100 Subject: [PATCH 564/773] batman-adv: Don't expect inter-netns unique iflink indices The ifindex doesn't have to be unique for multiple network namespaces on the same machine. $ ip netns add test1 $ ip -net test1 link add dummy1 type dummy $ ip netns add test2 $ ip -net test2 link add dummy2 type dummy $ ip -net test1 link show dev dummy1 6: dummy1: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 96:81:55:1e:dd:85 brd ff:ff:ff:ff:ff:ff $ ip -net test2 link show dev dummy2 6: dummy2: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 5a:3c:af:35:07:c3 brd ff:ff:ff:ff:ff:ff But the batman-adv code to walk through the various layers of virtual interfaces uses this assumption because dev_get_iflink handles it internally and doesn't return the actual netns of the iflink. And dev_get_iflink only documents the situation where ifindex == iflink for physical devices. But only checking for dev->netdev_ops->ndo_get_iflink is also not an option because ipoib_get_iflink implements it even when it sometimes returns an iflink != ifindex and sometimes iflink == ifindex. The caller must therefore make sure itself to check both netns and iflink + ifindex for equality. Only when they are equal, a "physical" interface was detected which should stop the traversal. On the other hand, vxcan_get_iflink can also return 0 in case there was currently no valid peer. In this case, it is still necessary to stop. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Fixes: 5ed4a460a1d3 ("batman-adv: additional checks for virtual interfaces on top of WiFi") Reported-by: Sabrina Dubroca Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e2760cfce190..35fadb924849 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -157,13 +157,15 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; iflink = dev_get_iflink(net_dev); - - /* no more parents..stop recursion */ - if (iflink == 0 || iflink == net_dev->ifindex) + if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); + /* iflink to itself, most likely physical device */ + if (net == parent_net && iflink == net_dev->ifindex) + return false; + /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); /* if we got a NULL parent_dev there is something broken.. */ @@ -223,8 +225,7 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) return NULL; iflink = dev_get_iflink(netdev); - - if (netdev->ifindex == iflink) { + if (iflink == 0) { dev_hold(netdev); return netdev; } @@ -235,6 +236,14 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) net = dev_net(hard_iface->soft_iface); real_net = batadv_getlink_net(netdev, net); + + /* iflink to itself, most likely physical device */ + if (net == real_net && netdev->ifindex == iflink) { + real_netdev = netdev; + dev_hold(real_netdev); + goto out; + } + real_netdev = dev_get_by_index(real_net, iflink); out: From 0aa6b294b312d9710804679abd2c0c8ca52cc2bc Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Wed, 2 Mar 2022 15:42:41 +0800 Subject: [PATCH 565/773] ALSA: intel_hdmi: Fix reference to PCM buffer address PCM buffers might be allocated dynamically when the buffer preallocation failed or a larger buffer is requested, and it's not guaranteed that substream->dma_buffer points to the actually used buffer. The driver needs to refer to substream->runtime->dma_addr instead for the buffer address. Signed-off-by: Zhen Ni Cc: Link: https://lore.kernel.org/r/20220302074241.30469-1-nizhen@uniontech.com Signed-off-by: Takashi Iwai --- sound/x86/intel_hdmi_audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/x86/intel_hdmi_audio.c b/sound/x86/intel_hdmi_audio.c index 1c94eaff1931..4a3ff6468aa7 100644 --- a/sound/x86/intel_hdmi_audio.c +++ b/sound/x86/intel_hdmi_audio.c @@ -1261,7 +1261,7 @@ static int had_pcm_mmap(struct snd_pcm_substream *substream, { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, vma->vm_start, - substream->dma_buffer.addr >> PAGE_SHIFT, + substream->runtime->dma_addr >> PAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot); } From 22ba5e99b96f1c0dbdfa4f4e1d9751b4c8348541 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 22 Feb 2022 11:31:18 +0800 Subject: [PATCH 566/773] erofs: fix ztailpacking on > 4GiB filesystems z_idataoff here is an absolute physical offset, so it should use erofs_off_t (64 bits at least). Otherwise, it'll get trimmed and cause the decompresion failure. Link: https://lore.kernel.org/r/20220222033118.20540-1-hsiangkao@linux.alibaba.com Fixes: ab92184ff8f1 ("erofs: add on-disk compressed tail-packing inline support") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b8272fb95fd6..5aa2cf2c2f80 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -325,7 +325,7 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - unsigned int z_idataoff; + erofs_off_t z_idataoff; unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ From 8f4347081be32e67b0873827e0138ab0fdaaf450 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 2 Mar 2022 11:16:36 +0100 Subject: [PATCH 567/773] staging: rtl8723bs: Fix access-point mode deadlock Commit 54659ca026e5 ("staging: rtl8723bs: remove possible deadlock when disconnect (v2)") split the locking of pxmitpriv->lock vs sleep_q/lock into 2 locks in attempt to fix a lockdep reported issue with the locking order of the sta_hash_lock vs pxmitpriv->lock. But in the end this turned out to not fully solve the sta_hash_lock issue so commit a7ac783c338b ("staging: rtl8723bs: remove a second possible deadlock") was added to fix this in another way. The original fix was kept as it was still seen as a good thing to have, but now it turns out that it creates a deadlock in access-point mode: [Feb20 23:47] ====================================================== [ +0.074085] WARNING: possible circular locking dependency detected [ +0.074077] 5.16.0-1-amd64 #1 Tainted: G C E [ +0.064710] ------------------------------------------------------ [ +0.074075] ksoftirqd/3/29 is trying to acquire lock: [ +0.060542] ffffb8b30062ab00 (&pxmitpriv->lock){+.-.}-{2:2}, at: rtw_xmit_classifier+0x8a/0x140 [r8723bs] [ +0.114921] but task is already holding lock: [ +0.069908] ffffb8b3007ab704 (&psta->sleep_q.lock){+.-.}-{2:2}, at: wakeup_sta_to_xmit+0x3b/0x300 [r8723bs] [ +0.116976] which lock already depends on the new lock. [ +0.098037] the existing dependency chain (in reverse order) is: [ +0.089704] -> #1 (&psta->sleep_q.lock){+.-.}-{2:2}: [ +0.077232] _raw_spin_lock_bh+0x34/0x40 [ +0.053261] xmitframe_enqueue_for_sleeping_sta+0xc1/0x2f0 [r8723bs] [ +0.082572] rtw_xmit+0x58b/0x940 [r8723bs] [ +0.056528] _rtw_xmit_entry+0xba/0x350 [r8723bs] [ +0.062755] dev_hard_start_xmit+0xf1/0x320 [ +0.056381] sch_direct_xmit+0x9e/0x360 [ +0.052212] __dev_queue_xmit+0xce4/0x1080 [ +0.055334] ip6_finish_output2+0x18f/0x6e0 [ +0.056378] ndisc_send_skb+0x2c8/0x870 [ +0.052209] ndisc_send_ns+0xd3/0x210 [ +0.050130] addrconf_dad_work+0x3df/0x5a0 [ +0.055338] process_one_work+0x274/0x5a0 [ +0.054296] worker_thread+0x52/0x3b0 [ +0.050124] kthread+0x16c/0x1a0 [ +0.044925] ret_from_fork+0x1f/0x30 [ +0.049092] -> #0 (&pxmitpriv->lock){+.-.}-{2:2}: [ +0.074101] __lock_acquire+0x10f5/0x1d80 [ +0.054298] lock_acquire+0xd7/0x300 [ +0.049088] _raw_spin_lock_bh+0x34/0x40 [ +0.053248] rtw_xmit_classifier+0x8a/0x140 [r8723bs] [ +0.066949] rtw_xmitframe_enqueue+0xa/0x20 [r8723bs] [ +0.066946] rtl8723bs_hal_xmitframe_enqueue+0x14/0x50 [r8723bs] [ +0.078386] wakeup_sta_to_xmit+0xa6/0x300 [r8723bs] [ +0.065903] rtw_recv_entry+0xe36/0x1160 [r8723bs] [ +0.063809] rtl8723bs_recv_tasklet+0x349/0x6c0 [r8723bs] [ +0.071093] tasklet_action_common.constprop.0+0xe5/0x110 [ +0.070966] __do_softirq+0x16f/0x50a [ +0.050134] __irq_exit_rcu+0xeb/0x140 [ +0.051172] irq_exit_rcu+0xa/0x20 [ +0.047006] common_interrupt+0xb8/0xd0 [ +0.052214] asm_common_interrupt+0x1e/0x40 [ +0.056381] finish_task_switch.isra.0+0x100/0x3a0 [ +0.063670] __schedule+0x3ad/0xd20 [ +0.048047] schedule+0x4e/0xc0 [ +0.043880] smpboot_thread_fn+0xc4/0x220 [ +0.054298] kthread+0x16c/0x1a0 [ +0.044922] ret_from_fork+0x1f/0x30 [ +0.049088] other info that might help us debug this: [ +0.095950] Possible unsafe locking scenario: [ +0.070952] CPU0 CPU1 [ +0.054282] ---- ---- [ +0.054285] lock(&psta->sleep_q.lock); [ +0.047004] lock(&pxmitpriv->lock); [ +0.074082] lock(&psta->sleep_q.lock); [ +0.077209] lock(&pxmitpriv->lock); [ +0.043873] *** DEADLOCK *** [ +0.070950] 1 lock held by ksoftirqd/3/29: [ +0.049082] #0: ffffb8b3007ab704 (&psta->sleep_q.lock){+.-.}-{2:2}, at: wakeup_sta_to_xmit+0x3b/0x300 [r8723bs] Analysis shows that in hindsight the splitting of the lock was not a good idea, so revert this to fix the access-point mode deadlock. Note this is a straight-forward revert done with git revert, the commented out "/* spin_lock_bh(&psta_bmc->sleep_q.lock); */" lines were part of the code before the reverted changes. Fixes: 54659ca026e5 ("staging: rtl8723bs: remove possible deadlock when disconnect (v2)") Cc: stable Cc: Fabio Aiuto Signed-off-by: Hans de Goede BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215542 Link: https://lore.kernel.org/r/20220302101637.26542-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/core/rtw_mlme_ext.c | 7 ++++-- drivers/staging/rtl8723bs/core/rtw_recv.c | 10 +++++--- drivers/staging/rtl8723bs/core/rtw_sta_mgt.c | 24 +++++++++---------- drivers/staging/rtl8723bs/core/rtw_xmit.c | 16 +++++++------ .../staging/rtl8723bs/hal/rtl8723bs_xmit.c | 2 ++ 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c b/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c index 0f82f5031c43..49a3f45cb771 100644 --- a/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c +++ b/drivers/staging/rtl8723bs/core/rtw_mlme_ext.c @@ -5907,6 +5907,7 @@ u8 chk_bmc_sleepq_hdl(struct adapter *padapter, unsigned char *pbuf) struct sta_info *psta_bmc; struct list_head *xmitframe_plist, *xmitframe_phead, *tmp; struct xmit_frame *pxmitframe = NULL; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; struct sta_priv *pstapriv = &padapter->stapriv; /* for BC/MC Frames */ @@ -5917,7 +5918,8 @@ u8 chk_bmc_sleepq_hdl(struct adapter *padapter, unsigned char *pbuf) if ((pstapriv->tim_bitmap&BIT(0)) && (psta_bmc->sleepq_len > 0)) { msleep(10);/* 10ms, ATIM(HIQ) Windows */ - spin_lock_bh(&psta_bmc->sleep_q.lock); + /* spin_lock_bh(&psta_bmc->sleep_q.lock); */ + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta_bmc->sleep_q); list_for_each_safe(xmitframe_plist, tmp, xmitframe_phead) { @@ -5940,7 +5942,8 @@ u8 chk_bmc_sleepq_hdl(struct adapter *padapter, unsigned char *pbuf) rtw_hal_xmitframe_enqueue(padapter, pxmitframe); } - spin_unlock_bh(&psta_bmc->sleep_q.lock); + /* spin_unlock_bh(&psta_bmc->sleep_q.lock); */ + spin_unlock_bh(&pxmitpriv->lock); /* check hi queue and bmc_sleepq */ rtw_chk_hi_queue_cmd(padapter); diff --git a/drivers/staging/rtl8723bs/core/rtw_recv.c b/drivers/staging/rtl8723bs/core/rtw_recv.c index 41bfca549c64..105fe0e3482a 100644 --- a/drivers/staging/rtl8723bs/core/rtw_recv.c +++ b/drivers/staging/rtl8723bs/core/rtw_recv.c @@ -957,8 +957,10 @@ static signed int validate_recv_ctrl_frame(struct adapter *padapter, union recv_ if ((psta->state&WIFI_SLEEP_STATE) && (pstapriv->sta_dz_bitmap&BIT(psta->aid))) { struct list_head *xmitframe_plist, *xmitframe_phead; struct xmit_frame *pxmitframe = NULL; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; - spin_lock_bh(&psta->sleep_q.lock); + /* spin_lock_bh(&psta->sleep_q.lock); */ + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta->sleep_q); xmitframe_plist = get_next(xmitframe_phead); @@ -989,10 +991,12 @@ static signed int validate_recv_ctrl_frame(struct adapter *padapter, union recv_ update_beacon(padapter, WLAN_EID_TIM, NULL, true); } - spin_unlock_bh(&psta->sleep_q.lock); + /* spin_unlock_bh(&psta->sleep_q.lock); */ + spin_unlock_bh(&pxmitpriv->lock); } else { - spin_unlock_bh(&psta->sleep_q.lock); + /* spin_unlock_bh(&psta->sleep_q.lock); */ + spin_unlock_bh(&pxmitpriv->lock); if (pstapriv->tim_bitmap&BIT(psta->aid)) { if (psta->sleepq_len == 0) { diff --git a/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c b/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c index 0c9ea1520fd0..beb11d89db18 100644 --- a/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c +++ b/drivers/staging/rtl8723bs/core/rtw_sta_mgt.c @@ -293,48 +293,46 @@ u32 rtw_free_stainfo(struct adapter *padapter, struct sta_info *psta) /* list_del_init(&psta->wakeup_list); */ - spin_lock_bh(&psta->sleep_q.lock); - rtw_free_xmitframe_queue(pxmitpriv, &psta->sleep_q); - psta->sleepq_len = 0; - spin_unlock_bh(&psta->sleep_q.lock); - spin_lock_bh(&pxmitpriv->lock); + rtw_free_xmitframe_queue(pxmitpriv, &psta->sleep_q); + psta->sleepq_len = 0; + /* vo */ - spin_lock_bh(&pstaxmitpriv->vo_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->vo_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->vo_q.sta_pending); list_del_init(&(pstaxmitpriv->vo_q.tx_pending)); phwxmit = pxmitpriv->hwxmits; phwxmit->accnt -= pstaxmitpriv->vo_q.qcnt; pstaxmitpriv->vo_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->vo_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->vo_pending.lock)); */ /* vi */ - spin_lock_bh(&pstaxmitpriv->vi_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->vi_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->vi_q.sta_pending); list_del_init(&(pstaxmitpriv->vi_q.tx_pending)); phwxmit = pxmitpriv->hwxmits+1; phwxmit->accnt -= pstaxmitpriv->vi_q.qcnt; pstaxmitpriv->vi_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->vi_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->vi_pending.lock)); */ /* be */ - spin_lock_bh(&pstaxmitpriv->be_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->be_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->be_q.sta_pending); list_del_init(&(pstaxmitpriv->be_q.tx_pending)); phwxmit = pxmitpriv->hwxmits+2; phwxmit->accnt -= pstaxmitpriv->be_q.qcnt; pstaxmitpriv->be_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->be_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->be_pending.lock)); */ /* bk */ - spin_lock_bh(&pstaxmitpriv->bk_q.sta_pending.lock); + /* spin_lock_bh(&(pxmitpriv->bk_pending.lock)); */ rtw_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->bk_q.sta_pending); list_del_init(&(pstaxmitpriv->bk_q.tx_pending)); phwxmit = pxmitpriv->hwxmits+3; phwxmit->accnt -= pstaxmitpriv->bk_q.qcnt; pstaxmitpriv->bk_q.qcnt = 0; - spin_unlock_bh(&pstaxmitpriv->bk_q.sta_pending.lock); + /* spin_unlock_bh(&(pxmitpriv->bk_pending.lock)); */ spin_unlock_bh(&pxmitpriv->lock); diff --git a/drivers/staging/rtl8723bs/core/rtw_xmit.c b/drivers/staging/rtl8723bs/core/rtw_xmit.c index 13b8bd5ffabc..f466bfd248fb 100644 --- a/drivers/staging/rtl8723bs/core/rtw_xmit.c +++ b/drivers/staging/rtl8723bs/core/rtw_xmit.c @@ -1734,12 +1734,15 @@ void rtw_free_xmitframe_queue(struct xmit_priv *pxmitpriv, struct __queue *pfram struct list_head *plist, *phead, *tmp; struct xmit_frame *pxmitframe; + spin_lock_bh(&pframequeue->lock); + phead = get_list_head(pframequeue); list_for_each_safe(plist, tmp, phead) { pxmitframe = list_entry(plist, struct xmit_frame, list); rtw_free_xmitframe(pxmitpriv, pxmitframe); } + spin_unlock_bh(&pframequeue->lock); } s32 rtw_xmitframe_enqueue(struct adapter *padapter, struct xmit_frame *pxmitframe) @@ -1794,7 +1797,6 @@ s32 rtw_xmit_classifier(struct adapter *padapter, struct xmit_frame *pxmitframe) struct sta_info *psta; struct tx_servq *ptxservq; struct pkt_attrib *pattrib = &pxmitframe->attrib; - struct xmit_priv *xmit_priv = &padapter->xmitpriv; struct hw_xmit *phwxmits = padapter->xmitpriv.hwxmits; signed int res = _SUCCESS; @@ -1812,14 +1814,12 @@ s32 rtw_xmit_classifier(struct adapter *padapter, struct xmit_frame *pxmitframe) ptxservq = rtw_get_sta_pending(padapter, psta, pattrib->priority, (u8 *)(&ac_index)); - spin_lock_bh(&xmit_priv->lock); if (list_empty(&ptxservq->tx_pending)) list_add_tail(&ptxservq->tx_pending, get_list_head(phwxmits[ac_index].sta_queue)); list_add_tail(&pxmitframe->list, get_list_head(&ptxservq->sta_pending)); ptxservq->qcnt++; phwxmits[ac_index].accnt++; - spin_unlock_bh(&xmit_priv->lock); exit: @@ -2202,10 +2202,11 @@ void wakeup_sta_to_xmit(struct adapter *padapter, struct sta_info *psta) struct list_head *xmitframe_plist, *xmitframe_phead, *tmp; struct xmit_frame *pxmitframe = NULL; struct sta_priv *pstapriv = &padapter->stapriv; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; psta_bmc = rtw_get_bcmc_stainfo(padapter); - spin_lock_bh(&psta->sleep_q.lock); + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta->sleep_q); list_for_each_safe(xmitframe_plist, tmp, xmitframe_phead) { @@ -2306,7 +2307,7 @@ void wakeup_sta_to_xmit(struct adapter *padapter, struct sta_info *psta) _exit: - spin_unlock_bh(&psta->sleep_q.lock); + spin_unlock_bh(&pxmitpriv->lock); if (update_mask) update_beacon(padapter, WLAN_EID_TIM, NULL, true); @@ -2318,8 +2319,9 @@ void xmit_delivery_enabled_frames(struct adapter *padapter, struct sta_info *pst struct list_head *xmitframe_plist, *xmitframe_phead, *tmp; struct xmit_frame *pxmitframe = NULL; struct sta_priv *pstapriv = &padapter->stapriv; + struct xmit_priv *pxmitpriv = &padapter->xmitpriv; - spin_lock_bh(&psta->sleep_q.lock); + spin_lock_bh(&pxmitpriv->lock); xmitframe_phead = get_list_head(&psta->sleep_q); list_for_each_safe(xmitframe_plist, tmp, xmitframe_phead) { @@ -2372,7 +2374,7 @@ void xmit_delivery_enabled_frames(struct adapter *padapter, struct sta_info *pst } } - spin_unlock_bh(&psta->sleep_q.lock); + spin_unlock_bh(&pxmitpriv->lock); } void enqueue_pending_xmitbuf(struct xmit_priv *pxmitpriv, struct xmit_buf *pxmitbuf) diff --git a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c index b5d5e922231c..15810438a472 100644 --- a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c +++ b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c @@ -502,7 +502,9 @@ s32 rtl8723bs_hal_xmit( rtw_issue_addbareq_cmd(padapter, pxmitframe); } + spin_lock_bh(&pxmitpriv->lock); err = rtw_xmitframe_enqueue(padapter, pxmitframe); + spin_unlock_bh(&pxmitpriv->lock); if (err != _SUCCESS) { rtw_free_xmitframe(pxmitpriv, pxmitframe); From 342e7c6ea58200e45bcaa9bdd8402a5531c4777e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 2 Mar 2022 11:16:37 +0100 Subject: [PATCH 568/773] staging: rtl8723bs: Improve the comment explaining the locking rules rtw_mlme.h has a comment which briefly describes the locking rules for the rtl8723bs driver, improve this to also mention the locking order of xmit_priv.lock vs the lock(s) embedded in the various queues. Cc: Fabio Aiuto Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20220302101637.26542-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/include/rtw_mlme.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/staging/rtl8723bs/include/rtw_mlme.h b/drivers/staging/rtl8723bs/include/rtw_mlme.h index c94fa7d8d5a9..1b343b434f4d 100644 --- a/drivers/staging/rtl8723bs/include/rtw_mlme.h +++ b/drivers/staging/rtl8723bs/include/rtw_mlme.h @@ -102,13 +102,17 @@ there are several "locks" in mlme_priv, since mlme_priv is a shared resource between many threads, like ISR/Call-Back functions, the OID handlers, and even timer functions. - Each struct __queue has its own locks, already. -Other items are protected by mlme_priv.lock. +Other items in mlme_priv are protected by mlme_priv.lock, while items in +xmit_priv are protected by xmit_priv.lock. To avoid possible dead lock, any thread trying to modifiying mlme_priv SHALL not lock up more than one locks at a time! +The only exception is that queue functions which take the __queue.lock +may be called with the xmit_priv.lock held. In this case the order +MUST always be first lock xmit_priv.lock and then call any queue functions +which take __queue.lock. */ From c992fa1fd52380d0c4ced7b07479e877311ae645 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 18 Feb 2022 10:13:00 +0800 Subject: [PATCH 569/773] btrfs: subpage: fix a wrong check on subpage->writers [BUG] When looping btrfs/074 with 64K page size and 4K sectorsize, there is a low chance (1/50~1/100) to crash with the following ASSERT() triggered in btrfs_subpage_start_writer(): ret = atomic_add_return(nbits, &subpage->writers); ASSERT(ret == nbits); <<< This one <<< [CAUSE] With more debugging output on the parameters of btrfs_subpage_start_writer(), it shows a very concerning error: ret=29 nbits=13 start=393216 len=53248 For @nbits it's correct, but @ret which is the returned value from atomic_add_return(), it's not only larger than nbits, but also larger than max sectors per page value (for 64K page size and 4K sector size, it's 16). This indicates that some call sites are not properly decreasing the value. And that's exactly the case, in btrfs_page_unlock_writer(), due to the fact that we can have page locked either by lock_page() or process_one_page(), we have to check if the subpage has any writer. If no writers, it's locked by lock_page() and we only need to unlock it. But unfortunately the check for the writers are completely opposite: if (atomic_read(&subpage->writers)) /* No writers, locked by plain lock_page() */ return unlock_page(page); We directly unlock the page if it has writers, which is the completely opposite what we want. Thankfully the affected call site is only limited to extent_write_locked_range(), so it's mostly affecting compressed write. [FIX] Just fix the wrong check condition to fix the bug. Fixes: e55a0de18572 ("btrfs: rework page locking in __extent_writepage()") CC: stable@vger.kernel.org # 5.16 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 29bd8c7a7706..ef7ae20d2b77 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers)) + if (atomic_read(&subpage->writers) == 0) /* No writers, locked by plain lock_page() */ return unlock_page(page); From d99478874355d3a7b9d86dfb5d7590d5b1754b1f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 17 Feb 2022 12:12:02 +0000 Subject: [PATCH 570/773] btrfs: fix lost prealloc extents beyond eof after full fsync When doing a full fsync, if we have prealloc extents beyond (or at) eof, and the leaves that contain them were not modified in the current transaction, we end up not logging them. This results in losing those extents when we replay the log after a power failure, since the inode is truncated to the current value of the logged i_size. Just like for the fast fsync path, we need to always log all prealloc extents starting at or beyond i_size. The fast fsync case was fixed in commit 471d557afed155 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") but it missed the full fsync path. The problem exists since the very early days, when the log tree was added by commit e02119d5a7b439 ("Btrfs: Add a write ahead tree log to optimize synchronous operations"). Example reproducer: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt # Create our test file with many file extent items, so that they span # several leaves of metadata, even if the node/page size is 64K. Use # direct IO and not fsync/O_SYNC because it's both faster and it avoids # clearing the full sync flag from the inode - we want the fsync below # to trigger the slow full sync code path. $ xfs_io -f -d -c "pwrite -b 4K 0 16M" /mnt/foo # Now add two preallocated extents to our file without extending the # file's size. One right at i_size, and another further beyond, leaving # a gap between the two prealloc extents. $ xfs_io -c "falloc -k 16M 1M" /mnt/foo $ xfs_io -c "falloc -k 20M 1M" /mnt/foo # Make sure everything is durably persisted and the transaction is # committed. This makes all created extents to have a generation lower # than the generation of the transaction used by the next write and # fsync. sync # Now overwrite only the first extent, which will result in modifying # only the first leaf of metadata for our inode. Then fsync it. This # fsync will use the slow code path (inode full sync bit is set) because # it's the first fsync since the inode was created/loaded. $ xfs_io -c "pwrite 0 4K" -c "fsync" /mnt/foo # Extent list before power failure. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x0 3: [32768..34815]: 2172928..2174975 2048 0x800 4: [34816..40959]: hole 6144 5: [40960..43007]: 2174976..2177023 2048 0x801 # Mount fs again, trigger log replay. $ mount /dev/sdc /mnt # Extent list after power failure and log replay. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x1 # The prealloc extents at file offsets 16M and 20M are missing. So fix this by calling btrfs_log_prealloc_extents() when we are doing a full fsync, so that we always log all prealloc extents beyond eof. A test case for fstests will follow soon. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3ee014c06b82..42caf595c936 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4635,7 +4635,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -5414,6 +5414,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5434,13 +5435,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; u64 other_parent = 0; @@ -5471,10 +5480,8 @@ again: btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5527,9 +5534,21 @@ next_key: break; } } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } return ret; } From a50e1fcbc9b85fd4e95b89a75c0884cb032a3e06 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 10:17:39 -0500 Subject: [PATCH 571/773] btrfs: do not WARN_ON() if we have PageError set Whenever we do any extent buffer operations we call assert_eb_page_uptodate() to complain loudly if we're operating on an non-uptodate page. Our overnight tests caught this warning earlier this week WARNING: CPU: 1 PID: 553508 at fs/btrfs/extent_io.c:6849 assert_eb_page_uptodate+0x3f/0x50 CPU: 1 PID: 553508 Comm: kworker/u4:13 Tainted: G W 5.17.0-rc3+ #564 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Workqueue: btrfs-cache btrfs_work_helper RIP: 0010:assert_eb_page_uptodate+0x3f/0x50 RSP: 0018:ffffa961440a7c68 EFLAGS: 00010246 RAX: 0017ffffc0002112 RBX: ffffe6e74453f9c0 RCX: 0000000000001000 RDX: ffffe6e74467c887 RSI: ffffe6e74453f9c0 RDI: ffff8d4c5efc2fc0 RBP: 0000000000000d56 R08: ffff8d4d4a224000 R09: 0000000000000000 R10: 00015817fa9d1ef0 R11: 000000000000000c R12: 00000000000007b1 R13: ffff8d4c5efc2fc0 R14: 0000000001500000 R15: 0000000001cb1000 FS: 0000000000000000(0000) GS:ffff8d4dbbd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff31d3448d8 CR3: 0000000118be8004 CR4: 0000000000370ee0 Call Trace: extent_buffer_test_bit+0x3f/0x70 free_space_test_bit+0xa6/0xc0 load_free_space_tree+0x1f6/0x470 caching_thread+0x454/0x630 ? rcu_read_lock_sched_held+0x12/0x60 ? rcu_read_lock_sched_held+0x12/0x60 ? rcu_read_lock_sched_held+0x12/0x60 ? lock_release+0x1f0/0x2d0 btrfs_work_helper+0xf2/0x3e0 ? lock_release+0x1f0/0x2d0 ? finish_task_switch.isra.0+0xf9/0x3a0 process_one_work+0x26d/0x580 ? process_one_work+0x580/0x580 worker_thread+0x55/0x3b0 ? process_one_work+0x580/0x580 kthread+0xf0/0x120 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 This was partially fixed by c2e39305299f01 ("btrfs: clear extent buffer uptodate when we fail to write it"), however all that fix did was keep us from finding extent buffers after a failed writeout. It didn't keep us from continuing to use a buffer that we already had found. In this case we're searching the commit root to cache the block group, so we can start committing the transaction and switch the commit root and then start writing. After the switch we can look up an extent buffer that hasn't been written yet and start processing that block group. Then we fail to write that block out and clear Uptodate on the page, and then we start spewing these errors. Normally we're protected by the tree lock to a certain degree here. If we read a block we have that block read locked, and we block the writer from locking the block before we submit it for the write. However this isn't necessarily fool proof because the read could happen before we do the submit_bio and after we locked and unlocked the extent buffer. Also in this particular case we have path->skip_locking set, so that won't save us here. We'll simply get a block that was valid when we read it, but became invalid while we were using it. What we really want is to catch the case where we've "read" a block but it's not marked Uptodate. On read we ClearPageError(), so if we're !Uptodate and !Error we know we didn't do the right thing for reading the page. Fix this by checking !Uptodate && !Error, this way we will not complain if our buffer gets invalidated while we're using it, and we'll maintain the spirit of the check which is to make sure we have a fully in-cache block while we're messing with it. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..9081223c3230 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6851,14 +6851,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } From a6ab66eb8541d61b0a11d70980f07b4c2dfeddc5 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Tue, 22 Feb 2022 16:42:07 +0800 Subject: [PATCH 572/773] btrfs: tree-checker: use u64 for item data end to avoid overflow User reported there is an array-index-out-of-bounds access while mounting the crafted image: [350.411942 ] loop0: detected capacity change from 0 to 262144 [350.427058 ] BTRFS: device fsid a62e00e8-e94e-4200-8217-12444de93c2e devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (1044) [350.428564 ] BTRFS info (device loop0): disk space caching is enabled [350.428568 ] BTRFS info (device loop0): has skinny extents [350.429589 ] [350.429619 ] UBSAN: array-index-out-of-bounds in fs/btrfs/struct-funcs.c:161:1 [350.429636 ] index 1048096 is out of range for type 'page *[16]' [350.429650 ] CPU: 0 PID: 9 Comm: kworker/u8:1 Not tainted 5.16.0-rc4 [350.429652 ] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [350.429653 ] Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs] [350.429772 ] Call Trace: [350.429774 ] [350.429776 ] dump_stack_lvl+0x47/0x5c [350.429780 ] ubsan_epilogue+0x5/0x50 [350.429786 ] __ubsan_handle_out_of_bounds+0x66/0x70 [350.429791 ] btrfs_get_16+0xfd/0x120 [btrfs] [350.429832 ] check_leaf+0x754/0x1a40 [btrfs] [350.429874 ] ? filemap_read+0x34a/0x390 [350.429878 ] ? load_balance+0x175/0xfc0 [350.429881 ] validate_extent_buffer+0x244/0x310 [btrfs] [350.429911 ] btrfs_validate_metadata_buffer+0xf8/0x100 [btrfs] [350.429935 ] end_bio_extent_readpage+0x3af/0x850 [btrfs] [350.429969 ] ? newidle_balance+0x259/0x480 [350.429972 ] end_workqueue_fn+0x29/0x40 [btrfs] [350.429995 ] btrfs_work_helper+0x71/0x330 [btrfs] [350.430030 ] ? __schedule+0x2fb/0xa40 [350.430033 ] process_one_work+0x1f6/0x400 [350.430035 ] ? process_one_work+0x400/0x400 [350.430036 ] worker_thread+0x2d/0x3d0 [350.430037 ] ? process_one_work+0x400/0x400 [350.430038 ] kthread+0x165/0x190 [350.430041 ] ? set_kthread_struct+0x40/0x40 [350.430043 ] ret_from_fork+0x1f/0x30 [350.430047 ] [350.430047 ] [350.430077 ] BTRFS warning (device loop0): bad eb member start: ptr 0xffe20f4e start 20975616 member offset 4293005178 size 2 btrfs check reports: corrupt leaf: root=3 block=20975616 physical=20975616 slot=1, unexpected item end, have 4294971193 expect 3897 The first slot item offset is 4293005033 and the size is 1966160. In check_leaf, we use btrfs_item_end() to check item boundary versus extent_buffer data size. However, return type of btrfs_item_end() is u32. (u32)(4293005033 + 1966160) == 3897, overflow happens and the result 3897 equals to leaf data size reasonably. Fix it by use u64 variable to store item data end in check_leaf() to avoid u32 overflow. This commit does solve the invalid memory access showed by the stack trace. However, its metadata profile is DUP and another copy of the leaf is fine. So the image can be mounted successfully. But when umount is called, the ASSERT btrfs_mark_buffer_dirty() will be triggered because the only node in extent tree has 0 item and invalid owner. It's solved by another commit "btrfs: check extent buffer owner against the owner rootid". Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215299 Reported-by: Wenqing Liu CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9fd145f1c4bc..aae5697dde32 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1682,6 +1682,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + u64 item_data_end; int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1696,6 +1697,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } + item_data_end = (u64)btrfs_item_offset(leaf, slot) + + btrfs_item_size(leaf, slot); /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the @@ -1706,11 +1709,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + if (unlikely(item_data_end != item_end_expected)) { generic_err(leaf, slot, - "unexpected item end, have %u expect %u", - btrfs_item_data_end(leaf, slot), - item_end_expected); + "unexpected item end, have %llu expect %u", + item_data_end, item_end_expected); return -EUCLEAN; } @@ -1719,12 +1721,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (unlikely(btrfs_item_data_end(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info))) { + if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, - "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_data_end(leaf, slot), - BTRFS_LEAF_DATA_SIZE(fs_info)); + "slot end outside of leaf, have %llu expect range [0, %u]", + item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } From b4be6aefa73c9a6899ef3ba9c5faaa8a66e333ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Feb 2022 14:56:10 -0500 Subject: [PATCH 573/773] btrfs: do not start relocation until in progress drops are done We hit a bug with a recovering relocation on mount for one of our file systems in production. I reproduced this locally by injecting errors into snapshot delete with balance running at the same time. This presented as an error while looking up an extent item WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680 CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8 RIP: 0010:lookup_inline_extent_backref+0x647/0x680 RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000 RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001 R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000 R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000 FS: 0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0 Call Trace: insert_inline_extent_backref+0x46/0xd0 __btrfs_inc_extent_ref.isra.0+0x5f/0x200 ? btrfs_merge_delayed_refs+0x164/0x190 __btrfs_run_delayed_refs+0x561/0xfa0 ? btrfs_search_slot+0x7b4/0xb30 ? btrfs_update_root+0x1a9/0x2c0 btrfs_run_delayed_refs+0x73/0x1f0 ? btrfs_update_root+0x1a9/0x2c0 btrfs_commit_transaction+0x50/0xa50 ? btrfs_update_reloc_root+0x122/0x220 prepare_to_merge+0x29f/0x320 relocate_block_group+0x2b8/0x550 btrfs_relocate_block_group+0x1a6/0x350 btrfs_relocate_chunk+0x27/0xe0 btrfs_balance+0x777/0xe60 balance_kthread+0x35/0x50 ? btrfs_balance+0xe60/0xe60 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Normally snapshot deletion and relocation are excluded from running at the same time by the fs_info->cleaner_mutex. However if we had a pending balance waiting to get the ->cleaner_mutex, and a snapshot deletion was running, and then the box crashed, we would come up in a state where we have a half deleted snapshot. Again, in the normal case the snapshot deletion needs to complete before relocation can start, but in this case relocation could very well start before the snapshot deletion completes, as we simply add the root to the dead roots list and wait for the next time the cleaner runs to clean up the snapshot. Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that had a pending drop_progress key. If they do then we know we were in the middle of the drop operation and set a flag on the fs_info. Then balance can wait until this flag is cleared to start up again. If there are DEAD_ROOT's that don't have a drop_progress set then we're safe to start balance right away as we'll be properly protected by the cleaner_mutex. CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 10 ++++++++++ fs/btrfs/disk-io.c | 10 ++++++++++ fs/btrfs/extent-tree.c | 10 ++++++++++ fs/btrfs/relocation.c | 13 +++++++++++++ fs/btrfs/root-tree.c | 15 +++++++++++++++ fs/btrfs/transaction.c | 33 ++++++++++++++++++++++++++++++++- fs/btrfs/transaction.h | 1 + 7 files changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 947f04789389..ebb2d109e8bb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -602,6 +602,9 @@ enum { /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -1106,8 +1109,15 @@ enum { BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, }; +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..48590a380762 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + */ + btrfs_wake_unfinished_drop(fs_info); + /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d89273c4b6b8..96427b1ecac3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5838,6 +5841,13 @@ out_free: kfree(wc); btrfs_free_path(path); out: + /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5465197996d..9d8054839782 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + /* + * This only gets set if we had a half-deleted snapshot on mount. We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3d68d2dcd83e..ca7426ef61c8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this snapshot is completely dropped. + */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c3cfdfd8de9b..f17bf3764ce8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1319,6 +1319,32 @@ again: return 0; } +/* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + /* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to @@ -1331,7 +1357,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9402d8d94484..ba8a9826eb37 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -216,6 +216,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); From 5fd76bf31ccfecc06e2e6b29f8c809e934085b99 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Feb 2022 15:14:43 -0800 Subject: [PATCH 574/773] btrfs: fix relocation crash due to premature return from btrfs_commit_transaction() We are seeing crashes similar to the following trace: [38.969182] WARNING: CPU: 20 PID: 2105 at fs/btrfs/relocation.c:4070 btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.973556] CPU: 20 PID: 2105 Comm: btrfs Not tainted 5.17.0-rc4 #54 [38.974580] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [38.976539] RIP: 0010:btrfs_relocate_block_group+0x2dc/0x340 [btrfs] [38.980336] RSP: 0000:ffffb0dd42e03c20 EFLAGS: 00010206 [38.981218] RAX: ffff96cfc4ede800 RBX: ffff96cfc3ce0000 RCX: 000000000002ca14 [38.982560] RDX: 0000000000000000 RSI: 4cfd109a0bcb5d7f RDI: ffff96cfc3ce0360 [38.983619] RBP: ffff96cfc309c000 R08: 0000000000000000 R09: 0000000000000000 [38.984678] R10: ffff96cec0000001 R11: ffffe84c80000000 R12: ffff96cfc4ede800 [38.985735] R13: 0000000000000000 R14: 0000000000000000 R15: ffff96cfc3ce0360 [38.987146] FS: 00007f11c15218c0(0000) GS:ffff96d6dfb00000(0000) knlGS:0000000000000000 [38.988662] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [38.989398] CR2: 00007ffc922c8e60 CR3: 00000001147a6001 CR4: 0000000000370ee0 [38.990279] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [38.991219] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [38.992528] Call Trace: [38.992854] [38.993148] btrfs_relocate_chunk+0x27/0xe0 [btrfs] [38.993941] btrfs_balance+0x78e/0xea0 [btrfs] [38.994801] ? vsnprintf+0x33c/0x520 [38.995368] ? __kmalloc_track_caller+0x351/0x440 [38.996198] btrfs_ioctl_balance+0x2b9/0x3a0 [btrfs] [38.997084] btrfs_ioctl+0x11b0/0x2da0 [btrfs] [38.997867] ? mod_objcg_state+0xee/0x340 [38.998552] ? seq_release+0x24/0x30 [38.999184] ? proc_nr_files+0x30/0x30 [38.999654] ? call_rcu+0xc8/0x2f0 [39.000228] ? __x64_sys_ioctl+0x84/0xc0 [39.000872] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [39.001973] __x64_sys_ioctl+0x84/0xc0 [39.002566] do_syscall_64+0x3a/0x80 [39.003011] entry_SYSCALL_64_after_hwframe+0x44/0xae [39.003735] RIP: 0033:0x7f11c166959b [39.007324] RSP: 002b:00007fff2543e998 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [39.008521] RAX: ffffffffffffffda RBX: 00007f11c1521698 RCX: 00007f11c166959b [39.009833] RDX: 00007fff2543ea40 RSI: 00000000c4009420 RDI: 0000000000000003 [39.011270] RBP: 0000000000000003 R08: 0000000000000013 R09: 00007f11c16f94e0 [39.012581] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff25440df3 [39.014046] R13: 0000000000000000 R14: 00007fff2543ea40 R15: 0000000000000001 [39.015040] [39.015418] ---[ end trace 0000000000000000 ]--- [43.131559] ------------[ cut here ]------------ [43.132234] kernel BUG at fs/btrfs/extent-tree.c:2717! [43.133031] invalid opcode: 0000 [#1] PREEMPT SMP PTI [43.133702] CPU: 1 PID: 1839 Comm: btrfs Tainted: G W 5.17.0-rc4 #54 [43.134863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [43.136426] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.139913] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.140629] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.141604] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.142645] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.143669] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.144657] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.145686] FS: 00007f7657dd68c0(0000) GS:ffff96d6df640000(0000) knlGS:0000000000000000 [43.146808] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.147584] CR2: 00007f7fe81bf5b0 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.148589] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.149581] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [43.150559] Call Trace: [43.150904] [43.151253] btrfs_finish_extent_commit+0x88/0x290 [btrfs] [43.152127] btrfs_commit_transaction+0x74f/0xaa0 [btrfs] [43.152932] ? btrfs_attach_transaction_barrier+0x1e/0x50 [btrfs] [43.153786] btrfs_ioctl+0x1edc/0x2da0 [btrfs] [43.154475] ? __check_object_size+0x150/0x170 [43.155170] ? preempt_count_add+0x49/0xa0 [43.155753] ? __x64_sys_ioctl+0x84/0xc0 [43.156437] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [43.157456] __x64_sys_ioctl+0x84/0xc0 [43.157980] do_syscall_64+0x3a/0x80 [43.158543] entry_SYSCALL_64_after_hwframe+0x44/0xae [43.159231] RIP: 0033:0x7f7657f1e59b [43.161819] RSP: 002b:00007ffda5cd1658 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [43.162702] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f7657f1e59b [43.163526] RDX: 0000000000000000 RSI: 0000000000009408 RDI: 0000000000000003 [43.164358] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 [43.165208] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [43.166029] R13: 00005621b91c3232 R14: 00005621b91ba580 R15: 00007ffda5cd1800 [43.166863] [43.167125] Modules linked in: btrfs blake2b_generic xor pata_acpi ata_piix libata raid6_pq scsi_mod libcrc32c virtio_net virtio_rng net_failover rng_core failover scsi_common [43.169552] ---[ end trace 0000000000000000 ]--- [43.171226] RIP: 0010:unpin_extent_range+0x37a/0x4f0 [btrfs] [43.174767] RSP: 0000:ffffb0dd4216bc70 EFLAGS: 00010246 [43.175600] RAX: 0000000000000000 RBX: ffff96cfc34490f8 RCX: 0000000000000001 [43.176468] RDX: 0000000080000001 RSI: 0000000051d00000 RDI: 00000000ffffffff [43.177357] RBP: 0000000000000000 R08: 0000000000000000 R09: ffff96cfd07dca50 [43.178271] R10: ffff96cfc46e8a00 R11: fffffffffffec000 R12: 0000000041d00000 [43.179178] R13: ffff96cfc3ce0000 R14: ffffb0dd4216bd08 R15: 0000000000000000 [43.180071] FS: 00007f7657dd68c0(0000) GS:ffff96d6df800000(0000) knlGS:0000000000000000 [43.181073] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43.181808] CR2: 00007fe09905f010 CR3: 00000001093ee004 CR4: 0000000000370ee0 [43.182706] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [43.183591] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 We first hit the WARN_ON(rc->block_group->pinned > 0) in btrfs_relocate_block_group() and then the BUG_ON(!cache) in unpin_extent_range(). This tells us that we are exiting relocation and removing the block group with bytes still pinned for that block group. This is supposed to be impossible: the last thing relocate_block_group() does is commit the transaction to get rid of pinned extents. Commit d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") introduced an optimization so that commits from fsync don't have to wait for the previous commit to unpin extents. This was only intended to affect fsync, but it inadvertently made it possible for any commit to skip waiting for the previous commit to unpin. This is because if a call to btrfs_commit_transaction() finds that another thread is already committing the transaction, it waits for the other thread to complete the commit and then returns. If that other thread was in fsync, then it completes the commit without completing the previous commit. This makes the following sequence of events possible: Thread 1____________________|Thread 2 (fsync)_____________________|Thread 3 (balance)___________________ btrfs_commit_transaction(N) | | btrfs_run_delayed_refs | | pin extents | | ... | | state = UNBLOCKED |btrfs_sync_file | | btrfs_start_transaction(N + 1) |relocate_block_group | | btrfs_join_transaction(N + 1) | btrfs_commit_transaction(N + 1) | ... | trans->state = COMMIT_START | | | btrfs_commit_transaction(N + 1) | | wait_for_commit(N + 1, COMPLETED) | wait_for_commit(N, SUPER_COMMITTED)| state = SUPER_COMMITTED | ... | btrfs_finish_extent_commit| | unpin_extent_range() | trans->state = COMPLETED | | | return | | ... | |Thread 1 isn't done, so pinned > 0 | |and we WARN | | | |btrfs_remove_block_group unpin_extent_range() | | Thread 3 removed the | | block group, so we BUG| | There are other sequences involving SUPER_COMMITTED transactions that can cause a similar outcome. We could fix this by making relocation explicitly wait for unpinning, but there may be other cases that need it. Josef mentioned ENOSPC flushing and the free space cache inode as other potential victims. Rather than playing whack-a-mole, this fix is conservative and makes all commits not in fsync wait for all previous transactions, which is what the optimization intended. Fixes: d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f17bf3764ce8..1f1c25db6f6b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) From d4aef1e122d8bbdc15ce3bd0bc813d6b44a7d63a Mon Sep 17 00:00:00 2001 From: Sidong Yang Date: Mon, 28 Feb 2022 01:43:40 +0000 Subject: [PATCH 575/773] btrfs: qgroup: fix deadlock between rescan worker and remove qgroup The commit e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") by Kawasaki resolves deadlock between quota disable and qgroup rescan worker. But also there is a deadlock case like it. It's about enabling or disabling quota and creating or removing qgroup. It can be reproduced in simple script below. for i in {1..100} do btrfs quota enable /mnt & btrfs qgroup create 1/0 /mnt & btrfs qgroup destroy 1/0 /mnt & btrfs quota disable /mnt & done Here's why the deadlock happens: 1) The quota rescan task is running. 2) Task A calls btrfs_quota_disable(), locks the qgroup_ioctl_lock mutex, and then calls btrfs_qgroup_wait_for_completion(), to wait for the quota rescan task to complete. 3) Task B calls btrfs_remove_qgroup() and it blocks when trying to lock the qgroup_ioctl_lock mutex, because it's being held by task A. At that point task B is holding a transaction handle for the current transaction. 4) The quota rescan task calls btrfs_commit_transaction(). This results in it waiting for all other tasks to release their handles on the transaction, but task B is blocked on the qgroup_ioctl_lock mutex while holding a handle on the transaction, and that mutex is being held by task A, which is waiting for the quota rescan task to complete, resulting in a deadlock between these 3 tasks. To resolve this issue, the thread disabling quota should unlock qgroup_ioctl_lock before waiting rescan completion. Move btrfs_qgroup_wait_for_completion() after unlock of qgroup_ioctl_lock. Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Reviewed-by: Shin'ichiro Kawasaki Signed-off-by: Sidong Yang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f12dc687350c..30d42ea655ce 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1196,6 +1196,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (!fs_info->quota_root) goto out; + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may @@ -1203,7 +1211,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) */ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); - mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * 1 For the root item From 4751dc99627e4d1465c5bfa8cb7ab31ed418eff5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Feb 2022 16:29:28 +0000 Subject: [PATCH 576/773] btrfs: add missing run of delayed items after unlink during log replay During log replay, whenever we need to check if a name (dentry) exists in a directory we do searches on the subvolume tree for inode references or or directory entries (BTRFS_DIR_INDEX_KEY keys, and BTRFS_DIR_ITEM_KEY keys as well, before kernel 5.17). However when during log replay we unlink a name, through btrfs_unlink_inode(), we may not delete inode references and dir index keys from a subvolume tree and instead just add the deletions to the delayed inode's delayed items, which will only be run when we commit the transaction used for log replay. This means that after an unlink operation during log replay, if we attempt to search for the same name during log replay, we will not see that the name was already deleted, since the deletion is recorded only on the delayed items. We run delayed items after every unlink operation during log replay, except at unlink_old_inode_refs() and at add_inode_ref(). This was due to an overlook, as delayed items should be run after evert unlink, for the reasons stated above. So fix those two cases. Fixes: 0d836392cadd5 ("Btrfs: fix mount failure after fsync due to hard link recreation") Fixes: 1f250e929a9c9 ("Btrfs: fix log replay failure after unlink and link combination") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 42caf595c936..6bc8834ac8f7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1362,6 +1362,15 @@ again: inode, name, namelen); kfree(name); iput(dir); + /* + * Whenever we need to check if a name exists or not, we + * check the subvolume tree. So after an unlink we must + * run delayed items, so that future checks for a name + * during log replay see that the name does not exists + * anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); if (ret) goto out; goto again; @@ -1614,6 +1623,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, */ if (!ret && inode->i_nlink == 0) inc_nlink(inode); + /* + * Whenever we need to check if a name exists or + * not, we check the subvolume tree. So after an + * unlink we must run delayed items, so that future + * checks for a name during log replay see that the + * name does not exists anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); } if (ret < 0) goto out; From c6c937d673aaa1d603f62f134e1ca9c173eeeed3 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 1 Mar 2022 20:49:41 +0800 Subject: [PATCH 577/773] KVM: x86/mmu: Passing up the error state of mmu_alloc_shadow_roots() Just like on the optional mmu_alloc_direct_roots() path, once shadow path reaches "r = -EIO" somewhere, the caller needs to know the actual state in order to enter error handling and avoid something worse. Fixes: 4a38162ee9f1 ("KVM: MMU: load PDPTRs outside mmu_lock") Signed-off-by: Like Xu Reviewed-by: Sean Christopherson Message-Id: <20220301124941.48412-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8e24f73bf60b..5628d0ba637e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3565,7 +3565,7 @@ set_root_pgd: out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - return 0; + return r; } static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) From 8d25b7beca7ed6ca34f53f0f8abd009e2be15d94 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 19 Feb 2022 04:28:20 -0500 Subject: [PATCH 578/773] KVM: x86: pull kvm->srcu read-side to kvm_arch_vcpu_ioctl_run kvm_arch_vcpu_ioctl_run is already doing srcu_read_lock/unlock in two places, namely vcpu_run and post_kvm_run_save, and a third is actually needed around the call to vcpu->arch.complete_userspace_io to avoid the following splat: WARNING: suspicious RCU usage arch/x86/kvm/pmu.c:190 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by CPU 28/KVM/370841: #0: ff11004089f280b8 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x87/0x730 [kvm] Call Trace: dump_stack_lvl+0x59/0x73 reprogram_fixed_counter+0x15d/0x1a0 [kvm] kvm_pmu_trigger_event+0x1a3/0x260 [kvm] ? free_moved_vector+0x1b4/0x1e0 complete_fast_pio_in+0x8a/0xd0 [kvm] This splat is not at all unexpected, since complete_userspace_io callbacks can execute similar code to vmexits. For example, SVM with nrips=false will call into the emulator from svm_skip_emulated_instruction(). While it's tempting to never acquire kvm->srcu for an uninitialized vCPU, practically speaking there's no penalty to acquiring kvm->srcu "early" as the KVM_MP_STATE_UNINITIALIZED path is a one-time thing per vCPU. On the other hand, seemingly innocuous helpers like kvm_apic_accept_events() and sync_regs() can theoretically reach code that might access SRCU-protected data structures, e.g. sync_regs() can trigger forced existing of nested mode via kvm_vcpu_ioctl_x86_set_vcpu_events(). Reported-by: Like Xu Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82a9dcd8c67f..eb4029660bd9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9180,6 +9180,7 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) likely(!pic_in_kernel(vcpu->kvm)); } +/* Called within kvm->srcu read side. */ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; @@ -9188,16 +9189,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - /* - * The call to kvm_ready_for_interrupt_injection() may end up in - * kvm_xen_has_interrupt() which may require the srcu lock to be - * held, to protect against changes in the vcpu_info address. - */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; @@ -9815,6 +9809,7 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); /* + * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the * userspace. @@ -10193,6 +10188,7 @@ out: return r; } +/* Called within kvm->srcu read side. */ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; @@ -10252,12 +10248,12 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } +/* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; struct kvm *kvm = vcpu->kvm; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vcpu->arch.l1tf_flush_l1d = true; for (;;) { @@ -10285,14 +10281,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (__xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); r = xfer_to_guest_mode_handle_work(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (r) return r; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - return r; } @@ -10398,6 +10392,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; + struct kvm *kvm = vcpu->kvm; int r; vcpu_load(vcpu); @@ -10405,6 +10400,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; @@ -10415,7 +10411,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * use before KVM has ever run the vCPU. */ WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); + + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (kvm_apic_accept_events(vcpu) < 0) { r = 0; goto out; @@ -10475,8 +10475,9 @@ out: if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); - kvm_sigset_deactivate(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_sigset_deactivate(vcpu); vcpu_put(vcpu); return r; } From 90f8f4c0e3cebd541deaa45cf0e470bb9810dd4f Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Mon, 28 Feb 2022 12:39:57 -0800 Subject: [PATCH 579/773] ptp: ocp: Add ptp_ocp_adjtime_coarse for large adjustments In ("ptp: ocp: Have FPGA fold in ns adjustment for adjtime."), the ns adjustment was written to the FPGA register, so the clock could accurately perform adjustments. However, the adjtime() call passes in a s64, while the clock adjustment registers use a s32. When trying to perform adjustments with a large value (37 sec), things fail. Examine the incoming delta, and if larger than 1 sec, use the original (coarse) adjustment method. If smaller than 1 sec, then allow the FPGA to fold in the changes over a 1 second window. Fixes: 6d59d4fa1789 ("ptp: ocp: Have FPGA fold in ns adjustment for adjtime.") Signed-off-by: Jonathan Lemon Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20220228203957.367371-1-jonathan.lemon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 0f1b5a7d2a89..17ad5f0d13b2 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -607,7 +607,7 @@ ptp_ocp_settime(struct ptp_clock_info *ptp_info, const struct timespec64 *ts) } static void -__ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u64 adj_val) +__ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u32 adj_val) { u32 select, ctrl; @@ -615,7 +615,7 @@ __ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u64 adj_val) iowrite32(OCP_SELECT_CLK_REG, &bp->reg->select); iowrite32(adj_val, &bp->reg->offset_ns); - iowrite32(adj_val & 0x7f, &bp->reg->offset_window_ns); + iowrite32(NSEC_PER_SEC, &bp->reg->offset_window_ns); ctrl = OCP_CTRL_ADJUST_OFFSET | OCP_CTRL_ENABLE; iowrite32(ctrl, &bp->reg->ctrl); @@ -624,6 +624,22 @@ __ptp_ocp_adjtime_locked(struct ptp_ocp *bp, u64 adj_val) iowrite32(select >> 16, &bp->reg->select); } +static void +ptp_ocp_adjtime_coarse(struct ptp_ocp *bp, u64 delta_ns) +{ + struct timespec64 ts; + unsigned long flags; + int err; + + spin_lock_irqsave(&bp->lock, flags); + err = __ptp_ocp_gettime_locked(bp, &ts, NULL); + if (likely(!err)) { + timespec64_add_ns(&ts, delta_ns); + __ptp_ocp_settime_locked(bp, &ts); + } + spin_unlock_irqrestore(&bp->lock, flags); +} + static int ptp_ocp_adjtime(struct ptp_clock_info *ptp_info, s64 delta_ns) { @@ -631,6 +647,11 @@ ptp_ocp_adjtime(struct ptp_clock_info *ptp_info, s64 delta_ns) unsigned long flags; u32 adj_ns, sign; + if (delta_ns > NSEC_PER_SEC || -delta_ns > NSEC_PER_SEC) { + ptp_ocp_adjtime_coarse(bp, delta_ns); + return 0; + } + sign = delta_ns < 0 ? BIT(31) : 0; adj_ns = sign ? -delta_ns : delta_ns; From 875ad06015329314c594d3302ac2bbea37774543 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 27 Feb 2022 12:00:51 -0800 Subject: [PATCH 580/773] iwlwifi: fix build error for IWLMEI When CONFIG_IWLWIFI=m and CONFIG_IWLMEI=y, the kernel build system must be told to build the iwlwifi/ subdirectory for both IWLWIFI and IWLMEI so that builds for both =y and =m are done. This resolves an undefined reference build error: ERROR: modpost: "iwl_mei_is_connected" [drivers/net/wireless/intel/iwlwifi/iwlwifi.ko] undefined! Fixes: 977df8bd5844 ("wlwifi: work around reverse dependency on MEI") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Arnd Bergmann Cc: Luca Coelho Cc: linux-wireless@vger.kernel.org Link: https://lore.kernel.org/r/20220227200051.7176-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/Makefile b/drivers/net/wireless/intel/Makefile index 1364b0014488..208e73a16051 100644 --- a/drivers/net/wireless/intel/Makefile +++ b/drivers/net/wireless/intel/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_IPW2200) += ipw2x00/ obj-$(CONFIG_IWLEGACY) += iwlegacy/ obj-$(CONFIG_IWLWIFI) += iwlwifi/ +obj-$(CONFIG_IWLMEI) += iwlwifi/ From e50b88c4f076242358b66ddb67482b96947438f2 Mon Sep 17 00:00:00 2001 From: Sreeramya Soratkal Date: Tue, 1 Mar 2022 11:33:20 +0530 Subject: [PATCH 581/773] nl80211: Update bss channel on channel switch for P2P_CLIENT The wdev channel information is updated post channel switch only for the station mode and not for the other modes. Due to this, the P2P client still points to the old value though it moved to the new channel when the channel change is induced from the P2P GO. Update the bss channel after CSA channel switch completion for P2P client interface as well. Signed-off-by: Sreeramya Soratkal Link: https://lore.kernel.org/r/1646114600-31479-1-git-send-email-quic_ssramya@quicinc.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1909ce2b739..c01fbcc848e8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17828,7 +17828,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; - if (wdev->iftype == NL80211_IFTYPE_STATION && + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && !WARN_ON(!wdev->current_bss)) cfg80211_update_assoc_bss_entry(wdev, chandef->chan); From e6e91ec966db5af4f059cfbac1af06560404b317 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 2 Mar 2022 09:27:15 +0200 Subject: [PATCH 582/773] iwlwifi: mvm: return value for request_ownership Propagate the value to the user space so it can understand if the operation failed or not. Fixes: bfcfdb59b669 ("iwlwifi: mvm: add vendor commands needed for iwlmei") Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20220302072715.4885-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c b/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c index 78450366312b..080a1587caa5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c @@ -71,12 +71,13 @@ static int iwl_mvm_vendor_host_get_ownership(struct wiphy *wiphy, { struct ieee80211_hw *hw = wiphy_to_ieee80211_hw(wiphy); struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); + int ret; mutex_lock(&mvm->mutex); - iwl_mvm_mei_get_ownership(mvm); + ret = iwl_mvm_mei_get_ownership(mvm); mutex_unlock(&mvm->mutex); - return 0; + return ret; } static const struct wiphy_vendor_command iwl_mvm_vendor_commands[] = { From 4424c35ead667ba2e8de7ab8206da66453e6f728 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Feb 2022 17:47:16 +0200 Subject: [PATCH 583/773] auxdisplay: lcd2s: Fix lcd2s_redefine_char() feature It seems that the lcd2s_redefine_char() has never been properly tested. The buffer is filled by DEF_CUSTOM_CHAR command followed by the character number (from 0 to 7), but immediately after that these bytes are rewritten by the decoded hex stream. Fix the index to fill the buffer after the command and number. Fixes: 8c9108d014c5 ("auxdisplay: add a driver for lcd2s character display") Cc: Lars Poeschel Signed-off-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven [fixed typo in commit message] Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/lcd2s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 38ba08628ccb..ea9d75ad4f16 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -238,7 +238,7 @@ static int lcd2s_redefine_char(struct charlcd *lcd, char *esc) if (buf[1] > 7) return 1; - i = 0; + i = 2; shift = 0; value = 0; while (*esc && i < LCD2S_CHARACTER_SIZE + 2) { From 898c0a15425a5bcaa8d44bd436eae5afd2483796 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Feb 2022 17:47:17 +0200 Subject: [PATCH 584/773] auxdisplay: lcd2s: Fix memory leak in ->remove() Once allocated the struct lcd2s_data is never freed. Fix the memory leak by switching to devm_kzalloc(). Fixes: 8c9108d014c5 ("auxdisplay: add a driver for lcd2s character display") Cc: Lars Poeschel Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/lcd2s.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index ea9d75ad4f16..7b9cfbbaf007 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -298,6 +298,10 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, I2C_FUNC_SMBUS_WRITE_BLOCK_DATA)) return -EIO; + lcd2s = devm_kzalloc(&i2c->dev, sizeof(*lcd2s), GFP_KERNEL); + if (!lcd2s) + return -ENOMEM; + /* Test, if the display is responding */ err = lcd2s_i2c_smbus_write_byte(i2c, LCD2S_CMD_DISPLAY_OFF); if (err < 0) @@ -307,12 +311,6 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, if (!lcd) return -ENOMEM; - lcd2s = kzalloc(sizeof(struct lcd2s_data), GFP_KERNEL); - if (!lcd2s) { - err = -ENOMEM; - goto fail1; - } - lcd->drvdata = lcd2s; lcd2s->i2c = i2c; lcd2s->charlcd = lcd; @@ -321,24 +319,22 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, err = device_property_read_u32(&i2c->dev, "display-height-chars", &lcd->height); if (err) - goto fail2; + goto fail1; err = device_property_read_u32(&i2c->dev, "display-width-chars", &lcd->width); if (err) - goto fail2; + goto fail1; lcd->ops = &lcd2s_ops; err = charlcd_register(lcd2s->charlcd); if (err) - goto fail2; + goto fail1; i2c_set_clientdata(i2c, lcd2s); return 0; -fail2: - kfree(lcd2s); fail1: kfree(lcd); return err; From 9ed331f8a0fb674f4f06edf05a1687bf755af27b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Feb 2022 17:47:18 +0200 Subject: [PATCH 585/773] auxdisplay: lcd2s: Use proper API to free the instance of charlcd object While it might work, the current approach is fragile in a few ways: - whenever members in the structure are shuffled, the pointer will be wrong - the resource freeing may include more than covered by kfree() Fix this by using charlcd_free() call instead of kfree(). Fixes: 8c9108d014c5 ("auxdisplay: add a driver for lcd2s character display") Cc: Lars Poeschel Signed-off-by: Andy Shevchenko Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/lcd2s.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 7b9cfbbaf007..2578b2d45439 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -336,7 +336,7 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c, return 0; fail1: - kfree(lcd); + charlcd_free(lcd2s->charlcd); return err; } @@ -345,7 +345,7 @@ static int lcd2s_i2c_remove(struct i2c_client *i2c) struct lcd2s_data *lcd2s = i2c_get_clientdata(i2c); charlcd_unregister(lcd2s->charlcd); - kfree(lcd2s->charlcd); + charlcd_free(lcd2s->charlcd); return 0; } From f1ef17011c765495c876fa75435e59eecfdc1ee4 Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Tue, 1 Mar 2022 14:11:59 +0800 Subject: [PATCH 586/773] drm/amdgpu: fix suspend/resume hang regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regression has been reported that suspend/resume may hang with the previous vm ready check commit. So bring back the evicted list check as a temp fix. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1922 Fixes: c1a66c3bc425 ("drm/amdgpu: check vm ready by amdgpu_vm->evicting flag") Reviewed-by: Christian König Signed-off-by: Qiang Yu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index d62190b3dd9b..418341a67517 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -777,7 +777,8 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) amdgpu_vm_eviction_lock(vm); ret = !vm->evicting; amdgpu_vm_eviction_unlock(vm); - return ret; + + return ret && list_empty(&vm->evicted); } /** From 224102de2ff105a2c05695e66a08f4b5b6b2d19c Mon Sep 17 00:00:00 2001 From: lena wang Date: Tue, 1 Mar 2022 19:17:09 +0800 Subject: [PATCH 587/773] net: fix up skbs delta_truesize in UDP GRO frag_list The truesize for a UDP GRO packet is added by main skb and skbs in main skb's frag_list: skb_gro_receive_list p->truesize += skb->truesize; The commit 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist") introduced a truesize increase for frag_list skbs. When uncloning skb, it will call pskb_expand_head and trusesize for frag_list skbs may increase. This can occur when allocators uses __netdev_alloc_skb and not jump into __alloc_skb. This flow does not use ksize(len) to calculate truesize while pskb_expand_head uses. skb_segment_list err = skb_unclone(nskb, GFP_ATOMIC); pskb_expand_head if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize; If we uses increased truesize adding as delta_truesize, it will be larger than before and even larger than previous total truesize value if skbs in frag_list are abundant. The main skb truesize will become smaller and even a minus value or a huge value for an unsigned int parameter. Then the following memory check will drop this abnormal skb. To avoid this error we should use the original truesize to segment the main skb. Fixes: 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist") Signed-off-by: lena wang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/1646133431-8948-1-git-send-email-lena.wang@mediatek.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8138c372535..ea51e23e9247 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3876,6 +3876,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, list_skb = list_skb->next; err = 0; + delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { @@ -3900,7 +3901,6 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, tail = nskb; delta_len += nskb->len; - delta_truesize += nskb->truesize; skb_push(nskb, -skb_network_offset(nskb) + offset); From 1dba41c9d2e2dc94b543394974f63d55aa195bfe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 1 Mar 2022 05:34:40 -0600 Subject: [PATCH 588/773] net: ipa: add an interconnect dependency In order to function, the IPA driver very clearly requires the interconnect framework to be enabled in the kernel configuration. State that dependency in the Kconfig file. This became a problem when CONFIG_COMPILE_TEST support was added. Non-Qualcomm platforms won't necessarily enable CONFIG_INTERCONNECT. Reported-by: kernel test robot Fixes: 38a4066f593c5 ("net: ipa: support COMPILE_TEST") Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20220301113440.257916-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/Kconfig b/drivers/net/ipa/Kconfig index e0164a55c1e6..6782c2cbf542 100644 --- a/drivers/net/ipa/Kconfig +++ b/drivers/net/ipa/Kconfig @@ -2,6 +2,7 @@ config QCOM_IPA tristate "Qualcomm IPA support" depends on NET && QCOM_SMEM depends on ARCH_QCOM || COMPILE_TEST + depends on INTERCONNECT depends on QCOM_RPROC_COMMON || (QCOM_RPROC_COMMON=n && COMPILE_TEST) depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select QCOM_MDT_LOADER if ARCH_QCOM From 60ce37b03917e593d8e5d8bcc7ec820773daf81d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2022 08:17:22 -0800 Subject: [PATCH 589/773] bpf, sockmap: Do not ignore orig_len parameter Currently, sk_psock_verdict_recv() returns skb->len This is problematic because tcp_read_sock() might have passed orig_len < skb->len, due to the presence of TCP urgent data. This causes an infinite loop from tcp_read_sock() Followup patch will make tcp_read_sock() more robust vs bad actors. Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8eb671c827f9..929a2b096b04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1153,7 +1153,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = skb->len; + int len = orig_len; /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); From e3d5ea2c011ecb16fb94c56a659364e6b30fac94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2022 08:17:23 -0800 Subject: [PATCH 590/773] tcp: make tcp_read_sock() more robust If recv_actor() returns an incorrect value, tcp_read_sock() might loop forever. Instead, issue a one time warning and make sure to make progress. Signed-off-by: Eric Dumazet Acked-by: John Fastabend Acked-by: Jakub Sitnicki Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220302161723.3910001-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 02cb275e5487..28ff2a820f7c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1684,11 +1684,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it From bd6f1fd5d33dfe5d1b4f2502d3694a7cc13f166d Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Wed, 2 Mar 2022 20:24:23 +0800 Subject: [PATCH 591/773] net: arcnet: com20020: Fix null-ptr-deref in com20020pci_probe() During driver initialization, the pointer of card info, i.e. the variable 'ci' is required. However, the definition of 'com20020pci_id_table' reveals that this field is empty for some devices, which will cause null pointer dereference when initializing these devices. The following log reveals it: [ 3.973806] KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [ 3.973819] RIP: 0010:com20020pci_probe+0x18d/0x13e0 [com20020_pci] [ 3.975181] Call Trace: [ 3.976208] local_pci_probe+0x13f/0x210 [ 3.977248] pci_device_probe+0x34c/0x6d0 [ 3.977255] ? pci_uevent+0x470/0x470 [ 3.978265] really_probe+0x24c/0x8d0 [ 3.978273] __driver_probe_device+0x1b3/0x280 [ 3.979288] driver_probe_device+0x50/0x370 Fix this by checking whether the 'ci' is a null pointer first. Fixes: 8c14f9c70327 ("ARCNET: add com20020 PCI IDs with metadata") Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 6382e1937cca..c580acb8b1d3 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -138,6 +138,9 @@ static int com20020pci_probe(struct pci_dev *pdev, return -ENOMEM; ci = (struct com20020_pci_card_info *)id->driver_data; + if (!ci) + return -EINVAL; + priv->ci = ci; mm = &ci->misc_map; From 0537f0a2151375dcf90c1bbfda6a0aaf57164e89 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:11 +0800 Subject: [PATCH 592/773] net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error generated by client The main reason for this unexpected SMC_CLC_DECL_ERR_REGRMB in client dues to following execution sequence: Server Conn A: Server Conn B: Client Conn B: smc_lgr_unregister_conn smc_lgr_register_conn smc_clc_send_accept -> smc_rtoken_add smcr_buf_unuse -> Client Conn A: smc_rtoken_delete smc_lgr_unregister_conn() makes current link available to assigned to new incoming connection, while smcr_buf_unuse() has not executed yet, which means that smc_rtoken_add may fail because of insufficient rtoken_entry, reversing their execution order will avoid this problem. Fixes: 3e034725c0d8 ("net/smc: common functions for RMBs and send buffers") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 29525d03b253..f8c9675f4af3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1161,8 +1161,8 @@ void smc_conn_free(struct smc_connection *conn) cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { - smc_lgr_unregister_conn(conn); smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) From 4940a1fdf31c39f0806ac831cde333134862030b Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 2 Mar 2022 21:25:12 +0800 Subject: [PATCH 593/773] net/smc: fix unexpected SMC_CLC_DECL_ERR_REGRMB error cause by server The problem of SMC_CLC_DECL_ERR_REGRMB on the server is very clear. Based on the fact that whether a new SMC connection can be accepted or not depends on not only the limit of conn nums, but also the available entries of rtoken. Since the rtoken release is trigger by peer, while the conn nums is decrease by local, tons of thing can happen in this time difference. This only thing that needs to be mentioned is that now all connection creations are completely protected by smc_server_lgr_pending lock, it's enough to check only the available entries in rtokens_used_mask. Fixes: cd6851f30386 ("smc: remote memory buffers (RMBs)") Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f8c9675f4af3..be7d704976ff 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1864,7 +1864,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; From 815d5121927093017947fd76e627da03f0f70be7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 2 Mar 2022 14:44:39 +0100 Subject: [PATCH 594/773] Bluetooth: hci_core: Fix unbalanced unlock in set_device_flags() There is only one "goto done;" in set_device_flags() and this happens *before* hci_dev_lock() is called, move the done label to after the hci_dev_unlock() to fix the following unlock balance: [ 31.493567] ===================================== [ 31.493571] WARNING: bad unlock balance detected! [ 31.493576] 5.17.0-rc2+ #13 Tainted: G C E [ 31.493581] ------------------------------------- [ 31.493584] bluetoothd/685 is trying to release lock (&hdev->lock) at: [ 31.493594] [] set_device_flags+0x65/0x1f0 [bluetooth] [ 31.493684] but there are no more locks to release! Note this bug has been around for a couple of years, but before commit fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags") supported_flags was hardcoded to "((1U << HCI_CONN_FLAG_MAX) - 1)" so the check for unsupported flags which does the "goto done;" never triggered. Fixes: fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags") Cc: Luiz Augusto von Dentz Signed-off-by: Hans de Goede Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 533cf60673a3..230a7a8196c0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4541,9 +4541,9 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, } } -done: hci_dev_unlock(hdev); +done: if (status == MGMT_STATUS_SUCCESS) device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); From 008ee9eb8a11bcabf12c91771dd4f470b082bd44 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 2 Mar 2022 13:02:45 -0800 Subject: [PATCH 595/773] Bluetooth: hci_sync: Fix not processing all entries on cmd_sync_work hci_cmd_sync_queue can be called multiple times, each adding a hci_cmd_sync_work_entry, before hci_cmd_sync_work is run so this makes sure they are all dequeued properly otherwise it creates a backlog of entries that are never run. Link: https://lore.kernel.org/all/CAJCQCtSeUtHCgsHXLGrSTWKmyjaQDbDNpP4rb0i+RE+L2FTXSA@mail.gmail.com/T/ Fixes: 6a98e3836fa20 ("Bluetooth: Add helper for serialized HCI command execution") Tested-by: Chris Clayton Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sync.c | 57 +++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9ba2a1a7d481..ab9aa700b6b3 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -276,40 +276,37 @@ EXPORT_SYMBOL(__hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); - struct hci_cmd_sync_work_entry *entry; - hci_cmd_sync_work_func_t func; - hci_cmd_sync_work_destroy_t destroy; - void *data; bt_dev_dbg(hdev, ""); - mutex_lock(&hdev->cmd_sync_work_lock); - entry = list_first_entry(&hdev->cmd_sync_work_list, - struct hci_cmd_sync_work_entry, list); - if (entry) { - list_del(&entry->list); - func = entry->func; - data = entry->data; - destroy = entry->destroy; + /* Dequeue all entries and run them */ + while (1) { + struct hci_cmd_sync_work_entry *entry; + + mutex_lock(&hdev->cmd_sync_work_lock); + entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, + struct hci_cmd_sync_work_entry, + list); + if (entry) + list_del(&entry->list); + mutex_unlock(&hdev->cmd_sync_work_lock); + + if (!entry) + break; + + bt_dev_dbg(hdev, "entry %p", entry); + + if (entry->func) { + int err; + + hci_req_sync_lock(hdev); + err = entry->func(hdev, entry->data); + if (entry->destroy) + entry->destroy(hdev, entry->data, err); + hci_req_sync_unlock(hdev); + } + kfree(entry); - } else { - func = NULL; - data = NULL; - destroy = NULL; - } - mutex_unlock(&hdev->cmd_sync_work_lock); - - if (func) { - int err; - - hci_req_sync_lock(hdev); - - err = func(hdev, data); - - if (destroy) - destroy(hdev, data, err); - - hci_req_sync_unlock(hdev); } } From f1fb205efb0ccca55626fd4ef38570dd16b44719 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Tue, 1 Mar 2022 23:28:22 +0100 Subject: [PATCH 596/773] sfc: extend the locking on mcdi->seqno seqno could be read as a stale value outside of the lock. The lock is already acquired to protect the modification of seqno against a possible race condition. Place the reading of this value also inside this locking to protect it against a possible race condition. Signed-off-by: Niels Dossche Acked-by: Martin Habets Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/mcdi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c index be6bfd6b7ec7..50baf62b2cbc 100644 --- a/drivers/net/ethernet/sfc/mcdi.c +++ b/drivers/net/ethernet/sfc/mcdi.c @@ -163,9 +163,9 @@ static void efx_mcdi_send_request(struct efx_nic *efx, unsigned cmd, /* Serialise with efx_mcdi_ev_cpl() and efx_mcdi_ev_death() */ spin_lock_bh(&mcdi->iface_lock); ++mcdi->seqno; + seqno = mcdi->seqno & SEQ_MASK; spin_unlock_bh(&mcdi->iface_lock); - seqno = mcdi->seqno & SEQ_MASK; xflags = 0; if (mcdi->mode == MCDI_MODE_EVENTS) xflags |= MCDI_HEADER_XFLAGS_EVREQ; From 8ccffe9ac3239e549beaa0a9d5e1a1eac94e866c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 2 Mar 2022 21:21:15 +0100 Subject: [PATCH 597/773] bnx2: Fix an error message Fix an error message and report the correct failing function. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index e20aafeb4ca9..b97ed9b5f685 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -8216,7 +8216,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) rc = dma_set_coherent_mask(&pdev->dev, persist_dma_mask); if (rc) { dev_err(&pdev->dev, - "pci_set_consistent_dma_mask failed, aborting\n"); + "dma_set_coherent_mask failed, aborting\n"); goto err_out_unmap; } } else if ((rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) != 0) { From 10b6bb62ae1a49ee818fc479cf57b8900176773e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Mar 2022 21:39:39 +0200 Subject: [PATCH 598/773] net: dcb: disable softirqs in dcbnl_flush_dev() Ido Schimmel points out that since commit 52cff74eef5d ("dcbnl : Disable software interrupts before taking dcb_lock"), the DCB API can be called by drivers from softirq context. One such in-tree example is the chelsio cxgb4 driver: dcb_rpl -> cxgb4_dcb_handle_fw_update -> dcb_ieee_setapp If the firmware for this driver happened to send an event which resulted in a call to dcb_ieee_setapp() at the exact same time as another DCB-enabled interface was unregistering on the same CPU, the softirq would deadlock, because the interrupted process was already holding the dcb_lock in dcbnl_flush_dev(). Fix this unlikely event by using spin_lock_bh() in dcbnl_flush_dev() as in the rest of the dcbnl code. Fixes: 91b0383fef06 ("net: dcb: flush lingering app table entries for unregistered devices") Reported-by: Ido Schimmel Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220302193939.1368823-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dcb/dcbnl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 36c91273daac..dc4fb699b56c 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2077,7 +2077,7 @@ static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { @@ -2086,7 +2086,7 @@ static void dcbnl_flush_dev(struct net_device *dev) } } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, From dc9752075341e7beb653e37c6f4a3723074dc8bc Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 2 Mar 2022 18:14:46 +0200 Subject: [PATCH 599/773] selftests: mlxsw: tc_police_scale: Make test more robust The test adds tc filters and checks how many of them were offloaded by grepping for 'in_hw'. iproute2 commit f4cd4f127047 ("tc: add skip_hw and skip_sw to control action offload") added offload indication to tc actions, producing the following output: $ tc filter show dev swp2 ingress ... filter protocol ipv6 pref 1000 flower chain 0 handle 0x7c0 eth_type ipv6 dst_ip 2001:db8:1::7bf skip_sw in_hw in_hw_count 1 action order 1: police 0x7c0 rate 10Mbit burst 100Kb mtu 2Kb action drop overhead 0b ref 1 bind 1 not_in_hw used_hw_stats immediate The current grep expression matches on both 'in_hw' and 'not_in_hw', resulting in incorrect results. Fix that by using JSON output instead. Fixes: 5061e773264b ("selftests: mlxsw: Add scale test for tc-police") Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh index 3e3e06ea5703..86e787895f78 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh @@ -60,7 +60,8 @@ __tc_police_test() tc_police_rules_create $count $should_fail - offload_count=$(tc filter show dev $swp1 ingress | grep in_hw | wc -l) + offload_count=$(tc -j filter show dev $swp1 ingress | + jq "[.[] | select(.options.in_hw == true)] | length") ((offload_count == count)) check_err_fail $should_fail $? "tc police offload count" } From 196f9bc050cbc5085b4cbb61cce2efe380bc66d0 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 2 Mar 2022 18:14:47 +0200 Subject: [PATCH 600/773] selftests: mlxsw: resource_scale: Fix return value The test runs several test cases and is supposed to return an error in case at least one of them failed. Currently, the check of the return value of each test case is in the wrong place, which can result in the wrong return value. For example: # TESTS='tc_police' ./resource_scale.sh TEST: 'tc_police' [default] 968 [FAIL] tc police offload count failed Error: mlxsw_spectrum: Failed to allocate policer index. We have an error talking to the kernel Command failed /tmp/tmp.i7Oc5HwmXY:969 TEST: 'tc_police' [default] overflow 969 [ OK ] ... TEST: 'tc_police' [ipv4_max] overflow 969 [ OK ] $ echo $? 0 Fix this by moving the check to be done after each test case. Fixes: 059b18e21c63 ("selftests: mlxsw: Return correct error code in resource scale test") Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index bcb110e830ce..dea33dc93790 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -50,8 +50,8 @@ for current_test in ${TESTS:-$ALL_TESTS}; do else log_test "'$current_test' [$profile] overflow $target" fi + RET_FIN=$(( RET_FIN || RET )) done - RET_FIN=$(( RET_FIN || RET )) done done current_test="" From 6c7273a266759d9d36f7c862149f248bcdeddc0f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 2 Mar 2022 09:59:27 -0800 Subject: [PATCH 601/773] ixgbe: xsk: change !netif_carrier_ok() handling in ixgbe_xmit_zc() Commit c685c69fba71 ("ixgbe: don't do any AF_XDP zero-copy transmit if netif is not OK") addressed the ring transient state when MEM_TYPE_XSK_BUFF_POOL was being configured which in turn caused the interface to through down/up. Maurice reported that when carrier is not ok and xsk_pool is present on ring pair, ksoftirqd will consume 100% CPU cycles due to the constant NAPI rescheduling as ixgbe_poll() states that there is still some work to be done. To fix this, do not set work_done to false for a !netif_carrier_ok(). Fixes: c685c69fba71 ("ixgbe: don't do any AF_XDP zero-copy transmit if netif is not OK") Reported-by: Maurice Baijens Tested-by: Maurice Baijens Signed-off-by: Maciej Fijalkowski Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index b3fd8e5cd85b..6a5e9cf6b5da 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -390,12 +390,14 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) u32 cmd_type; while (budget-- > 0) { - if (unlikely(!ixgbe_desc_unused(xdp_ring)) || - !netif_carrier_ok(xdp_ring->netdev)) { + if (unlikely(!ixgbe_desc_unused(xdp_ring))) { work_done = false; break; } + if (!netif_carrier_ok(xdp_ring->netdev)) + break; + if (!xsk_tx_peek_desc(pool, &desc)) break; From e1bec7fa1cee311a6d3fb9161037c7675904134d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Mar 2022 17:42:49 +0200 Subject: [PATCH 602/773] net: dsa: make dsa_tree_change_tag_proto actually unwind the tag proto change The blamed commit said one thing but did another. It explains that we should restore the "return err" to the original "goto out_unwind_tagger", but instead it replaced it with "goto out_unlock". When DSA_NOTIFIER_TAG_PROTO fails after the first switch of a multi-switch tree, the switches would end up not using the same tagging protocol. Fixes: 0b0e2ff10356 ("net: dsa: restore error path of dsa_tree_change_tag_proto") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220303154249.1854436-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 209496996cdf..b4e67758e104 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1261,7 +1261,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, info.tag_ops = tag_ops; err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); if (err) - goto out_unlock; + goto out_unwind_tagger; err = dsa_tree_bind_tag_proto(dst, tag_ops); if (err) From 2d3916f3189172d5c69d33065c3c21119fe539fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: [PATCH 603/773] ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() While investigating on why a synchronize_net() has been added recently in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 4 ++-- net/ipv6/mcast.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..47ffb360ddfa 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -475,9 +475,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8861db52c18..909f937befd7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, } /* called with rcu_read_lock() */ -int igmp6_event_query(struct sk_buff *skb) +void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) @@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work) } /* called with rcu_read_lock() */ -int igmp6_event_report(struct sk_buff *skb) +void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) From b08968f196d498b19e9d0841d76a03862258f2d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Mar 2022 13:05:18 +0000 Subject: [PATCH 604/773] cachefiles: Fix incorrect length to fallocate() When cachefiles_shorten_object() calls fallocate() to shape the cache file to match the DIO size, it passes the total file size it wants to achieve, not the amount of zeros that should be inserted. Since this is meant to preallocate that amount of storage for the file, it can cause the cache to fill up the disk and hit ENOSPC. Fix this by passing the length actually required to go from the current EOF to the desired EOF. Fixes: 7623ed6772de ("cachefiles: Implement cookie resize for truncate") Reported-by: Jeffle Xu Signed-off-by: David Howells Tested-by: Jeff Layton Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164630854858.3665356.17419701804248490708.stgit@warthog.procyon.org.uk # v1 Signed-off-by: Linus Torvalds --- fs/cachefiles/interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 51c968cd00a6..ae93cee9d25d 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -254,7 +254,7 @@ static bool cachefiles_shorten_object(struct cachefiles_object *object, ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE, - new_size, dio_size); + new_size, dio_size - new_size); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); From 38f80f42147ff658aff218edb0a88c37e58bf44f Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Sat, 26 Feb 2022 14:40:56 +0700 Subject: [PATCH 605/773] MAINTAINERS: Remove dead patchwork link The patchwork link is dead. It says: 404: File not found The page URL requested (/project/LKML/list/) does not exist. Remove it. Signed-off-by: Ammar Faizi Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d2b39f444abf..05fd080b82f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21468,7 +21468,6 @@ THE REST M: Linus Torvalds L: linux-kernel@vger.kernel.org S: Buried alive in reporters -Q: http://patchwork.kernel.org/project/LKML/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git F: * F: */ From 8b274f2238950c55570ff14fcc278a7fcbecc663 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:48 +0100 Subject: [PATCH 606/773] riscv: Fix is_linear_mapping with recent move of KASAN region The KASAN region was recently moved between the linear mapping and the kernel mapping, is_linear_mapping used to check the validity of an address by using the start of the kernel mapping, which is now wrong. Fix this by using the maximum size of the physical memory. Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 2 +- arch/riscv/include/asm/pgtable.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 160e3a1e8f8b..004372f8da54 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -119,7 +119,7 @@ extern phys_addr_t phys_ram_base; ((x) >= kernel_map.virt_addr && (x) < (kernel_map.virt_addr + kernel_map.size)) #define is_linear_mapping(x) \ - ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < kernel_map.virt_addr)) + ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < PAGE_OFFSET + KERN_VIRT_SIZE)) #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)) #define kernel_mapping_pa_to_va(y) ({ \ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7e949f25c933..e3549e50de95 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -13,6 +13,7 @@ #ifndef CONFIG_MMU #define KERNEL_LINK_ADDR PAGE_OFFSET +#define KERN_VIRT_SIZE (UL(-1)) #else #define ADDRESS_SPACE_END (UL(-1)) From a3d328037846d013bb4c7f3777241e190e4c75e1 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:49 +0100 Subject: [PATCH 607/773] riscv: Fix config KASAN && SPARSEMEM && !SPARSE_VMEMMAP In order to get the pfn of a struct page* when sparsemem is enabled without vmemmap, the mem_section structures need to be initialized which happens in sparse_init. But kasan_early_init calls pfn_to_page way before sparse_init is called, which then tries to dereference a null mem_section pointer. Fix this by removing the usage of this function in kasan_early_init. Fixes: 8ad8b72721d0 ("riscv: Add KASAN support") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index f61f7ca6fe0f..85e849318389 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -202,8 +202,7 @@ asmlinkage void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PTE; ++i) set_pte(kasan_early_shadow_pte + i, - mk_pte(virt_to_page(kasan_early_shadow_page), - PAGE_KERNEL)); + pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL)); for (i = 0; i < PTRS_PER_PMD; ++i) set_pmd(kasan_early_shadow_pmd + i, From 5f763b3b59602735993149330ffa7e348bc85bc0 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:50 +0100 Subject: [PATCH 608/773] riscv: Fix DEBUG_VIRTUAL false warnings KERN_VIRT_SIZE used to encompass the kernel mapping before it was redefined when moving the kasan mapping next to the kernel mapping to only match the maximum amount of physical memory. Then, kernel mapping addresses that go through __virt_to_phys are now declared as wrong which is not true, one can use __virt_to_phys on such addresses. Fix this by redefining the condition that matches wrong addresses. Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/physaddr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index e7fd0c253c7b..19cf25a74ee2 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -8,12 +8,10 @@ phys_addr_t __virt_to_phys(unsigned long x) { - phys_addr_t y = x - PAGE_OFFSET; - /* * Boundary checking aginst the kernel linear mapping space. */ - WARN(y >= KERN_VIRT_SIZE, + WARN(!is_linear_mapping(x) && !is_kernel_mapping(x), "virt_to_phys used for non-linear address: %pK (%pS)\n", (void *)x, (void *)x); From c648c4bb7d02ceb53ee40172fdc4433b37cee9c6 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:51 +0100 Subject: [PATCH 609/773] riscv: Fix config KASAN && DEBUG_VIRTUAL __virt_to_phys function is called very early in the boot process (ie kasan_early_init) so it should not be instrumented by KASAN otherwise it bugs. Fix this by declaring phys_addr.c as non-kasan instrumentable. Signed-off-by: Alexandre Ghiti Fixes: 8ad8b72721d0 (riscv: Add KASAN support) Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 7ebaef10ea1b..ac7a25298a04 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -24,6 +24,9 @@ obj-$(CONFIG_KASAN) += kasan_init.o ifdef CONFIG_KASAN KASAN_SANITIZE_kasan_init.o := n KASAN_SANITIZE_init.o := n +ifdef CONFIG_DEBUG_VIRTUAL +KASAN_SANITIZE_physaddr.o := n +endif endif obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o From 625e24a550e6a600e639b43cf7c15879b2a70840 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:52 +0100 Subject: [PATCH 610/773] riscv: Move high_memory initialization to setup_bootmem high_memory used to be initialized in mem_init, way after setup_bootmem. But a call to dma_contiguous_reserve in this function gives rise to the below warning because high_memory is equal to 0 and is used at the very beginning at cma_declare_contiguous_nid. It went unnoticed since the move of the kasan region redefined KERN_VIRT_SIZE so that it does not encompass -1 anymore. Fix this by initializing high_memory in setup_bootmem. ------------[ cut here ]------------ virt_to_phys used for non-linear address: ffffffffffffffff (0xffffffffffffffff) WARNING: CPU: 0 PID: 0 at arch/riscv/mm/physaddr.c:14 __virt_to_phys+0xac/0x1b8 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.17.0-rc1-00007-ga68b89289e26 #27 Hardware name: riscv-virtio,qemu (DT) epc : __virt_to_phys+0xac/0x1b8 ra : __virt_to_phys+0xac/0x1b8 epc : ffffffff80014922 ra : ffffffff80014922 sp : ffffffff84a03c30 gp : ffffffff85866c80 tp : ffffffff84a3f180 t0 : ffffffff86bce657 t1 : fffffffef09406e8 t2 : 0000000000000000 s0 : ffffffff84a03c70 s1 : ffffffffffffffff a0 : 000000000000004f a1 : 00000000000f0000 a2 : 0000000000000002 a3 : ffffffff8011f408 a4 : 0000000000000000 a5 : 0000000000000000 a6 : 0000000000f00000 a7 : ffffffff84a03747 s2 : ffffffd800000000 s3 : ffffffff86ef4000 s4 : ffffffff8467f828 s5 : fffffff800000000 s6 : 8000000000006800 s7 : 0000000000000000 s8 : 0000000480000000 s9 : 0000000080038ea0 s10: 0000000000000000 s11: ffffffffffffffff t3 : ffffffff84a035c0 t4 : fffffffef09406e8 t5 : fffffffef09406e9 t6 : ffffffff84a03758 status: 0000000000000100 badaddr: 0000000000000000 cause: 0000000000000003 [] cma_declare_contiguous_nid+0xf2/0x64a [] dma_contiguous_reserve_area+0x46/0xb4 [] dma_contiguous_reserve+0x174/0x18e [] paging_init+0x12c/0x35e [] setup_arch+0x120/0x74e [] start_kernel+0xce/0x68c irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<0000000000000000>] 0x0 softirqs last enabled at (0): [<0000000000000000>] 0x0 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace 0000000000000000 ]--- Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c27294128e18..0d588032d6e6 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -125,7 +125,6 @@ void __init mem_init(void) else swiotlb_force = SWIOTLB_NO_FORCE; #endif - high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); memblock_free_all(); print_vm_layout(); @@ -195,6 +194,7 @@ static void __init setup_bootmem(void) min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); + high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); From e4fcfe6eca6f32357f1b4408ff15b10527518eee Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 25 Feb 2022 13:39:53 +0100 Subject: [PATCH 611/773] riscv: Fix kasan pud population In sv48, the kasan inner regions are not aligned on PGDIR_SIZE and then when we populate the kasan linear mapping region, we clear the kasan vmalloc region which is in the same PGD. Fix this by copying the content of the kasan early pud after allocating a new PGD for the first time. Fixes: e8a62cc26ddf ("riscv: Implement sv48 support") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 85e849318389..cd1a145257b7 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -113,8 +113,11 @@ static void __init kasan_populate_pud(pgd_t *pgd, base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); } else { base_pud = (pud_t *)pgd_page_vaddr(*pgd); - if (base_pud == lm_alias(kasan_early_shadow_pud)) + if (base_pud == lm_alias(kasan_early_shadow_pud)) { base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + memcpy(base_pud, (void *)kasan_early_shadow_pud, + sizeof(pud_t) * PTRS_PER_PUD); + } } pudp = base_pud + pud_index(vaddr); From bfa26ba343c727e055223be04e08f2ebdd43c293 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:23:42 -0800 Subject: [PATCH 612/773] HID: add mapping for KEY_DICTATE Numerous keyboards are adding dictate keys which allows for text messages to be dictated by a microphone. This patch adds a new key definition KEY_DICTATE and maps 0x0c/0x0d8 usage code to this new keycode. Additionally hid-debug is adjusted to recognize this new usage code as well. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303021501.1.I5dbf50eb1a7a6734ee727bda4a8573358c6d3ec0@changeid Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-debug.c | 1 + drivers/hid/hid-input.c | 1 + include/uapi/linux/input-event-codes.h | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 26c31d759914..8aa68416b1d7 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -969,6 +969,7 @@ static const char *keys[KEY_MAX + 1] = { [KEY_ASSISTANT] = "Assistant", [KEY_KBD_LAYOUT_NEXT] = "KbdLayoutNext", [KEY_EMOJI_PICKER] = "EmojiPicker", + [KEY_DICTATE] = "Dictate", [KEY_BRIGHTNESS_MIN] = "BrightnessMin", [KEY_BRIGHTNESS_MAX] = "BrightnessMax", [KEY_BRIGHTNESS_AUTO] = "BrightnessAuto", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 112901d2d8d2..ce2b75a67cb8 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -992,6 +992,7 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x0cd: map_key_clear(KEY_PLAYPAUSE); break; case 0x0cf: map_key_clear(KEY_VOICECOMMAND); break; + case 0x0d8: map_key_clear(KEY_DICTATE); break; case 0x0d9: map_key_clear(KEY_EMOJI_PICKER); break; case 0x0e0: map_abs_clear(ABS_VOLUME); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 225ec87d4f22..4db5d41848e4 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -612,6 +612,7 @@ #define KEY_ASSISTANT 0x247 /* AL Context-aware desktop assistant */ #define KEY_KBD_LAYOUT_NEXT 0x248 /* AC Next Keyboard Layout Select */ #define KEY_EMOJI_PICKER 0x249 /* Show/hide emoji picker (HUTRR101) */ +#define KEY_DICTATE 0x24a /* Start or Stop Voice Dictation Session (HUTRR99) */ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ From 327b89f0acc4c20a06ed59e4d9af7f6d804dc2e2 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:26:22 -0800 Subject: [PATCH 613/773] HID: add mapping for KEY_ALL_APPLICATIONS This patch adds a new key definition for KEY_ALL_APPLICATIONS and aliases KEY_DASHBOARD to it. It also maps the 0x0c/0x2a2 usage code to KEY_ALL_APPLICATIONS. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303035618.1.I3a7746ad05d270161a18334ae06e3b6db1a1d339@changeid Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-debug.c | 4 +++- drivers/hid/hid-input.c | 2 ++ include/uapi/linux/input-event-codes.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 8aa68416b1d7..81e7e404a5fc 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -860,7 +860,9 @@ static const char *keys[KEY_MAX + 1] = { [KEY_F22] = "F22", [KEY_F23] = "F23", [KEY_F24] = "F24", [KEY_PLAYCD] = "PlayCD", [KEY_PAUSECD] = "PauseCD", [KEY_PROG3] = "Prog3", - [KEY_PROG4] = "Prog4", [KEY_SUSPEND] = "Suspend", + [KEY_PROG4] = "Prog4", + [KEY_ALL_APPLICATIONS] = "AllApplications", + [KEY_SUSPEND] = "Suspend", [KEY_CLOSE] = "Close", [KEY_PLAY] = "Play", [KEY_FASTFORWARD] = "FastForward", [KEY_BASSBOOST] = "BassBoost", [KEY_PRINT] = "Print", [KEY_HP] = "HP", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index ce2b75a67cb8..56ec27398a00 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1084,6 +1084,8 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x29d: map_key_clear(KEY_KBD_LAYOUT_NEXT); break; + case 0x2a2: map_key_clear(KEY_ALL_APPLICATIONS); break; + case 0x2c7: map_key_clear(KEY_KBDINPUTASSIST_PREV); break; case 0x2c8: map_key_clear(KEY_KBDINPUTASSIST_NEXT); break; case 0x2c9: map_key_clear(KEY_KBDINPUTASSIST_PREVGROUP); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4db5d41848e4..7989d9483ea7 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -278,7 +278,8 @@ #define KEY_PAUSECD 201 #define KEY_PROG3 202 #define KEY_PROG4 203 -#define KEY_DASHBOARD 204 /* AL Dashboard */ +#define KEY_ALL_APPLICATIONS 204 /* AC Desktop Show All Applications */ +#define KEY_DASHBOARD KEY_ALL_APPLICATIONS #define KEY_SUSPEND 205 #define KEY_CLOSE 206 /* AC Close */ #define KEY_PLAY 207 From 74583f1b92cb3bbba1a3741cea237545c56f506c Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 1 Mar 2022 00:44:18 +0000 Subject: [PATCH 614/773] riscv: dts: k210: fix broken IRQs on hart1 Commit 67d96729a9e7 ("riscv: Update Canaan Kendryte K210 device tree") incorrectly removed two entries from the PLIC interrupt-controller node's interrupts-extended property. The PLIC driver cannot know the mapping between hart contexts and hart ids, so this information has to be provided by device tree, as specified by the PLIC device tree binding. The PLIC driver uses the interrupts-extended property, and initializes the hart context registers in the exact same order as provided by the interrupts-extended property. In other words, if we don't specify the S-mode interrupts, the PLIC driver will simply initialize the hart0 S-mode hart context with the hart1 M-mode configuration. It is therefore essential to specify the S-mode IRQs even though the system itself will only ever be running in M-mode. Re-add the S-mode interrupts, so that we get working IRQs on hart1 again. Cc: Fixes: 67d96729a9e7 ("riscv: Update Canaan Kendryte K210 device tree") Signed-off-by: Niklas Cassel Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/canaan/k210.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index 56f57118c633..44d338514761 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -113,7 +113,8 @@ compatible = "canaan,k210-plic", "sifive,plic-1.0.0"; reg = <0xC000000 0x4000000>; interrupt-controller; - interrupts-extended = <&cpu0_intc 11>, <&cpu1_intc 11>; + interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 9>, + <&cpu1_intc 11>, <&cpu1_intc 9>; riscv,ndev = <65>; }; From 0bf476fc3624e3a72af4ba7340d430a91c18cd67 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Thu, 3 Mar 2022 12:10:27 -0600 Subject: [PATCH 615/773] net: macb: Fix lost RX packet wakeup race in NAPI receive There is an oddity in the way the RSR register flags propagate to the ISR register (and the actual interrupt output) on this hardware: it appears that RSR register bits only result in ISR being asserted if the interrupt was actually enabled at the time, so enabling interrupts with RSR bits already set doesn't trigger an interrupt to be raised. There was already a partial fix for this race in the macb_poll function where it checked for RSR bits being set and re-triggered NAPI receive. However, there was a still a race window between checking RSR and actually enabling interrupts, where a lost wakeup could happen. It's necessary to check again after enabling interrupts to see if RSR was set just prior to the interrupt being enabled, and re-trigger receive in that case. This issue was noticed in a point-to-point UDP request-response protocol which periodically saw timeouts or abnormally high response times due to received packets not being processed in a timely fashion. In many applications, more packets arriving, including TCP retransmissions, would cause the original packet to be processed, thus masking the issue. Fixes: 02f7a34f34e3 ("net: macb: Re-enable RX interrupt only when RX is done") Cc: stable@vger.kernel.org Co-developed-by: Scott McNutt Signed-off-by: Scott McNutt Signed-off-by: Robert Hancock Tested-by: Claudiu Beznea Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 25 +++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 98498a76ae16..d13f06cf0308 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1573,7 +1573,14 @@ static int macb_poll(struct napi_struct *napi, int budget) if (work_done < budget) { napi_complete_done(napi, work_done); - /* Packets received while interrupts were disabled */ + /* RSR bits only seem to propagate to raise interrupts when + * interrupts are enabled at the time, so if bits are already + * set due to packets received while interrupts were disabled, + * they will not cause another interrupt to be generated when + * interrupts are re-enabled. + * Check for this case here. This has been seen to happen + * around 30% of the time under heavy network load. + */ status = macb_readl(bp, RSR); if (status) { if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) @@ -1581,6 +1588,22 @@ static int macb_poll(struct napi_struct *napi, int budget) napi_reschedule(napi); } else { queue_writel(queue, IER, bp->rx_intr_mask); + + /* In rare cases, packets could have been received in + * the window between the check above and re-enabling + * interrupts. Therefore, a double-check is required + * to avoid losing a wakeup. This can potentially race + * with the interrupt handler doing the same actions + * if an interrupt is raised just after enabling them, + * but this should be harmless. + */ + status = macb_readl(bp, RSR); + if (unlikely(status)) { + queue_writel(queue, IDR, bp->rx_intr_mask); + if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) + queue_writel(queue, ISR, MACB_BIT(RCOMP)); + napi_schedule(napi); + } } } From be4977b847f5d5cedb64d50eaaf2218c3a55a3a3 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Fri, 4 Mar 2022 03:25:18 +0000 Subject: [PATCH 616/773] tipc: fix kernel panic when enabling bearer When enabling a bearer on a node, a kernel panic is observed: [ 4.498085] RIP: 0010:tipc_mon_prep+0x4e/0x130 [tipc] ... [ 4.520030] Call Trace: [ 4.520689] [ 4.521236] tipc_link_build_proto_msg+0x375/0x750 [tipc] [ 4.522654] tipc_link_build_state_msg+0x48/0xc0 [tipc] [ 4.524034] __tipc_node_link_up+0xd7/0x290 [tipc] [ 4.525292] tipc_rcv+0x5da/0x730 [tipc] [ 4.526346] ? __netif_receive_skb_core+0xb7/0xfc0 [ 4.527601] tipc_l2_rcv_msg+0x5e/0x90 [tipc] [ 4.528737] __netif_receive_skb_list_core+0x20b/0x260 [ 4.530068] netif_receive_skb_list_internal+0x1bf/0x2e0 [ 4.531450] ? dev_gro_receive+0x4c2/0x680 [ 4.532512] napi_complete_done+0x6f/0x180 [ 4.533570] virtnet_poll+0x29c/0x42e [virtio_net] ... The node in question is receiving activate messages in another thread after changing bearer status to allow message sending/ receiving in current thread: thread 1 | thread 2 -------- | -------- | tipc_enable_bearer() | test_and_set_bit_lock() | tipc_bearer_xmit_skb() | | tipc_l2_rcv_msg() | tipc_rcv() | __tipc_node_link_up() | tipc_link_build_state_msg() | tipc_link_build_proto_msg() | tipc_mon_prep() | { | ... | // null-pointer dereference | u16 gen = mon->dom_gen; | ... | } // Not being executed yet | tipc_mon_create() | { | ... | // allocate | mon = kzalloc(); | ... | } | Monitoring pointer in thread 2 is dereferenced before monitoring data is allocated in thread 1. This causes kernel panic. This commit fixes it by allocating the monitoring data before enabling the bearer to receive messages. Fixes: 35c55c9877f8 ("tipc: add neighbor monitoring framework") Reported-by: Shuang Li Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/bearer.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 473a790f5894..a2f9c9640716 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -352,16 +352,18 @@ static int tipc_enable_bearer(struct net *net, const char *name, goto rejected; } + /* Create monitoring data before accepting activate messages */ + if (tipc_mon_create(net, bearer_id)) { + bearer_disable(net, b); + kfree_skb(skb); + return -ENOMEM; + } + test_and_set_bit_lock(0, &b->up); rcu_assign_pointer(tn->bearer_list[bearer_id], b); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); - if (tipc_mon_create(net, bearer_id)) { - bearer_disable(net, b); - return -ENOMEM; - } - pr_info("Enabled bearer <%s>, priority %u\n", name, prio); return res; From 838d6d3461db0fdbf33fc5f8a69c27b50b4a46da Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:56:15 -0500 Subject: [PATCH 617/773] virtio: unexport virtio_finalize_features virtio_finalize_features is only used internally within virtio. No reason to export it. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Acked-by: Jason Wang --- drivers/virtio/virtio.c | 3 +-- include/linux/virtio.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 00ac9db792a4..d891b0a354b0 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,7 +166,7 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -int virtio_finalize_features(struct virtio_device *dev) +static int virtio_finalize_features(struct virtio_device *dev) { int ret = dev->config->finalize_features(dev); unsigned status; @@ -202,7 +202,6 @@ int virtio_finalize_features(struct virtio_device *dev) } return 0; } -EXPORT_SYMBOL_GPL(virtio_finalize_features); void virtio_reset_device(struct virtio_device *dev) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 72292a62cd90..5464f398912a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -133,7 +133,6 @@ bool is_virtio_device(struct device *dev); void virtio_break_device(struct virtio_device *dev); void virtio_config_changed(struct virtio_device *dev); -int virtio_finalize_features(struct virtio_device *dev); #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); From 4fa59ede95195f267101a1b8916992cf3f245cdb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:58:41 -0500 Subject: [PATCH 618/773] virtio: acknowledge all features before access The feature negotiation was designed in a way that makes it possible for devices to know which config fields will be accessed by drivers. This is broken since commit 404123c2db79 ("virtio: allow drivers to validate features") with fallout in at least block and net. We have a partial work-around in commit 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") which at least lets devices find out which format should config space have, but this is a partial fix: guests should not access config space without acknowledging features since otherwise we'll never be able to change the config space format. To fix, split finalize_features from virtio_finalize_features and call finalize_features with all feature bits before validation, and then - if validation changed any bits - once again after. Since virtio_finalize_features no longer writes out features rename it to virtio_features_ok - since that is what it does: checks that features are ok with the device. As a side effect, this also reduces the amount of hypervisor accesses - we now only acknowledge features once unless we are clearing any features when validating (which is uncommon). IRC I think that this was more or less always the intent in the spec but unfortunately the way the spec is worded does not say this explicitly, I plan to address this at the spec level, too. Acked-by: Jason Wang Cc: stable@vger.kernel.org Fixes: 404123c2db79 ("virtio: allow drivers to validate features") Fixes: 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") Cc: "Halil Pasic" Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 39 ++++++++++++++++++++--------------- include/linux/virtio_config.h | 3 ++- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index d891b0a354b0..d6396be0ea83 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,14 +166,13 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -static int virtio_finalize_features(struct virtio_device *dev) +/* Do some validation, then set FEATURES_OK */ +static int virtio_features_ok(struct virtio_device *dev) { - int ret = dev->config->finalize_features(dev); unsigned status; + int ret; might_sleep(); - if (ret) - return ret; ret = arch_has_restricted_virtio_memory_access(); if (ret) { @@ -244,17 +243,6 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } - /* - * Some devices detect legacy solely via F_VERSION_1. Write - * F_VERSION_1 to force LE config space accesses before FEATURES_OK for - * these when needed. - */ - if (drv->validate && !virtio_legacy_is_little_endian() - && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { - dev->features = BIT_ULL(VIRTIO_F_VERSION_1); - dev->config->finalize_features(dev); - } - if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else @@ -265,13 +253,26 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); + err = dev->config->finalize_features(dev); + if (err) + goto err; + if (drv->validate) { + u64 features = dev->features; + err = drv->validate(dev); if (err) goto err; + + /* Did validation change any features? Then write them again. */ + if (features != dev->features) { + err = dev->config->finalize_features(dev); + if (err) + goto err; + } } - err = virtio_finalize_features(dev); + err = virtio_features_ok(dev); if (err) goto err; @@ -495,7 +496,11 @@ int virtio_device_restore(struct virtio_device *dev) /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); - ret = virtio_finalize_features(dev); + ret = dev->config->finalize_features(dev); + if (ret) + goto err; + + ret = virtio_features_ok(dev); if (ret) goto err; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 4d107ad31149..dafdc7f48c01 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -64,8 +64,9 @@ struct virtio_shm_region { * Returns the first 64 feature bits (all we currently need). * @finalize_features: confirm what device features we'll be using. * vdev: the virtio_device - * This gives the final feature bits for the device: it can change + * This sends the driver feature bits to the device: it can change * the dev->feature bits if it wants. + * Note: despite the name this can be called any number of times. * Returns 0 on success or error status * @bus_name: return the bus name associated with the device (optional) * vdev: the virtio_device From c46eccdaadabb7822080a04e633f81b2ad37f358 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 15:54:01 -0500 Subject: [PATCH 619/773] virtio: document virtio_reset_device Looks like most callers get driver/device removal wrong. Document what's expected of callers. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index d6396be0ea83..22f15f444f75 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -202,6 +202,22 @@ static int virtio_features_ok(struct virtio_device *dev) return 0; } +/** + * virtio_reset_device - quiesce device for removal + * @dev: the device to reset + * + * Prevents device from sending interrupts and accessing memory. + * + * Generally used for cleanup during driver / device removal. + * + * Once this has been invoked, caller must ensure that + * virtqueue_notify / virtqueue_kick are not in progress. + * + * Note: this guarantees that vq callbacks are not in progress, however caller + * is responsible for preventing access from other contexts, such as a system + * call/workqueue/bh. Invoking virtio_break_device then flushing any such + * contexts is one way to handle that. + * */ void virtio_reset_device(struct virtio_device *dev) { dev->config->reset(dev); From 0e7174b9d5877130fec41fb4a16e0c2ee4958d44 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Oct 2021 03:04:10 -0400 Subject: [PATCH 620/773] virtio_console: break out of buf poll on remove A common pattern for device reset is currently: vdev->config->reset(vdev); .. cleanup .. reset prevents new interrupts from arriving and waits for interrupt handlers to finish. However if - as is common - the handler queues a work request which is flushed during the cleanup stage, we have code adding buffers / trying to get buffers while device is reset. Not good. This was reproduced by running modprobe virtio_console modprobe -r virtio_console in a loop. Fix this up by calling virtio_break_device + flush before reset. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1786239 Signed-off-by: Michael S. Tsirkin --- drivers/char/virtio_console.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 2359889a35a0..e3c430539a17 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1957,6 +1957,13 @@ static void virtcons_remove(struct virtio_device *vdev) list_del(&portdev->list); spin_unlock_irq(&pdrvdata_lock); + /* Device is going away, exit any polling for buffers */ + virtio_break_device(vdev); + if (use_multiport(portdev)) + flush_work(&portdev->control_work); + else + flush_work(&portdev->config_work); + /* Disable interrupts for vqs */ virtio_reset_device(vdev); /* Finish up work that's lined up */ From ca93e44bfb5fd7996b76f0f544999171f647f93b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Mar 2022 11:48:39 +0000 Subject: [PATCH 621/773] btrfs: fallback to blocking mode when doing async dio over multiple extents Some users recently reported that MariaDB was getting a read corruption when using io_uring on top of btrfs. This started to happen in 5.16, after commit 51bd9563b6783d ("btrfs: fix deadlock due to page faults during direct IO reads and writes"). That changed btrfs to use the new iomap flag IOMAP_DIO_PARTIAL and to disable page faults before calling iomap_dio_rw(). This was necessary to fix deadlocks when the iovector corresponds to a memory mapped file region. That type of scenario is exercised by test case generic/647 from fstests. For this MariaDB scenario, we attempt to read 16K from file offset X using IOCB_NOWAIT and io_uring. In that range we have 4 extents, each with a size of 4K, and what happens is the following: 1) btrfs_direct_read() disables page faults and calls iomap_dio_rw(); 2) iomap creates a struct iomap_dio object, its reference count is initialized to 1 and its ->size field is initialized to 0; 3) iomap calls btrfs_dio_iomap_begin() with file offset X, which finds the first 4K extent, and setups an iomap for this extent consisting of a single page; 4) At iomap_dio_bio_iter(), we are able to access the first page of the buffer (struct iov_iter) with bio_iov_iter_get_pages() without triggering a page fault; 5) iomap submits a bio for this 4K extent (iomap_dio_submit_bio() -> btrfs_submit_direct()) and increments the refcount on the struct iomap_dio object to 2; The ->size field of the struct iomap_dio object is incremented to 4K; 6) iomap calls btrfs_iomap_begin() again, this time with a file offset of X + 4K. There we setup an iomap for the next extent that also has a size of 4K; 7) Then at iomap_dio_bio_iter() we call bio_iov_iter_get_pages(), which tries to access the next page (2nd page) of the buffer. This triggers a page fault and returns -EFAULT; 8) At __iomap_dio_rw() we see the -EFAULT, but we reset the error to 0 because we passed the flag IOMAP_DIO_PARTIAL to iomap and the struct iomap_dio object has a ->size value of 4K (we submitted a bio for an extent already). The 'wait_for_completion' variable is not set to true, because our iocb has IOCB_NOWAIT set; 9) At the bottom of __iomap_dio_rw(), we decrement the reference count of the struct iomap_dio object from 2 to 1. Because we were not the only ones holding a reference on it and 'wait_for_completion' is set to false, -EIOCBQUEUED is returned to btrfs_direct_read(), which just returns it up the callchain, up to io_uring; 10) The bio submitted for the first extent (step 5) completes and its bio endio function, iomap_dio_bio_end_io(), decrements the last reference on the struct iomap_dio object, resulting in calling iomap_dio_complete_work() -> iomap_dio_complete(). 11) At iomap_dio_complete() we adjust the iocb->ki_pos from X to X + 4K and return 4K (the amount of io done) to iomap_dio_complete_work(); 12) iomap_dio_complete_work() calls the iocb completion callback, iocb->ki_complete() with a second argument value of 4K (total io done) and the iocb with the adjust ki_pos of X + 4K. This results in completing the read request for io_uring, leaving it with a result of 4K bytes read, and only the first page of the buffer filled in, while the remaining 3 pages, corresponding to the other 3 extents, were not filled; 13) For the application, the result is unexpected because if we ask to read N bytes, it expects to get N bytes read as long as those N bytes don't cross the EOF (i_size). MariaDB reports this as an error, as it's not expecting a short read, since it knows it's asking for read operations fully within the i_size boundary. This is typical in many applications, but it may also be questionable if they should react to such short reads by issuing more read calls to get the remaining data. Nevertheless, the short read happened due to a change in btrfs regarding how it deals with page faults while in the middle of a read operation, and there's no reason why btrfs can't have the previous behaviour of returning the whole data that was requested by the application. The problem can also be triggered with the following simple program: /* Get O_DIRECT */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include int main(int argc, char *argv[]) { char *foo_path; struct io_uring ring; struct io_uring_sqe *sqe; struct io_uring_cqe *cqe; struct iovec iovec; int fd; long pagesize; void *write_buf; void *read_buf; ssize_t ret; int i; if (argc != 2) { fprintf(stderr, "Use: %s \n", argv[0]); return 1; } foo_path = malloc(strlen(argv[1]) + 5); if (!foo_path) { fprintf(stderr, "Failed to allocate memory for file path\n"); return 1; } strcpy(foo_path, argv[1]); strcat(foo_path, "/foo"); /* * Create file foo with 2 extents, each with a size matching * the page size. Then allocate a buffer to read both extents * with io_uring, using O_DIRECT and IOCB_NOWAIT. Before doing * the read with io_uring, access the first page of the buffer * to fault it in, so that during the read we only trigger a * page fault when accessing the second page of the buffer. */ fd = open(foo_path, O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, 0666); if (fd == -1) { fprintf(stderr, "Failed to create file 'foo': %s (errno %d)", strerror(errno), errno); return 1; } pagesize = sysconf(_SC_PAGE_SIZE); ret = posix_memalign(&write_buf, pagesize, 2 * pagesize); if (ret) { fprintf(stderr, "Failed to allocate write buffer\n"); return 1; } memset(write_buf, 0xab, pagesize); memset(write_buf + pagesize, 0xcd, pagesize); /* Create 2 extents, each with a size matching page size. */ for (i = 0; i < 2; i++) { ret = pwrite(fd, write_buf + i * pagesize, pagesize, i * pagesize); if (ret != pagesize) { fprintf(stderr, "Failed to write to file, ret = %ld errno %d (%s)\n", ret, errno, strerror(errno)); return 1; } ret = fsync(fd); if (ret != 0) { fprintf(stderr, "Failed to fsync file\n"); return 1; } } close(fd); fd = open(foo_path, O_RDONLY | O_DIRECT); if (fd == -1) { fprintf(stderr, "Failed to open file 'foo': %s (errno %d)", strerror(errno), errno); return 1; } ret = posix_memalign(&read_buf, pagesize, 2 * pagesize); if (ret) { fprintf(stderr, "Failed to allocate read buffer\n"); return 1; } /* * Fault in only the first page of the read buffer. * We want to trigger a page fault for the 2nd page of the * read buffer during the read operation with io_uring * (O_DIRECT and IOCB_NOWAIT). */ memset(read_buf, 0, 1); ret = io_uring_queue_init(1, &ring, 0); if (ret != 0) { fprintf(stderr, "Failed to create io_uring queue\n"); return 1; } sqe = io_uring_get_sqe(&ring); if (!sqe) { fprintf(stderr, "Failed to get io_uring sqe\n"); return 1; } iovec.iov_base = read_buf; iovec.iov_len = 2 * pagesize; io_uring_prep_readv(sqe, fd, &iovec, 1, 0); ret = io_uring_submit_and_wait(&ring, 1); if (ret != 1) { fprintf(stderr, "Failed at io_uring_submit_and_wait()\n"); return 1; } ret = io_uring_wait_cqe(&ring, &cqe); if (ret < 0) { fprintf(stderr, "Failed at io_uring_wait_cqe()\n"); return 1; } printf("io_uring read result for file foo:\n\n"); printf(" cqe->res == %d (expected %d)\n", cqe->res, 2 * pagesize); printf(" memcmp(read_buf, write_buf) == %d (expected 0)\n", memcmp(read_buf, write_buf, 2 * pagesize)); io_uring_cqe_seen(&ring, cqe); io_uring_queue_exit(&ring); return 0; } When running it on an unpatched kernel: $ gcc io_uring_test.c -luring $ mkfs.btrfs -f /dev/sda $ mount /dev/sda /mnt/sda $ ./a.out /mnt/sda io_uring read result for file foo: cqe->res == 4096 (expected 8192) memcmp(read_buf, write_buf) == -205 (expected 0) After this patch, the read always returns 8192 bytes, with the buffer filled with the correct data. Although that reproducer always triggers the bug in my test vms, it's possible that it will not be so reliable on other environments, as that can happen if the bio for the first extent completes and decrements the reference on the struct iomap_dio object before we do the atomic_dec_and_test() on the reference at __iomap_dio_rw(). Fix this in btrfs by having btrfs_dio_iomap_begin() return -EAGAIN whenever we try to satisfy a non blocking IO request (IOMAP_NOWAIT flag set) over a range that spans multiple extents (or a mix of extents and holes). This avoids returning success to the caller when we only did partial IO, which is not optimal for writes and for reads it's actually incorrect, as the caller doesn't expect to get less bytes read than it has requested (unless EOF is crossed), as previously mentioned. This is also the type of behaviour that xfs follows (xfs_direct_write_iomap_begin()), even though it doesn't use IOMAP_DIO_PARTIAL. A test case for fstests will follow soon. Link: https://lore.kernel.org/linux-btrfs/CABVffEM0eEWho+206m470rtM0d9J8ue85TtR-A_oVTuGLWFicA@mail.gmail.com/ Link: https://lore.kernel.org/linux-btrfs/CAHF2GV6U32gmqSjLe=XKgfcZAmLCiH26cJ2OnHGp5x=VAH4OHQ@mail.gmail.com/ CC: stable@vger.kernel.org # 5.16+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e530f76e3c..5bbea5ec31fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, len); From e0077cc13b831f8fad5557442f73bf7728683713 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:27:59 -0500 Subject: [PATCH 622/773] vdpa: factor out vdpa_set_features_unlocked for vdpa internal use No functional change introduced. vdpa bus driver such as virtio_vdpa or vhost_vdpa is not supposed to take care of the locking for core by its own. The locked API vdpa_set_features should suffice the bus driver's need. Signed-off-by: Si-Wei Liu Reviewed-by: Eli Cohen Link: https://lore.kernel.org/r/1642206481-30721-2-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa.c | 2 +- drivers/vhost/vdpa.c | 2 +- drivers/virtio/virtio_vdpa.c | 2 +- include/linux/vdpa.h | 18 ++++++++++++------ 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 9846c9de4bfa..1ea525433a5c 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -393,7 +393,7 @@ static void vdpa_get_config_unlocked(struct vdpa_device *vdev, * If it does happen we assume a legacy guest. */ if (!vdev->features_valid) - vdpa_set_features(vdev, 0, true); + vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 851539807bc9..ec5249e8c32d 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -286,7 +286,7 @@ static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; - if (vdpa_set_features(vdpa, features, false)) + if (vdpa_set_features(vdpa, features)) return -EINVAL; return 0; diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 7767a7f0119b..76504559bc25 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -317,7 +317,7 @@ static int virtio_vdpa_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - return vdpa_set_features(vdpa, vdev->features, false); + return vdpa_set_features(vdpa, vdev->features); } static const char *virtio_vdpa_bus_name(struct virtio_device *vdev) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2de442ececae..721089bb4c84 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -401,18 +401,24 @@ static inline int vdpa_reset(struct vdpa_device *vdev) return ret; } -static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features, bool locked) +static inline int vdpa_set_features_unlocked(struct vdpa_device *vdev, u64 features) { const struct vdpa_config_ops *ops = vdev->config; int ret; - if (!locked) - mutex_lock(&vdev->cf_mutex); - vdev->features_valid = true; ret = ops->set_driver_features(vdev, features); - if (!locked) - mutex_unlock(&vdev->cf_mutex); + + return ret; +} + +static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) +{ + int ret; + + mutex_lock(&vdev->cf_mutex); + ret = vdpa_set_features_unlocked(vdev, features); + mutex_unlock(&vdev->cf_mutex); return ret; } From 30c22f3816ffef8aa21a000e93c4ee1402a6ea65 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:28:00 -0500 Subject: [PATCH 623/773] vdpa/mlx5: should verify CTRL_VQ feature exists for MQ Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ". There's assumption in the mlx5_vdpa multiqueue code that MQ must come together with CTRL_VQ. However, there's nowhere in the upper layer to guarantee this assumption would hold. Were there an untrusted driver sending down MQ without CTRL_VQ, it would compromise various spots for e.g. is_index_valid() and is_ctrl_vq_idx(). Although this doesn't end up with immediate panic or security loophole as of today's code, the chance for this to be taken advantage of due to future code change is not zero. Harden the crispy assumption by failing the set_driver_features() call when seeing (MQ && !CTRL_VQ). For that end, verify_min_features() is renamed to verify_driver_features() to reflect the fact that it now does more than just validate the minimum features. verify_driver_features() is now used to accommodate various checks against the driver features for set_driver_features(). Signed-off-by: Si-Wei Liu Link: https://lore.kernel.org/r/1642206481-30721-3-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index f648f1c54a0f..1ffbd20fcf21 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1897,11 +1897,25 @@ static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev) return ndev->mvdev.mlx_features; } -static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features) +static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features) { + /* Minimum features to expect */ if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) return -EOPNOTSUPP; + /* Double check features combination sent down by the driver. + * Fail invalid features due to absence of the depended feature. + * + * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit + * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ". + * By failing the invalid features sent down by untrusted drivers, + * we're assured the assumption made upon is_index_valid() and + * is_ctrl_vq_idx() will not be compromised. + */ + if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) == + BIT_ULL(VIRTIO_NET_F_MQ)) + return -EINVAL; + return 0; } @@ -1977,7 +1991,7 @@ static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) print_features(mvdev, features, true); - err = verify_min_features(mvdev, features); + err = verify_driver_features(mvdev, features); if (err) return err; From ed0f849fc3a63ed2ddf5e72cdb1de3bdbbb0f8eb Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Fri, 14 Jan 2022 19:28:01 -0500 Subject: [PATCH 624/773] vdpa/mlx5: add validation for VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET command When control vq receives a VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET command request from the driver, presently there is no validation against the number of queue pairs to configure, or even if multiqueue had been negotiated or not is unverified. This may lead to kernel panic due to uninitialized resource for the queues were there any bogus request sent down by untrusted driver. Tie up the loose ends there. Fixes: 52893733f2c5 ("vdpa/mlx5: Add multiqueue support") Signed-off-by: Si-Wei Liu Link: https://lore.kernel.org/r/1642206481-30721-4-git-send-email-si-wei.liu@oracle.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 1ffbd20fcf21..d0f91078600e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1563,11 +1563,27 @@ static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd) switch (cmd) { case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET: + /* This mq feature check aligns with pre-existing userspace + * implementation. + * + * Without it, an untrusted driver could fake a multiqueue config + * request down to a non-mq device that may cause kernel to + * panic due to uninitialized resources for extra vqs. Even with + * a well behaving guest driver, it is not expected to allow + * changing the number of vqs on a non-mq device. + */ + if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) + break; + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq)); if (read != sizeof(mq)) break; newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs); + if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || + newqps > mlx5_vdpa_max_qps(mvdev->max_vqs)) + break; + if (ndev->cur_num_vqs == 2 * newqps) { status = VIRTIO_NET_OK; break; From b9d102dafec6af1c07b610faf0a6d4e8aee14ae0 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 21 Jan 2022 16:39:39 +0800 Subject: [PATCH 625/773] vduse: Fix returning wrong type in vduse_domain_alloc_iova() This fixes the following smatch warnings: drivers/vdpa/vdpa_user/iova_domain.c:305 vduse_domain_alloc_iova() warn: should 'iova_pfn << shift' be a 64 bit type? Fixes: 8c773d53fb7b ("vduse: Implement an MMU-based software IOTLB") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20220121083940.102-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 2b1143f11d8f..0a4d93edc4c0 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -294,7 +294,7 @@ vduse_domain_alloc_iova(struct iova_domain *iovad, iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true); - return iova_pfn << shift; + return (dma_addr_t)iova_pfn << shift; } static void vduse_domain_free_iova(struct iova_domain *iovad, From 0708a0afe291bdfe1386d74d5ec1f0c27e8b9168 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2022 15:26:32 +0100 Subject: [PATCH 626/773] mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller was recently triggering an oversized kvmalloc() warning via xdp_umem_create(). The triggered warning was added back in 7661809d493b ("mm: don't allow oversized kvmalloc() calls"). The rationale for the warning for huge kvmalloc sizes was as a reaction to a security bug where the size was more than UINT_MAX but not everything was prepared to handle unsigned long sizes. Anyway, the AF_XDP related call trace from this syzkaller report was: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] xdp_umem_pin_pages net/xdp/xdp_umem.c:102 [inline] xdp_umem_reg net/xdp/xdp_umem.c:219 [inline] xdp_umem_create+0x6a5/0xf00 net/xdp/xdp_umem.c:252 xsk_setsockopt+0x604/0x790 net/xdp/xsk.c:1068 __sys_setsockopt+0x1fd/0x4e0 net/socket.c:2176 __do_sys_setsockopt net/socket.c:2187 [inline] __se_sys_setsockopt net/socket.c:2184 [inline] __x64_sys_setsockopt+0xb5/0x150 net/socket.c:2184 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Björn mentioned that requests for >2GB allocation can still be valid: The structure that is being allocated is the page-pinning accounting. AF_XDP has an internal limit of U32_MAX pages, which is *a lot*, but still fewer than what memcg allows (PAGE_COUNTER_MAX is a LONG_MAX/ PAGE_SIZE on 64 bit systems). [...] I could just change from U32_MAX to INT_MAX, but as I stated earlier that has a hacky feeling to it. [...] From my perspective, the code isn't broken, with the memcg limits in consideration. [...] Linus says: [...] Pretty much every time this has come up, the kernel warning has shown that yes, the code was broken and there really wasn't a reason for doing allocations that big. Of course, some people would be perfectly fine with the allocation failing, they just don't want the warning. I didn't want __GFP_NOWARN to shut it up originally because I wanted people to see all those cases, but these days I think we can just say "yeah, people can shut it up explicitly by saying 'go ahead and fail this allocation, don't warn about it'". So enough time has passed that by now I'd certainly be ok with [it]. Thus allow call-sites to silence such userspace triggered splats if the allocation requests have __GFP_NOWARN. For xdp_umem_pin_pages()'s call to kvcalloc() this is already the case, so nothing else needed there. Fixes: 7661809d493b ("mm: don't allow oversized kvmalloc() calls") Reported-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Signed-off-by: Daniel Borkmann Tested-by: syzbot+11421fbbff99b989670e@syzkaller.appspotmail.com Cc: Björn Töpel Cc: Magnus Karlsson Cc: Willy Tarreau Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Jakub Kicinski Cc: David S. Miller Link: https://lore.kernel.org/bpf/CAJ+HfNhyfsT5cS_U9EC213ducHs9k9zNxX9+abqC0kTrPbQ0gg@mail.gmail.com Link: https://lore.kernel.org/bpf/20211201202905.b9892171e3f5b9a60f9da251@linux-foundation.org Reviewed-by: Leon Romanovsky Ackd-by: Michal Hocko Signed-off-by: Linus Torvalds --- mm/util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 7e43369064c8..d3102081add0 100644 --- a/mm/util.c +++ b/mm/util.c @@ -587,8 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) return ret; /* Don't even allow crazy sizes */ - if (WARN_ON_ONCE(size > INT_MAX)) + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; + } return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); From 1d02b444b8d1345ea4708db3bab4db89a7784b55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 Mar 2022 19:17:44 -0800 Subject: [PATCH 627/773] tracing: Fix return value of __setup handlers __setup() handlers should generally return 1 to indicate that the boot options have been handled. Using invalid option values causes the entire kernel boot option string to be reported as Unknown and added to init's environment strings, polluting it. Unknown kernel command line parameters "BOOT_IMAGE=/boot/bzImage-517rc6 kprobe_event=p,syscall_any,$arg1 trace_options=quiet trace_clock=jiffies", will be passed to user space. Run /sbin/init as init process with arguments: /sbin/init with environment: HOME=/ TERM=linux BOOT_IMAGE=/boot/bzImage-517rc6 kprobe_event=p,syscall_any,$arg1 trace_options=quiet trace_clock=jiffies Return 1 from the __setup() handlers so that init's environment is not polluted with kernel boot options. Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Link: https://lkml.kernel.org/r/20220303031744.32356-1-rdunlap@infradead.org Cc: stable@vger.kernel.org Fixes: 7bcfaf54f591 ("tracing: Add trace_options kernel command line parameter") Fixes: e1e232ca6b8f ("tracing: Add trace_clock= kernel parameter") Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3050892d1812..eb44418574f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,7 +235,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - return 0; + return 1; } __setup("trace_options=", set_trace_boot_options); @@ -246,7 +246,7 @@ static int __init set_trace_boot_clock(char *str) { strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; - return 0; + return 1; } __setup("trace_clock=", set_trace_boot_clock); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 508f14af4f2c..b62fd785b599 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -32,7 +32,7 @@ static int __init set_kprobe_boot_events(char *str) strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); - return 0; + return 1; } __setup("kprobe_event=", set_kprobe_boot_events); From a502a8f04097e038c3daa16c5202a9538116d563 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 3 Mar 2022 08:54:15 +0100 Subject: [PATCH 628/773] net: phy: meson-gxl: fix interrupt handling in forced mode This PHY doesn't support a link-up interrupt source. If aneg is enabled we use the "aneg complete" interrupt for this purpose, but if aneg is disabled link-up isn't signaled currently. According to a vendor driver there's an additional "energy detect" interrupt source that can be used to signal link-up if aneg is disabled. We can safely ignore this interrupt source if aneg is enabled. This patch was tested on a TX3 Mini TV box with S905W (even though boot message says it's a S905D). This issue has been existing longer, but due to changes in phylib and the driver the patch applies only from the commit marked as fixed. Fixes: 84c8f773d2dc ("net: phy: meson-gxl: remove the use of .ack_callback()") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/04cac530-ea1b-850e-6cfa-144a55c4d75d@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/meson-gxl.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index 7e7904fee1d9..c49062ad72c6 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -30,8 +30,12 @@ #define INTSRC_LINK_DOWN BIT(4) #define INTSRC_REMOTE_FAULT BIT(5) #define INTSRC_ANEG_COMPLETE BIT(6) +#define INTSRC_ENERGY_DETECT BIT(7) #define INTSRC_MASK 30 +#define INT_SOURCES (INTSRC_LINK_DOWN | INTSRC_ANEG_COMPLETE | \ + INTSRC_ENERGY_DETECT) + #define BANK_ANALOG_DSP 0 #define BANK_WOL 1 #define BANK_BIST 3 @@ -200,7 +204,6 @@ static int meson_gxl_ack_interrupt(struct phy_device *phydev) static int meson_gxl_config_intr(struct phy_device *phydev) { - u16 val; int ret; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { @@ -209,16 +212,9 @@ static int meson_gxl_config_intr(struct phy_device *phydev) if (ret) return ret; - val = INTSRC_ANEG_PR - | INTSRC_PARALLEL_FAULT - | INTSRC_ANEG_LP_ACK - | INTSRC_LINK_DOWN - | INTSRC_REMOTE_FAULT - | INTSRC_ANEG_COMPLETE; - ret = phy_write(phydev, INTSRC_MASK, val); + ret = phy_write(phydev, INTSRC_MASK, INT_SOURCES); } else { - val = 0; - ret = phy_write(phydev, INTSRC_MASK, val); + ret = phy_write(phydev, INTSRC_MASK, 0); /* Ack any pending IRQ */ ret = meson_gxl_ack_interrupt(phydev); @@ -237,9 +233,16 @@ static irqreturn_t meson_gxl_handle_interrupt(struct phy_device *phydev) return IRQ_NONE; } + irq_status &= INT_SOURCES; + if (irq_status == 0) return IRQ_NONE; + /* Aneg-complete interrupt is used for link-up detection */ + if (phydev->autoneg == AUTONEG_ENABLE && + irq_status == INTSRC_ENERGY_DETECT) + return IRQ_HANDLED; + phy_trigger_machine(phydev); return IRQ_HANDLED; From eafd987d4a82c7bb5aa12f0e3b4f8f3dea93e678 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 25 Feb 2022 14:31:49 -0800 Subject: [PATCH 629/773] x86/speculation: Warn about Spectre v2 LFENCE mitigation With: f8a66d608a3e ("x86,bugs: Unconditionally allow spectre_v2=retpoline,amd") it became possible to enable the LFENCE "retpoline" on Intel. However, Intel doesn't recommend it, as it has some weaknesses compared to retpoline. Now AMD doesn't recommend it either. It can still be left available as a cmdline option. It's faster than retpoline but is weaker in certain scenarios -- particularly SMT, but even non-SMT may be vulnerable in some cases. So just unconditionally warn if the user requests it on the cmdline. [ bp: Massage commit message. ] Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a36bfe2c2480..cfd116423908 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -651,6 +651,7 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #ifdef CONFIG_BPF_SYSCALL @@ -972,6 +973,7 @@ static void __init spectre_v2_select_mitigation(void) break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); mode = SPECTRE_V2_LFENCE; break; @@ -1787,6 +1789,9 @@ static char *ibpb_state(void) static ssize_t spectre_v2_show_state(char *buf) { + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); From 0de05d056afdb00eca8c7bbb0c79a3438daf700c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 25 Feb 2022 14:32:28 -0800 Subject: [PATCH 630/773] x86/speculation: Warn about eIBRS + LFENCE + Unprivileged eBPF + SMT The commit 44a3918c8245 ("x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting") added a warning for the "eIBRS + unprivileged eBPF" combination, which has been shown to be vulnerable against Spectre v2 BHB-based attacks. However, there's no warning about the "eIBRS + LFENCE retpoline + unprivileged eBPF" combo. The LFENCE adds more protection by shortening the speculation window after a mispredicted branch. That makes an attack significantly more difficult, even with unprivileged eBPF. So at least for now the logic doesn't warn about that combination. But if you then add SMT into the mix, the SMT attack angle weakens the effectiveness of the LFENCE considerably. So extend the "eIBRS + unprivileged eBPF" warning to also include the "eIBRS + LFENCE + unprivileged eBPF + SMT" case. [ bp: Massage commit message. ] Suggested-by: Alyssa Milburn Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cfd116423908..6296e1ebed1d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -653,12 +653,27 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) { - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } } #endif @@ -1118,6 +1133,10 @@ void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; @@ -1793,7 +1812,11 @@ static ssize_t spectre_v2_show_state(char *buf) return sprintf(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) - return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], From 58dbe9b373df2828d873b1c0e5afc77485b2f376 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Tue, 1 Mar 2022 17:47:43 -0300 Subject: [PATCH 631/773] powerpc/64s: Fix build failure when CONFIG_PPC_64S_HASH_MMU is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following build failure occurs when CONFIG_PPC_64S_HASH_MMU is not set: arch/powerpc/kernel/setup_64.c: In function ‘setup_per_cpu_areas’: arch/powerpc/kernel/setup_64.c:811:21: error: ‘mmu_linear_psize’ undeclared (first use in this function); did you mean ‘mmu_virtual_psize’? 811 | if (mmu_linear_psize == MMU_PAGE_4K) | ^~~~~~~~~~~~~~~~ | mmu_virtual_psize arch/powerpc/kernel/setup_64.c:811:21: note: each undeclared identifier is reported only once for each function it appears in Move the declaration of mmu_linear_psize outside of CONFIG_PPC_64S_HASH_MMU ifdef. After the above is fixed, it fails later with the following error: ld: arch/powerpc/kexec/file_load_64.o: in function `.arch_kexec_kernel_image_probe': file_load_64.c:(.text+0x1c1c): undefined reference to `.add_htab_mem_range' Fix that, too, by conditioning add_htab_mem_range() symbol to CONFIG_PPC_64S_HASH_MMU. Fixes: 387e220a2e5e ("powerpc/64s: Move hash MMU support code under CONFIG_PPC_64S_HASH_MMU") Reported-by: Erhard F. Signed-off-by: Murilo Opsfelder Araujo Signed-off-by: Michael Ellerman BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215567 Link: https://lore.kernel.org/r/20220301204743.45133-1-muriloo@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/include/asm/kexec_ranges.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index ba5b1becf518..006cbec70ffe 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -202,7 +202,6 @@ static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) /* * The current system page and segment sizes */ -extern int mmu_linear_psize; extern int mmu_virtual_psize; extern int mmu_vmalloc_psize; extern int mmu_io_psize; @@ -213,6 +212,7 @@ extern int mmu_io_psize; #define mmu_virtual_psize MMU_PAGE_4K #endif #endif +extern int mmu_linear_psize; extern int mmu_vmemmap_psize; /* MMU initialization */ diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index 7a90000f8d15..f83866a19e87 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -9,7 +9,7 @@ struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); int add_tce_mem_ranges(struct crash_mem **mem_ranges); int add_initrd_mem_range(struct crash_mem **mem_ranges); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU int add_htab_mem_range(struct crash_mem **mem_ranges); #else static inline int add_htab_mem_range(struct crash_mem **mem_ranges) From 9dd78194a3722fa6712192cdd4f7032d45112a9a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Feb 2022 16:45:54 +0000 Subject: [PATCH 632/773] ARM: report Spectre v2 status through sysfs As per other architectures, add support for reporting the Spectre vulnerability status via sysfs CPU. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/spectre.h | 28 +++++++ arch/arm/kernel/Makefile | 2 + arch/arm/kernel/spectre.c | 54 +++++++++++++ arch/arm/mm/Kconfig | 1 + arch/arm/mm/proc-v7-bugs.c | 143 ++++++++++++++++++++++++--------- 5 files changed, 188 insertions(+), 40 deletions(-) create mode 100644 arch/arm/include/asm/spectre.h create mode 100644 arch/arm/kernel/spectre.c diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h new file mode 100644 index 000000000000..8a9019e08dba --- /dev/null +++ b/arch/arm/include/asm/spectre.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_SPECTRE_H +#define __ASM_SPECTRE_H + +enum { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +enum { + __SPECTRE_V2_METHOD_BPIALL, + __SPECTRE_V2_METHOD_ICIALLU, + __SPECTRE_V2_METHOD_SMC, + __SPECTRE_V2_METHOD_HVC, +}; + +enum { + SPECTRE_V2_METHOD_BPIALL = BIT(__SPECTRE_V2_METHOD_BPIALL), + SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), + SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), + SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), +}; + +void spectre_v2_update_state(unsigned int state, unsigned int methods); + +#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index ae295a3bcfef..6ef3b535b7bf 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -106,4 +106,6 @@ endif obj-$(CONFIG_HAVE_ARM_SMCCC) += smccc-call.o +obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += spectre.o + extra-y := $(head-y) vmlinux.lds diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c new file mode 100644 index 000000000000..6f6dd1cfd099 --- /dev/null +++ b/arch/arm/kernel/spectre.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +static unsigned int spectre_v2_state; +static unsigned int spectre_v2_methods; + +void spectre_v2_update_state(unsigned int state, unsigned int method) +{ + if (state > spectre_v2_state) + spectre_v2_state = state; + spectre_v2_methods |= method; +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const char *method; + + if (spectre_v2_state == SPECTRE_UNAFFECTED) + return sprintf(buf, "%s\n", "Not affected"); + + if (spectre_v2_state != SPECTRE_MITIGATED) + return sprintf(buf, "%s\n", "Vulnerable"); + + switch (spectre_v2_methods) { + case SPECTRE_V2_METHOD_BPIALL: + method = "Branch predictor hardening"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + method = "I-cache invalidation"; + break; + + case SPECTRE_V2_METHOD_SMC: + case SPECTRE_V2_METHOD_HVC: + method = "Firmware call"; + break; + + default: + method = "Multiple mitigations"; + break; + } + + return sprintf(buf, "Mitigation: %s\n", method); +} diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 58afba346729..850329ea9bf8 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -830,6 +830,7 @@ config CPU_BPREDICT_DISABLE config CPU_SPECTRE bool + select GENERIC_CPU_VULNERABILITIES config HARDEN_BRANCH_PREDICTOR bool "Harden the branch predictor against aliasing attacks" if EXPERT diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index 114c05ab4dd9..e438e59bb63e 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -6,8 +6,35 @@ #include #include #include +#include #include +#ifdef CONFIG_ARM_PSCI +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + switch ((int)res.a0) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + + default: + return SPECTRE_VULNERABLE; + } +} +#else +static int __maybe_unused spectre_v2_get_cpu_fw_mitigation_state(void) +{ + return SPECTRE_VULNERABLE; +} +#endif + #ifdef CONFIG_HARDEN_BRANCH_PREDICTOR DEFINE_PER_CPU(harden_branch_predictor_fn_t, harden_branch_predictor_fn); @@ -36,13 +63,60 @@ static void __maybe_unused call_hvc_arch_workaround_1(void) arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); } -static void cpu_v7_spectre_init(void) +static unsigned int spectre_v2_install_workaround(unsigned int method) { const char *spectre_v2_method = NULL; int cpu = smp_processor_id(); if (per_cpu(harden_branch_predictor_fn, cpu)) - return; + return SPECTRE_MITIGATED; + + switch (method) { + case SPECTRE_V2_METHOD_BPIALL: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_bpiall; + spectre_v2_method = "BPIALL"; + break; + + case SPECTRE_V2_METHOD_ICIALLU: + per_cpu(harden_branch_predictor_fn, cpu) = + harden_branch_predictor_iciallu; + spectre_v2_method = "ICIALLU"; + break; + + case SPECTRE_V2_METHOD_HVC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_hvc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_hvc_switch_mm; + spectre_v2_method = "hypervisor"; + break; + + case SPECTRE_V2_METHOD_SMC: + per_cpu(harden_branch_predictor_fn, cpu) = + call_smc_arch_workaround_1; + cpu_do_switch_mm = cpu_v7_smc_switch_mm; + spectre_v2_method = "firmware"; + break; + } + + if (spectre_v2_method) + pr_info("CPU%u: Spectre v2: using %s workaround\n", + smp_processor_id(), spectre_v2_method); + + return SPECTRE_MITIGATED; +} +#else +static unsigned int spectre_v2_install_workaround(unsigned int method) +{ + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n"); + + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_v2_init(void) +{ + unsigned int state, method = 0; switch (read_cpuid_part()) { case ARM_CPU_PART_CORTEX_A8: @@ -51,68 +125,57 @@ static void cpu_v7_spectre_init(void) case ARM_CPU_PART_CORTEX_A17: case ARM_CPU_PART_CORTEX_A73: case ARM_CPU_PART_CORTEX_A75: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_bpiall; - spectre_v2_method = "BPIALL"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; break; case ARM_CPU_PART_CORTEX_A15: case ARM_CPU_PART_BRAHMA_B15: - per_cpu(harden_branch_predictor_fn, cpu) = - harden_branch_predictor_iciallu; - spectre_v2_method = "ICIALLU"; + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_ICIALLU; break; -#ifdef CONFIG_ARM_PSCI case ARM_CPU_PART_BRAHMA_B53: /* Requires no workaround */ + state = SPECTRE_UNAFFECTED; break; + default: /* Other ARM CPUs require no workaround */ - if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) + if (read_cpuid_implementor() == ARM_CPU_IMP_ARM) { + state = SPECTRE_UNAFFECTED; break; - fallthrough; - /* Cortex A57/A72 require firmware workaround */ - case ARM_CPU_PART_CORTEX_A57: - case ARM_CPU_PART_CORTEX_A72: { - struct arm_smccc_res res; + } - arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 != 0) - return; + fallthrough; + + /* Cortex A57/A72 require firmware workaround */ + case ARM_CPU_PART_CORTEX_A57: + case ARM_CPU_PART_CORTEX_A72: + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + break; switch (arm_smccc_1_1_get_conduit()) { case SMCCC_CONDUIT_HVC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_hvc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_hvc_switch_mm; - spectre_v2_method = "hypervisor"; + method = SPECTRE_V2_METHOD_HVC; break; case SMCCC_CONDUIT_SMC: - per_cpu(harden_branch_predictor_fn, cpu) = - call_smc_arch_workaround_1; - cpu_do_switch_mm = cpu_v7_smc_switch_mm; - spectre_v2_method = "firmware"; + method = SPECTRE_V2_METHOD_SMC; break; default: + state = SPECTRE_VULNERABLE; break; } } -#endif - } - if (spectre_v2_method) - pr_info("CPU%u: Spectre v2: using %s workaround\n", - smp_processor_id(), spectre_v2_method); + if (state == SPECTRE_MITIGATED) + state = spectre_v2_install_workaround(method); + + spectre_v2_update_state(state, method); } -#else -static void cpu_v7_spectre_init(void) -{ -} -#endif static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) @@ -142,16 +205,16 @@ static bool check_spectre_auxcr(bool *warned, u32 bit) void cpu_v7_ca8_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(6))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_ca15_ibe(void) { if (check_spectre_auxcr(this_cpu_ptr(&spectre_warned), BIT(0))) - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } void cpu_v7_bugs_init(void) { - cpu_v7_spectre_init(); + cpu_v7_spectre_v2_init(); } From 04e91b7324760a377a725e218b5ee783826d30f5 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Feb 2022 19:46:15 +0000 Subject: [PATCH 633/773] ARM: early traps initialisation Provide a couple of helpers to copy the vectors and stubs, and also to flush the copied vectors and stubs. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/traps.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 195dff58bafc..abc82dfeab51 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -787,10 +787,22 @@ static inline void __init kuser_init(void *vectors) } #endif +#ifndef CONFIG_CPU_V7M +static void copy_from_lma(void *vma, void *lma_start, void *lma_end) +{ + memcpy(vma, lma_start, lma_end - lma_start); +} + +static void flush_vectors(void *vma, size_t offset, size_t size) +{ + unsigned long start = (unsigned long)vma + offset; + unsigned long end = start + size; + + flush_icache_range(start, end); +} + void __init early_trap_init(void *vectors_base) { -#ifndef CONFIG_CPU_V7M - unsigned long vectors = (unsigned long)vectors_base; extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; unsigned i; @@ -811,17 +823,20 @@ void __init early_trap_init(void *vectors_base) * into the vector page, mapped at 0xffff0000, and ensure these * are visible to the instruction stream. */ - memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start); - memcpy((void *)vectors + 0x1000, __stubs_start, __stubs_end - __stubs_start); + copy_from_lma(vectors_base, __vectors_start, __vectors_end); + copy_from_lma(vectors_base + 0x1000, __stubs_start, __stubs_end); kuser_init(vectors_base); - flush_icache_range(vectors, vectors + PAGE_SIZE * 2); + flush_vectors(vectors_base, 0, PAGE_SIZE * 2); +} #else /* ifndef CONFIG_CPU_V7M */ +void __init early_trap_init(void *vectors_base) +{ /* * on V7-M there is no need to copy the vector table to a dedicated * memory area. The address is configurable and so a table in the kernel * image can be used. */ -#endif } +#endif From 8d9d651ff2270a632e9dc497b142db31e8911315 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Feb 2022 19:49:50 +0000 Subject: [PATCH 634/773] ARM: use LOADADDR() to get load address of sections Use the linker's LOADADDR() macro to get the load address of the sections, and provide a macro to set the start and end symbols. Acked-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/vmlinux.lds.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index 4a91428c324d..e02710d17cf9 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -26,6 +26,11 @@ #define ARM_MMU_DISCARD(x) x #endif +/* Set start/end symbol names to the LMA for the section */ +#define ARM_LMA(sym, section) \ + sym##_start = LOADADDR(section); \ + sym##_end = LOADADDR(section) + SIZEOF(section) + #define PROC_INFO \ . = ALIGN(4); \ __proc_info_begin = .; \ @@ -110,19 +115,19 @@ * only thing that matters is their relative offsets */ #define ARM_VECTORS \ - __vectors_start = .; \ + __vectors_lma = .; \ .vectors 0xffff0000 : AT(__vectors_start) { \ *(.vectors) \ } \ - . = __vectors_start + SIZEOF(.vectors); \ - __vectors_end = .; \ + ARM_LMA(__vectors, .vectors); \ + . = __vectors_lma + SIZEOF(.vectors); \ \ - __stubs_start = .; \ - .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) { \ + __stubs_lma = .; \ + .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ *(.stubs) \ } \ - . = __stubs_start + SIZEOF(.stubs); \ - __stubs_end = .; \ + ARM_LMA(__stubs, .stubs); \ + . = __stubs_lma + SIZEOF(.stubs); \ \ PROVIDE(vector_fiq_offset = vector_fiq - ADDR(.vectors)); From b9baf5c8c5c356757f4f9d8180b5e9d234065bc3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 10 Feb 2022 16:05:45 +0000 Subject: [PATCH 635/773] ARM: Spectre-BHB workaround Workaround the Spectre BHB issues for Cortex-A15, Cortex-A57, Cortex-A72, Cortex-A73 and Cortex-A75. We also include Brahma B15 as well to be safe, which is affected by Spectre V2 in the same ways as Cortex-A15. Reviewed-by: Catalin Marinas Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/assembler.h | 10 ++++ arch/arm/include/asm/spectre.h | 4 ++ arch/arm/include/asm/vmlinux.lds.h | 18 +++++-- arch/arm/kernel/entry-armv.S | 79 +++++++++++++++++++++++++++--- arch/arm/kernel/entry-common.S | 24 +++++++++ arch/arm/kernel/spectre.c | 4 ++ arch/arm/kernel/traps.c | 38 ++++++++++++++ arch/arm/mm/Kconfig | 10 ++++ arch/arm/mm/proc-v7-bugs.c | 76 ++++++++++++++++++++++++++++ 9 files changed, 254 insertions(+), 9 deletions(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 7d23d4bb2168..49ea603fc8e3 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -107,6 +107,16 @@ .endm #endif +#if __LINUX_ARM_ARCH__ < 7 + .macro dsb, args + mcr p15, 0, r0, c7, c10, 4 + .endm + + .macro isb, args + mcr p15, 0, r0, c7, r5, 4 + .endm +#endif + .macro asm_trace_hardirqs_off, save=1 #if defined(CONFIG_TRACE_IRQFLAGS) .if \save diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h index 8a9019e08dba..d1fa5607d3aa 100644 --- a/arch/arm/include/asm/spectre.h +++ b/arch/arm/include/asm/spectre.h @@ -14,6 +14,7 @@ enum { __SPECTRE_V2_METHOD_ICIALLU, __SPECTRE_V2_METHOD_SMC, __SPECTRE_V2_METHOD_HVC, + __SPECTRE_V2_METHOD_LOOP8, }; enum { @@ -21,8 +22,11 @@ enum { SPECTRE_V2_METHOD_ICIALLU = BIT(__SPECTRE_V2_METHOD_ICIALLU), SPECTRE_V2_METHOD_SMC = BIT(__SPECTRE_V2_METHOD_SMC), SPECTRE_V2_METHOD_HVC = BIT(__SPECTRE_V2_METHOD_HVC), + SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), }; void spectre_v2_update_state(unsigned int state, unsigned int methods); +int spectre_bhb_update_vectors(unsigned int method); + #endif diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index e02710d17cf9..0ef21bfae9f6 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -116,11 +116,23 @@ */ #define ARM_VECTORS \ __vectors_lma = .; \ - .vectors 0xffff0000 : AT(__vectors_start) { \ - *(.vectors) \ + OVERLAY 0xffff0000 : NOCROSSREFS AT(__vectors_lma) { \ + .vectors { \ + *(.vectors) \ + } \ + .vectors.bhb.loop8 { \ + *(.vectors.bhb.loop8) \ + } \ + .vectors.bhb.bpiall { \ + *(.vectors.bhb.bpiall) \ + } \ } \ ARM_LMA(__vectors, .vectors); \ - . = __vectors_lma + SIZEOF(.vectors); \ + ARM_LMA(__vectors_bhb_loop8, .vectors.bhb.loop8); \ + ARM_LMA(__vectors_bhb_bpiall, .vectors.bhb.bpiall); \ + . = __vectors_lma + SIZEOF(.vectors) + \ + SIZEOF(.vectors.bhb.loop8) + \ + SIZEOF(.vectors.bhb.bpiall); \ \ __stubs_lma = .; \ .stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 5cd057859fe9..676703cbfe4b 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1002,12 +1002,11 @@ vector_\name: sub lr, lr, #\correction .endif - @ - @ Save r0, lr_ (parent PC) and spsr_ - @ (parent CPSR) - @ + @ Save r0, lr_ (parent PC) stmia sp, {r0, lr} @ save r0, lr - mrs lr, spsr + + @ Save spsr_ (parent CPSR) +2: mrs lr, spsr str lr, [sp, #8] @ save spsr @ @@ -1028,6 +1027,44 @@ vector_\name: movs pc, lr @ branch to handler in SVC mode ENDPROC(vector_\name) +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .subsection 1 + .align 5 +vector_bhb_loop8_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mov r0, #8 +1: b . + 4 + subs r0, r0, #1 + bne 1b + dsb + isb + b 2b +ENDPROC(vector_bhb_loop8_\name) + +vector_bhb_bpiall_\name: + .if \correction + sub lr, lr, #\correction + .endif + + @ Save r0, lr_ (parent PC) + stmia sp, {r0, lr} + + @ bhb workaround + mcr p15, 0, r0, c7, c5, 6 @ BPIALL + @ isb not needed due to "movs pc, lr" in the vector stub + @ which gives a "context synchronisation". + b 2b +ENDPROC(vector_bhb_bpiall_\name) + .previous +#endif + .align 2 @ handler addresses follow this label 1: @@ -1036,6 +1073,10 @@ ENDPROC(vector_\name) .section .stubs, "ax", %progbits @ This must be the first word .word vector_swi +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .word vector_bhb_loop8_swi + .word vector_bhb_bpiall_swi +#endif vector_rst: ARM( swi SYS_ERROR0 ) @@ -1150,8 +1191,10 @@ vector_addrexcptn: * FIQ "NMI" handler *----------------------------------------------------------------------------- * Handle a FIQ using the SVC stack allowing FIQ act like NMI on x86 - * systems. + * systems. This must be the last vector stub, so lets place it in its own + * subsection. */ + .subsection 2 vector_stub fiq, FIQ_MODE, 4 .long __fiq_usr @ 0 (USR_26 / USR_32) @@ -1184,6 +1227,30 @@ vector_addrexcptn: W(b) vector_irq W(b) vector_fiq +#ifdef CONFIG_HARDEN_BRANCH_HISTORY + .section .vectors.bhb.loop8, "ax", %progbits +.L__vectors_bhb_loop8_start: + W(b) vector_rst + W(b) vector_bhb_loop8_und + W(ldr) pc, .L__vectors_bhb_loop8_start + 0x1004 + W(b) vector_bhb_loop8_pabt + W(b) vector_bhb_loop8_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_loop8_irq + W(b) vector_bhb_loop8_fiq + + .section .vectors.bhb.bpiall, "ax", %progbits +.L__vectors_bhb_bpiall_start: + W(b) vector_rst + W(b) vector_bhb_bpiall_und + W(ldr) pc, .L__vectors_bhb_bpiall_start + 0x1008 + W(b) vector_bhb_bpiall_pabt + W(b) vector_bhb_bpiall_dabt + W(b) vector_addrexcptn + W(b) vector_bhb_bpiall_irq + W(b) vector_bhb_bpiall_fiq +#endif + .data .align 2 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index ac86c34682bb..dbc1913ee30b 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -153,6 +153,29 @@ ENDPROC(ret_from_fork) *----------------------------------------------------------------------------- */ + .align 5 +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +ENTRY(vector_bhb_loop8_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mov r8, #8 +1: b 2f +2: subs r8, r8, #1 + bne 1b + dsb + isb + b 3f +ENDPROC(vector_bhb_loop8_swi) + + .align 5 +ENTRY(vector_bhb_bpiall_swi) + sub sp, sp, #PT_REGS_SIZE + stmia sp, {r0 - r12} + mcr p15, 0, r8, c7, c5, 6 @ BPIALL + isb + b 3f +ENDPROC(vector_bhb_bpiall_swi) +#endif .align 5 ENTRY(vector_swi) #ifdef CONFIG_CPU_V7M @@ -160,6 +183,7 @@ ENTRY(vector_swi) #else sub sp, sp, #PT_REGS_SIZE stmia sp, {r0 - r12} @ Calling r0 - r12 +3: ARM( add r8, sp, #S_PC ) ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr THUMB( mov r8, sp ) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index 6f6dd1cfd099..ade967f18d06 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -45,6 +45,10 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, method = "Firmware call"; break; + case SPECTRE_V2_METHOD_LOOP8: + method = "History overwrite"; + break; + default: method = "Multiple mitigations"; break; diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index abc82dfeab51..90c887aa67a4 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -801,6 +802,43 @@ static void flush_vectors(void *vma, size_t offset, size_t size) flush_icache_range(start, end); } +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +int spectre_bhb_update_vectors(unsigned int method) +{ + extern char __vectors_bhb_bpiall_start[], __vectors_bhb_bpiall_end[]; + extern char __vectors_bhb_loop8_start[], __vectors_bhb_loop8_end[]; + void *vec_start, *vec_end; + + if (system_state >= SYSTEM_FREEING_INITMEM) { + pr_err("CPU%u: Spectre BHB workaround too late - system vulnerable\n", + smp_processor_id()); + return SPECTRE_VULNERABLE; + } + + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + vec_start = __vectors_bhb_loop8_start; + vec_end = __vectors_bhb_loop8_end; + break; + + case SPECTRE_V2_METHOD_BPIALL: + vec_start = __vectors_bhb_bpiall_start; + vec_end = __vectors_bhb_bpiall_end; + break; + + default: + pr_err("CPU%u: unknown Spectre BHB state %d\n", + smp_processor_id(), method); + return SPECTRE_VULNERABLE; + } + + copy_from_lma(vectors_page, vec_start, vec_end); + flush_vectors(vectors_page, 0, vec_end - vec_start); + + return SPECTRE_MITIGATED; +} +#endif + void __init early_trap_init(void *vectors_base) { extern char __stubs_start[], __stubs_end[]; diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 850329ea9bf8..9724c16e9076 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -851,6 +851,16 @@ config HARDEN_BRANCH_PREDICTOR If unsure, say Y. +config HARDEN_BRANCH_HISTORY + bool "Harden Spectre style attacks against branch history" if EXPERT + depends on CPU_SPECTRE + default y + help + Speculation attacks against some high-performance processors can + make use of branch history to influence future speculation. When + taking an exception, a sequence of branches overwrites the branch + history, or branch history is invalidated. + config TLS_REG_EMUL bool select NEED_KUSER_HELPERS diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index e438e59bb63e..c226feab2457 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -177,6 +177,81 @@ static void cpu_v7_spectre_v2_init(void) spectre_v2_update_state(state, method); } +#ifdef CONFIG_HARDEN_BRANCH_HISTORY +static int spectre_bhb_method; + +static const char *spectre_bhb_method_name(int method) +{ + switch (method) { + case SPECTRE_V2_METHOD_LOOP8: + return "loop"; + + case SPECTRE_V2_METHOD_BPIALL: + return "BPIALL"; + + default: + return "unknown"; + } +} + +static int spectre_bhb_install_workaround(int method) +{ + if (spectre_bhb_method != method) { + if (spectre_bhb_method) { + pr_err("CPU%u: Spectre BHB: method disagreement, system vulnerable\n", + smp_processor_id()); + + return SPECTRE_VULNERABLE; + } + + if (spectre_bhb_update_vectors(method) == SPECTRE_VULNERABLE) + return SPECTRE_VULNERABLE; + + spectre_bhb_method = method; + } + + pr_info("CPU%u: Spectre BHB: using %s workaround\n", + smp_processor_id(), spectre_bhb_method_name(method)); + + return SPECTRE_MITIGATED; +} +#else +static int spectre_bhb_install_workaround(int method) +{ + return SPECTRE_VULNERABLE; +} +#endif + +static void cpu_v7_spectre_bhb_init(void) +{ + unsigned int state, method = 0; + + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A15: + case ARM_CPU_PART_BRAHMA_B15: + case ARM_CPU_PART_CORTEX_A57: + case ARM_CPU_PART_CORTEX_A72: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_LOOP8; + break; + + case ARM_CPU_PART_CORTEX_A73: + case ARM_CPU_PART_CORTEX_A75: + state = SPECTRE_MITIGATED; + method = SPECTRE_V2_METHOD_BPIALL; + break; + + default: + state = SPECTRE_UNAFFECTED; + break; + } + + if (state == SPECTRE_MITIGATED) + state = spectre_bhb_install_workaround(method); + + spectre_v2_update_state(state, method); +} + static __maybe_unused bool cpu_v7_check_auxcr_set(bool *warned, u32 mask, const char *msg) { @@ -217,4 +292,5 @@ void cpu_v7_ca15_ibe(void) void cpu_v7_bugs_init(void) { cpu_v7_spectre_v2_init(); + cpu_v7_spectre_bhb_init(); } From c6a502c2299941c8326d029cfc8a3bc8a4607ad5 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 4 Mar 2022 21:25:36 +0300 Subject: [PATCH 636/773] mISDN: Fix memory leak in dsp_pipeline_build() dsp_pipeline_build() allocates dup pointer by kstrdup(cfg), but then it updates dup variable by strsep(&dup, "|"). As a result when it calls kfree(dup), the dup variable contains NULL. Found by Linux Driver Verification project (linuxtesting.org) with SVACE. Signed-off-by: Alexey Khoroshilov Fixes: 960366cf8dbb ("Add mISDN DSP") Signed-off-by: David S. Miller --- drivers/isdn/mISDN/dsp_pipeline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/isdn/mISDN/dsp_pipeline.c b/drivers/isdn/mISDN/dsp_pipeline.c index e11ca6bbc7f4..c3b2c99b5cd5 100644 --- a/drivers/isdn/mISDN/dsp_pipeline.c +++ b/drivers/isdn/mISDN/dsp_pipeline.c @@ -192,7 +192,7 @@ void dsp_pipeline_destroy(struct dsp_pipeline *pipeline) int dsp_pipeline_build(struct dsp_pipeline *pipeline, const char *cfg) { int found = 0; - char *dup, *tok, *name, *args; + char *dup, *next, *tok, *name, *args; struct dsp_element_entry *entry, *n; struct dsp_pipeline_entry *pipeline_entry; struct mISDN_dsp_element *elem; @@ -203,10 +203,10 @@ int dsp_pipeline_build(struct dsp_pipeline *pipeline, const char *cfg) if (!list_empty(&pipeline->list)) _dsp_pipeline_destroy(pipeline); - dup = kstrdup(cfg, GFP_ATOMIC); + dup = next = kstrdup(cfg, GFP_ATOMIC); if (!dup) return 0; - while ((tok = strsep(&dup, "|"))) { + while ((tok = strsep(&next, "|"))) { if (!strlen(tok)) continue; name = strsep(&tok, "("); From ff712a627f7296a42ea5d7356704525e1e909e05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 4 Mar 2022 20:28:48 -0800 Subject: [PATCH 637/773] selftests/vm: cleanup hugetlb file after mremap test The hugepage-mremap test will create a file in a hugetlb filesystem. In a default 'run_vmtests' run, the file will contain all the hugetlb pages. After the test, the file remains and there are no free hugetlb pages for subsequent tests. This causes those hugetlb tests to fail. Change hugepage-mremap to take the name of the hugetlb file as an argument. Unlink the file within the test, and just to be sure remove the file in the run_vmtests script. Link: https://lkml.kernel.org/r/20220201033459.156944-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Shuah Khan Acked-by: Yosry Ahmed Reviewed-by: Muchun Song Reviewed-by: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/hugepage-mremap.c | 26 ++++++++++++++------ tools/testing/selftests/vm/run_vmtests.sh | 3 ++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 2a7c33631a29..1d689084a54b 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -3,9 +3,10 @@ * hugepage-mremap: * * Example of remapping huge page memory in a user application using the - * mremap system call. Code assumes a hugetlbfs filesystem is mounted - * at './huge'. The amount of memory used by this test is decided by a command - * line argument in MBs. If missing, the default amount is 10MB. + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. If no memory + * amount is passed, the default amount is 10MB. * * To make sure the test triggers pmd sharing and goes through the 'unshare' * path in the mremap code use 1GB (1024) or more. @@ -25,7 +26,6 @@ #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) -#define FILE_NAME "huge/hugepagefile" #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) #define FLAGS (MAP_SHARED | MAP_ANONYMOUS) @@ -107,17 +107,26 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { + size_t length; + + if (argc != 2 && argc != 3) { + printf("Usage: %s [length_in_MB] \n", argv[0]); + exit(1); + } + /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. Any additional args are ignored. + * the default length. */ - size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL; + if (argc == 3) + length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL; length = length > 0 ? length : DEFAULT_LENGTH_MB; length = MB_TO_BYTES(length); int ret = 0; - int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); + /* last arg is the hugetlb file name */ + int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755); if (fd < 0) { perror("Open failed"); @@ -169,5 +178,8 @@ int main(int argc, char *argv[]) munmap(addr, length); + close(fd); + unlink(argv[argc-1]); + return ret; } diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 75d401741394..71d2dc198fc1 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -111,13 +111,14 @@ fi echo "-----------------------" echo "running hugepage-mremap" echo "-----------------------" -./hugepage-mremap 256 +./hugepage-mremap $mnt/huge_mremap if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 else echo "[PASS]" fi +rm -f $mnt/huge_mremap echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" From 5c26f6ac9416b63d093e29c30e79b3297e425472 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:51 -0800 Subject: [PATCH 638/773] mm: refactor vm_area_struct::anon_vma_name usage code Avoid mixing strings and their anon_vma_name referenced pointers by using struct anon_vma_name whenever possible. This simplifies the code and allows easier sharing of anon_vma_name structures when they represent the same name. [surenb@google.com: fix comment] Link: https://lkml.kernel.org/r/20220223153613.835563-1-surenb@google.com Link: https://lkml.kernel.org/r/20220224231834.1481408-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Colin Cross Cc: Sumit Semwal Cc: Dave Hansen Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Johannes Weiner Cc: "Eric W. Biederman" Cc: Christian Brauner Cc: Alexey Gladkov Cc: Sasha Levin Cc: Chris Hyser Cc: Davidlohr Bueso Cc: Peter Collingbourne Cc: Xiaofeng Cao Cc: David Hildenbrand Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 +-- fs/userfaultfd.c | 6 +-- include/linux/mm.h | 7 +-- include/linux/mm_inline.h | 89 ++++++++++++++++++++++++++------------- include/linux/mm_types.h | 5 ++- kernel/fork.c | 4 +- kernel/sys.c | 19 ++++++--- mm/madvise.c | 87 +++++++++++++------------------------- mm/mempolicy.c | 2 +- mm/mlock.c | 2 +- mm/mmap.c | 12 +++--- mm/mprotect.c | 2 +- 12 files changed, 126 insertions(+), 115 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6e97ed775074..2c48b1eaaa9c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - const char *anon_name; + struct anon_vma_name *anon_name; if (!mm) { name = "[vdso]"; @@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = vma_anon_name(vma); + anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name); + seq_printf(m, "[anon:%s]", anon_name->name); } } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e26b10132d47..8e03b3d3f5fa 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) vma = prev; else @@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; goto next; @@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX, vma_anon_name(vma)); + NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..5744a3fc4716 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2626,7 +2626,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, const char *); + struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3372,11 +3372,12 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name); + unsigned long len_in, + struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) { + unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..dd3accaa4e6d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -140,50 +140,81 @@ static __always_inline void del_page_from_lru_list(struct page *page, #ifdef CONFIG_ANON_VMA_NAME /* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name_alloc(const char *name); +extern void anon_vma_name_free(struct kref *kref); /* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { - const char *vma_name = vma_anon_name(vma); + if (anon_name) + kref_get(&anon_name->kref); +} - /* either both NULL, or pointers to same string */ - if (vma_name == name) +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, anon_vma_name_free); +} + +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + struct anon_vma_name *anon_name = anon_vma_name(orig_vma); + + if (anon_name) { + anon_vma_name_get(anon_name); + new_vma->anon_name = anon_name; + } +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) +{ + /* + * Not using anon_vma_name because it generates a warning if mmap_lock + * is not held, which might be the case here. + */ + if (!vma->vm_file) + anon_vma_name_put(vma->anon_name); +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + if (anon_name1 == anon_name2) return true; - return name && vma_name && !strcmp(name, vma_name); + return anon_name1 && anon_name2 && + !strcmp(anon_name1->name, anon_name2->name); } + #else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} + +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_anon_vma_name(struct vm_area_struct *vma) {} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) { return true; } + #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb486..0f549870da6a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -416,7 +416,10 @@ struct vm_area_struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; - /* Serialized by mmap_sem. */ + /* + * Serialized by mmap_sem. Never use directly because it is + * valid only when vm_file is NULL. Use anon_vma_name instead. + */ struct anon_vma_name *anon_name; }; diff --git a/kernel/fork.c b/kernel/fork.c index a024bf6254df..f1e89007f228 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -366,14 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; - dup_vma_anon_name(orig, new); + dup_anon_vma_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { - free_vma_anon_name(vma); + free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 97dc9e5d6bf9..5b0e172c4d47 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2286,15 +2287,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, { struct mm_struct *mm = current->mm; const char __user *uname; - char *name, *pch; + struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: uname = (const char __user *)arg; if (uname) { - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + char *name, *pch; + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); @@ -2304,15 +2306,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return -EINVAL; } } - } else { - /* Reset the name */ - name = NULL; + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + } mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, name); + error = madvise_set_anon_name(mm, addr, size, anon_name); mmap_write_unlock(mm); - kfree(name); + anon_vma_name_put(anon_name); break; default: error = -EINVAL; diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df464..081b1cded21e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -65,7 +65,7 @@ static int madvise_need_mmap_write(int behavior) } #ifdef CONFIG_ANON_VMA_NAME -static struct anon_vma_name *anon_vma_name_alloc(const char *name) +struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; @@ -81,78 +81,49 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name) return anon_name; } -static void vma_anon_name_free(struct kref *kref) +void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } -static inline bool has_vma_anon_name(struct vm_area_struct *vma) +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - return !vma->vm_file && vma->anon_name; -} - -const char *vma_anon_name(struct vm_area_struct *vma) -{ - if (!has_vma_anon_name(vma)) - return NULL; - mmap_assert_locked(vma->vm_mm); - return vma->anon_name->name; -} + if (vma->vm_file) + return NULL; -void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) -{ - if (!has_vma_anon_name(orig_vma)) - return; - - kref_get(&orig_vma->anon_name->kref); - new_vma->anon_name = orig_vma->anon_name; -} - -void free_vma_anon_name(struct vm_area_struct *vma) -{ - struct anon_vma_name *anon_name; - - if (!has_vma_anon_name(vma)) - return; - - anon_name = vma->anon_name; - vma->anon_name = NULL; - kref_put(&anon_name->kref, vma_anon_name_free); + return vma->anon_name; } /* mmap_lock should be write-locked */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - const char *anon_name; + struct anon_vma_name *orig_name = anon_vma_name(vma); - if (!name) { - free_vma_anon_name(vma); + if (!anon_name) { + vma->anon_name = NULL; + anon_vma_name_put(orig_name); return 0; } - anon_name = vma_anon_name(vma); - if (anon_name) { - /* Same name, nothing to do here */ - if (!strcmp(name, anon_name)) - return 0; + if (anon_vma_name_eq(orig_name, anon_name)) + return 0; - free_vma_anon_name(vma); - } - vma->anon_name = anon_vma_name_alloc(name); - if (!vma->anon_name) - return -ENOMEM; + anon_vma_name_get(anon_name); + vma->anon_name = anon_name; + anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) { - if (name) + if (anon_name) return -EINVAL; return 0; @@ -165,13 +136,13 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, - const char *name) + struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } @@ -179,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, name); + vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; goto success; @@ -209,7 +180,7 @@ success: */ vma->vm_flags = new_flags; if (!vma->vm_file) { - error = replace_vma_anon_name(vma, name); + error = replace_anon_vma_name(vma, anon_name); if (error) return error; } @@ -1041,7 +1012,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, } error = madvise_update_vma(vma, prev, start, end, new_flags, - vma_anon_name(vma)); + anon_vma_name(vma)); out: /* @@ -1225,7 +1196,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long name) + unsigned long anon_name) { int error; @@ -1234,7 +1205,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (const char *)name); + (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1246,7 +1217,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - unsigned long len_in, const char *name) + unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; @@ -1266,7 +1237,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)name, + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 028e8dd82b44..69284d3b5e53 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, - vma_anon_name(vma)); + anon_vma_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index 8f584eddd305..25934e7db3e1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index d445c1b9d606..f61a15474dd6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1031,7 +1031,7 @@ again: static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; - if (!is_same_vma_anon_name(vma, anon_name)) + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) return 0; return 1; } @@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { @@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { @@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char *anon_name) + struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -3256,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index 5ca3fbcb1495..2887644fd150 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_anon_name(vma)); + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); From 96403e11283def1d1c465c8279514c9a504d8630 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:55 -0800 Subject: [PATCH 639/773] mm: prevent vm_area_struct::anon_name refcount saturation A deep process chain with many vmas could grow really high. With default sysctl_max_map_count (64k) and default pid_max (32k) the max number of vmas in the system is 2147450880 and the refcounter has headroom of 1073774592 before it reaches REFCOUNT_SATURATED (3221225472). Therefore it's unlikely that an anonymous name refcounter will overflow with these defaults. Currently the max for pid_max is PID_MAX_LIMIT (4194304) and for sysctl_max_map_count it's INT_MAX (2147483647). In this configuration anon_vma_name refcount overflow becomes theoretically possible (that still require heavy sharing of that anon_vma_name between processes). kref refcounting interface used in anon_vma_name structure will detect a counter overflow when it reaches REFCOUNT_SATURATED value but will only generate a warning and freeze the ref counter. This would lead to the refcounted object never being freed. A determined attacker could leak memory like that but it would be rather expensive and inefficient way to do so. To ensure anon_vma_name refcount does not overflow, stop anon_vma_name sharing when the refcount reaches REFCOUNT_MAX (2147483647), which still leaves INT_MAX/2 (1073741823) values before the counter reaches REFCOUNT_SATURATED. This should provide enough headroom for raising the refcounts temporarily. Link: https://lkml.kernel.org/r/20220223153613.835563-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Alexey Gladkov Cc: Chris Hyser Cc: Christian Brauner Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Peter Collingbourne Cc: Sasha Levin Cc: Sumit Semwal Cc: Vlastimil Babka Cc: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 18 ++++++++++++++---- mm/madvise.c | 3 +-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index dd3accaa4e6d..cf90b1fa2c60 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -161,15 +161,25 @@ static inline void anon_vma_name_put(struct anon_vma_name *anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } +static inline +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) +{ + /* Prevent anon_name refcount saturation early on */ + if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { + anon_vma_name_get(anon_name); + return anon_name; + + } + return anon_vma_name_alloc(anon_name->name); +} + static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); - if (anon_name) { - anon_vma_name_get(anon_name); - new_vma->anon_name = anon_name; - } + if (anon_name) + new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) diff --git a/mm/madvise.c b/mm/madvise.c index 081b1cded21e..1f2693dccf7b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -113,8 +113,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, if (anon_vma_name_eq(orig_name, anon_name)) return 0; - anon_vma_name_get(anon_name); - vma->anon_name = anon_name; + vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; From 942341dcc5748d9c1fc7009a359fc1916bfe0ef0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:58 -0800 Subject: [PATCH 640/773] mm: fix use-after-free when anon vma name is used after vma is freed When adjacent vmas are being merged it can result in the vma that was originally passed to madvise_update_vma being destroyed. In the current implementation, the name parameter passed to madvise_update_vma points directly to vma->anon_name and it is used after the call to vma_merge. In the cases when vma_merge merges the original vma and destroys it, this might result in UAF. For that the original vma would have to hold the anon_vma_name with the last reference. The following vma would need to contain a different anon_vma_name object with the same string. Such scenario is shown below: madvise_vma_behavior(vma) madvise_update_vma(vma, ..., anon_name == vma->anon_name) vma_merge(vma) __vma_adjust(vma) <-- merges vma with adjacent one vm_area_free(vma) <-- frees the original vma replace_vma_anon_name(anon_name) <-- UAF of vma->anon_name Fix this by raising the name refcount and stabilizing it. Link: https://lkml.kernel.org/r/20220224231834.1481408-3-surenb@google.com Link: https://lkml.kernel.org/r/20220223153613.835563-3-surenb@google.com Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory") Signed-off-by: Suren Baghdasaryan Reported-by: syzbot+aa7b3d4b35f9dc46a366@syzkaller.appspotmail.com Acked-by: Michal Hocko Cc: Alexey Gladkov Cc: Chris Hyser Cc: Christian Brauner Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Collingbourne Cc: Sasha Levin Cc: Sumit Semwal Cc: Vlastimil Babka Cc: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 1f2693dccf7b..38d0f515d548 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -131,6 +131,8 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; + * Caller should ensure anon_name stability by raising its refcount even when + * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, @@ -945,6 +947,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, unsigned long behavior) { int error; + struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; switch (behavior) { @@ -1010,8 +1013,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } + anon_name = anon_vma_name(vma); + anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, - anon_vma_name(vma)); + anon_name); + anon_vma_name_put(anon_name); out: /* From f2b277c4d1c63a85127e8aa2588e9cc3bd21cb99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 4 Mar 2022 20:29:01 -0800 Subject: [PATCH 641/773] memfd: fix F_SEAL_WRITE after shmem huge page allocated Wangyong reports: after enabling tmpfs filesystem to support transparent hugepage with the following command: echo always > /sys/kernel/mm/transparent_hugepage/shmem_enabled the docker program tries to add F_SEAL_WRITE through the following command, but it fails unexpectedly with errno EBUSY: fcntl(5, F_ADD_SEALS, F_SEAL_WRITE) = -1. That is because memfd_tag_pins() and memfd_wait_for_pins() were never updated for shmem huge pages: checking page_mapcount() against page_count() is hopeless on THP subpages - they need to check total_mapcount() against page_count() on THP heads only. Make memfd_tag_pins() (compared > 1) as strict as memfd_wait_for_pins() (compared != 1): either can be justified, but given the non-atomic total_mapcount() calculation, it is better now to be strict. Bear in mind that total_mapcount() itself scans all of the THP subpages, when choosing to take an XA_CHECK_SCHED latency break. Also fix the unlikely xa_is_value() case in memfd_wait_for_pins(): if a page has been swapped out since memfd_tag_pins(), then its refcount must have fallen, and so it can safely be untagged. Link: https://lkml.kernel.org/r/a4f79248-df75-2c8c-3df-ba3317ccb5da@google.com Signed-off-by: Hugh Dickins Reported-by: Zeal Robot Reported-by: wangyong Cc: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: CGEL ZTE Cc: Kirill A. Shutemov Cc: Song Liu Cc: Yang Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memfd.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 9f80f162791a..08f5f8304746 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -31,20 +31,28 @@ static void memfd_tag_pins(struct xa_state *xas) { struct page *page; - unsigned int tagged = 0; + int latency = 0; + int cache_count; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, page, ULONG_MAX) { - if (xa_is_value(page)) - continue; - page = find_subpage(page, xas->xa_index); - if (page_count(page) - page_mapcount(page) > 1) - xas_set_mark(xas, MEMFD_TAG_PINNED); + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; - if (++tagged % XA_CHECK_SCHED) + if (!xa_is_value(page) && + page_count(page) - total_mapcount(page) != cache_count) + xas_set_mark(xas, MEMFD_TAG_PINNED); + if (cache_count != 1) + xas_set(xas, page->index + cache_count); + + latency += cache_count; + if (latency < XA_CHECK_SCHED) continue; + latency = 0; xas_pause(xas); xas_unlock_irq(xas); @@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - unsigned int tagged = 0; + int latency = 0; + int cache_count; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; @@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct address_space *mapping) xas_lock_irq(&xas); xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; - if (xa_is_value(page)) - continue; - page = find_subpage(page, xas.xa_index); - if (page_count(page) - page_mapcount(page) != 1) { + + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (!xa_is_value(page) && cache_count != + page_count(page) - total_mapcount(page)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still @@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct address_space *mapping) } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); - if (++tagged % XA_CHECK_SCHED) + + latency += cache_count; + if (latency < XA_CHECK_SCHED) continue; + latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); From b773827e361952b3f53ac6fa4c4e39ccd632102e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 4 Mar 2022 20:29:04 -0800 Subject: [PATCH 642/773] kselftest/vm: fix tests build with old libc The error message when I build vm tests on debian10 (GLIBC 2.28): userfaultfd.c: In function `userfaultfd_pagemap_test': userfaultfd.c:1393:37: error: `MADV_PAGEOUT' undeclared (first use in this function); did you mean `MADV_RANDOM'? if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) ^~~~~~~~~~~~ MADV_RANDOM This patch includes these newer definitions from UAPI linux/mman.h, is useful to fix tests build on systems without these definitions in glibc sys/mman.h. Link: https://lkml.kernel.org/r/20220227055330.43087-2-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 2f49c9af1b58..3fc1d2ee2948 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include From dd21bfa425c098b95ca86845f8e7d1ec1ddf6e4a Mon Sep 17 00:00:00 2001 From: Yun Zhou Date: Fri, 4 Mar 2022 20:29:07 -0800 Subject: [PATCH 643/773] proc: fix documentation and description of pagemap Since bit 57 was exported for uffd-wp write-protected (commit fb8e37f35a2f: "mm/pagemap: export uffd-wp protection information"), fixing it can reduce some unnecessary confusion. Link: https://lkml.kernel.org/r/20220301044538.3042713-1-yun.zhou@windriver.com Fixes: fb8e37f35a2fe1 ("mm/pagemap: export uffd-wp protection information") Signed-off-by: Yun Zhou Reviewed-by: Peter Xu Cc: Jonathan Corbet Cc: Tiberiu A Georgescu Cc: Florian Schmidt Cc: Ivan Teterevkov Cc: SeongJae Park Cc: Yang Shi Cc: David Hildenbrand Cc: Axel Rasmussen Cc: Miaohe Lin Cc: Andrea Arcangeli Cc: Colin Cross Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 2 +- fs/proc/task_mmu.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index bfc28704856c..6e2e416af783 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -23,7 +23,7 @@ There are four components to pagemap: * Bit 56 page exclusively mapped (since 4.2) * Bit 57 pte is uffd-wp write-protected (since 5.13) (see :ref:`Documentation/admin-guide/mm/userfaultfd.rst `) - * Bits 57-60 zero + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon (since 3.5) * Bit 62 page swapped * Bit 63 page present diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2c48b1eaaa9c..f46060eb91b5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1597,7 +1597,8 @@ static const struct mm_walk_ops pagemap_ops = { * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped - * Bits 57-60 zero + * Bit 57 pte is uffd-wp write-protected + * Bits 58-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present From d1eff16d727ff257b706d32114d3881f67cc9c75 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 4 Mar 2022 20:29:10 -0800 Subject: [PATCH 644/773] configs/debug: set CONFIG_DEBUG_INFO=y properly CONFIG_DEBUG_INFO can't be set by user directly, so set CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y instead. Otherwise, we end up with no debuginfo in vmlinux which is a big no-no for kernel debugging. Link: https://lkml.kernel.org/r/20220301202920.18488-1-quic_qiancai@quicinc.com Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/debug.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e9ffb0cc1eec..07df6d93c4df 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -16,7 +16,7 @@ CONFIG_SYMBOLIC_ERRNAME=y # # Compile-time checks and compiler options # -CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_FRAME_WARN=2048 CONFIG_SECTION_MISMATCH_WARN_ONLY=y From a3d9001b4e287fc043e5539d03d71a32ab114bcb Mon Sep 17 00:00:00 2001 From: Kai Lueke Date: Thu, 3 Mar 2022 15:55:10 +0100 Subject: [PATCH 645/773] Revert "xfrm: state and policy should fail if XFRMA_IF_ID 0" This reverts commit 68ac0f3810e76a853b5f7b90601a05c3048b8b54 because ID 0 was meant to be used for configuring the policy/state without matching for a specific interface (e.g., Cilium is affected, see https://github.com/cilium/cilium/pull/18789 and https://github.com/cilium/cilium/pull/19019). Signed-off-by: Kai Lueke Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a4fb596e87af..72b2f173aac8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -630,13 +630,8 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!x->if_id) { - err = -EINVAL; - goto error; - } - } err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -1432,13 +1427,8 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, mark = xfrm_mark_get(attrs, &m); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!if_id) { - err = -EINVAL; - goto out_noput; - } - } if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); @@ -1751,13 +1741,8 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); - if (attrs[XFRMA_IF_ID]) { + if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - if (!xp->if_id) { - err = -EINVAL; - goto error; - } - } return xp; error: From afb3cc1a397d77771f342691b7e6b032a234d7f2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 3 Mar 2022 16:08:40 +0200 Subject: [PATCH 646/773] net: dsa: unlock the rtnl_mutex when dsa_master_setup() fails After the blamed commit, dsa_tree_setup_master() may exit without calling rtnl_unlock(), fix that. Fixes: c146f9bc195a ("net: dsa: hold rtnl_mutex when calling dsa_master_{setup,teardown}") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b4e67758e104..074e4a69a728 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1058,7 +1058,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { struct dsa_port *dp; - int err; + int err = 0; rtnl_lock(); @@ -1066,13 +1066,13 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst) if (dsa_port_is_cpu(dp)) { err = dsa_master_setup(dp->master, dp); if (err) - return err; + break; } } rtnl_unlock(); - return 0; + return err; } static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) From e2ae38cf3d91837a493cb2093c87700ff3cbe667 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sat, 5 Mar 2022 15:25:25 +0530 Subject: [PATCH 647/773] vhost: fix hung thread due to erroneous iotlb entries In vhost_iotlb_add_range_ctx(), range size can overflow to 0 when start is 0 and last is ULONG_MAX. One instance where it can happen is when userspace sends an IOTLB message with iova=size=uaddr=0 (vhost_process_iotlb_msg). So, an entry with size = 0, start = 0, last = ULONG_MAX ends up in the iotlb. Next time a packet is sent, iotlb_access_ok() loops indefinitely due to that erroneous entry. Call Trace: iotlb_access_ok+0x21b/0x3e0 drivers/vhost/vhost.c:1340 vq_meta_prefetch+0xbc/0x280 drivers/vhost/vhost.c:1366 vhost_transport_do_send_pkt+0xe0/0xfd0 drivers/vhost/vsock.c:104 vhost_worker+0x23d/0x3d0 drivers/vhost/vhost.c:372 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Reported by syzbot at: https://syzkaller.appspot.com/bug?extid=0abd373e2e50d704db87 To fix this, do two things: 1. Return -EINVAL in vhost_chr_write_iter() when userspace asks to map a range with size 0. 2. Fix vhost_iotlb_add_range_ctx() to handle the range [0, ULONG_MAX] by splitting it into two entries. Fixes: 0bbe30668d89e ("vhost: factor out IOTLB") Reported-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com Tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Link: https://lore.kernel.org/r/20220305095525.5145-1-mail@anirudhrb.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/iotlb.c | 11 +++++++++++ drivers/vhost/vhost.c | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c index 670d56c879e5..40b098320b2a 100644 --- a/drivers/vhost/iotlb.c +++ b/drivers/vhost/iotlb.c @@ -57,6 +57,17 @@ int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, if (last < start) return -EFAULT; + /* If the range being mapped is [0, ULONG_MAX], split it into two entries + * otherwise its size would overflow u64. + */ + if (start == 0 && last == ULONG_MAX) { + u64 mid = last / 2; + + vhost_iotlb_add_range_ctx(iotlb, start, mid, addr, perm, opaque); + addr += mid + 1; + start = mid + 1; + } + if (iotlb->limit && iotlb->nmaps == iotlb->limit && iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) { diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 59edb5a1ffe2..55475fd59fb7 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1170,6 +1170,11 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, goto done; } + if (msg.size == 0) { + ret = -EINVAL; + goto done; + } + if (dev->msg_handler) ret = dev->msg_handler(dev, &msg); else From dacc73ed0b88f1a787ec20385f42ca9dd9eddcd0 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 4 Mar 2022 18:00:57 +0800 Subject: [PATCH 648/773] virtio-blk: Don't use MAX_DISCARD_SEGMENTS if max_discard_seg is zero Currently the value of max_discard_segment will be set to MAX_DISCARD_SEGMENTS (256) with no basis in hardware if device set 0 to max_discard_seg in configuration space. It's incorrect since the device might not be able to handle such large descriptors. To fix it, let's follow max_segments restrictions in this case. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20220304100058.116-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c443cd64fc9b..7fc2c8b97077 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -925,9 +925,15 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, &v); + + /* + * max_discard_seg == 0 is out of spec but we always + * handled it. + */ + if (!v) + v = sg_elems - 2; blk_queue_max_discard_segments(q, - min_not_zero(v, - MAX_DISCARD_SEGMENTS)); + min(v, MAX_DISCARD_SEGMENTS)); blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } From e030759a1ddcbf61d42b6e996bfeb675e0032d8b Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 4 Mar 2022 18:00:58 +0800 Subject: [PATCH 649/773] virtio-blk: Remove BUG_ON() in virtio_queue_rq() Currently we have a BUG_ON() to make sure the number of sg list does not exceed queue_max_segments() in virtio_queue_rq(). However, the block layer uses queue_max_discard_segments() instead of queue_max_segments() to limit the sg list for discard requests. So the BUG_ON() might be triggered if virtio-blk device reports a larger value for max discard segment than queue_max_segments(). To fix it, let's simply remove the BUG_ON() which has become unnecessary after commit 02746e26c39e("virtio-blk: avoid preallocating big SGL for data"). And the unused vblk->sg_elems can also be removed together. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Suggested-by: Christoph Hellwig Signed-off-by: Xie Yongji Reviewed-by: Max Gurtovoy Link: https://lore.kernel.org/r/20220304100058.116-2-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7fc2c8b97077..8c415be86732 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -76,9 +76,6 @@ struct virtio_blk { */ refcount_t refs; - /* What host tells us, plus 2 for header & tailer. */ - unsigned int sg_elems; - /* Ida index - used to track minor number allocations. */ int index; @@ -322,8 +319,6 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, blk_status_t status; int err; - BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); - status = virtblk_setup_cmd(vblk->vdev, req, vbr); if (unlikely(status)) return status; @@ -783,8 +778,6 @@ static int virtblk_probe(struct virtio_device *vdev) /* Prevent integer overflows and honor max vq size */ sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); - /* We need extra sg elements at head and tail. */ - sg_elems += 2; vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; @@ -796,7 +789,6 @@ static int virtblk_probe(struct virtio_device *vdev) mutex_init(&vblk->vdev_mutex); vblk->vdev = vdev; - vblk->sg_elems = sg_elems; INIT_WORK(&vblk->config_work, virtblk_config_changed_work); @@ -853,7 +845,7 @@ static int virtblk_probe(struct virtio_device *vdev) set_disk_ro(vblk->disk, 1); /* We can handle whatever the host told us to handle. */ - blk_queue_max_segments(q, vblk->sg_elems-2); + blk_queue_max_segments(q, sg_elems); /* No real sector limit. */ blk_queue_max_hw_sectors(q, -1U); @@ -931,7 +923,7 @@ static int virtblk_probe(struct virtio_device *vdev) * handled it. */ if (!v) - v = sg_elems - 2; + v = sg_elems; blk_queue_max_discard_segments(q, min(v, MAX_DISCARD_SEGMENTS)); From eb057b44dbe35ae14527830236a92f51de8f9184 Mon Sep 17 00:00:00 2001 From: Zhang Min Date: Tue, 1 Mar 2022 17:10:59 +0800 Subject: [PATCH 650/773] vdpa: fix use-after-free on vp_vdpa_remove When vp_vdpa driver is unbind, vp_vdpa is freed in vdpa_unregister_device and then vp_vdpa->mdev.pci_dev is dereferenced in vp_modern_remove, triggering use-after-free. Call Trace of unbinding driver free vp_vdpa : do_syscall_64 vfs_write kernfs_fop_write_iter device_release_driver_internal pci_device_remove vp_vdpa_remove vdpa_unregister_device kobject_release device_release kfree Call Trace of dereference vp_vdpa->mdev.pci_dev: vp_modern_remove pci_release_selected_regions pci_release_region pci_resource_len pci_resource_end (dev)->resource[(bar)].end Signed-off-by: Zhang Min Signed-off-by: Yi Wang Link: https://lore.kernel.org/r/20220301091059.46869-1-wang.yi59@zte.com.cn Signed-off-by: Michael S. Tsirkin Fixes: 64b9f64f80a6 ("vdpa: introduce virtio pci driver") Reviewed-by: Stefano Garzarella --- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index a57e381e830b..cce101e6a940 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -533,8 +533,8 @@ static void vp_vdpa_remove(struct pci_dev *pdev) { struct vp_vdpa *vp_vdpa = pci_get_drvdata(pdev); - vdpa_unregister_device(&vp_vdpa->vdpa); vp_modern_remove(&vp_vdpa->mdev); + vdpa_unregister_device(&vp_vdpa->vdpa); } static struct pci_driver vp_vdpa_driver = { From e7c552ec897894ec421867059e48474eb7f1ff6d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 25 Feb 2022 06:46:34 -0500 Subject: [PATCH 651/773] virtio: drop default for virtio-mem There's no special reason why virtio-mem needs a default that's different from what kconfig provides, any more than e.g. virtio blk. Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand --- drivers/virtio/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 34f80b7a8a64..492fc26f0b65 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -105,7 +105,6 @@ config VIRTIO_BALLOON config VIRTIO_MEM tristate "Virtio mem driver" - default m depends on X86_64 depends on VIRTIO depends on MEMORY_HOTPLUG From 4c8093637bc9f8cc2e41eed343c12f85d6ff9e25 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 13 Jan 2022 15:11:34 +0100 Subject: [PATCH 652/773] vhost: remove avail_event arg from vhost_update_avail_event() In vhost_update_avail_event() we never used the `avail_event` argument, since its introduction in commit 2723feaa8ec6 ("vhost: set log when updating used flags or avail event"). Let's remove it to clean up the code. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220113141134.186773-1-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 55475fd59fb7..082380c03a3e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1986,7 +1986,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) return 0; } -static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) +static int vhost_update_avail_event(struct vhost_virtqueue *vq) { if (vhost_put_avail_event(vq)) return -EFAULT; @@ -2532,7 +2532,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) return false; } } else { - r = vhost_update_avail_event(vq, vq->avail_idx); + r = vhost_update_avail_event(vq); if (r) { vq_err(vq, "Failed to update avail event index at %p: %d\n", vhost_avail_event(vq), r); From 32f1b53fe8f03d962423ba81f8e92af5839814da Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 18 Jan 2022 16:06:31 +0100 Subject: [PATCH 653/773] tools/virtio: fix virtio_test execution virtio_test hangs on __vring_new_virtqueue() because `vqs_list_lock` is not initialized. Let's initialize it in vdev_info_init(). Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220118150631.167015-1-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- tools/virtio/virtio_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index cb3f29c09aff..23f142af544a 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -130,6 +130,7 @@ static void vdev_info_init(struct vdev_info* dev, unsigned long long features) memset(dev, 0, sizeof *dev); dev->vdev.features = features; INIT_LIST_HEAD(&dev->vdev.vqs); + spin_lock_init(&dev->vdev.vqs_list_lock); dev->buf_size = 1024; dev->buf = malloc(dev->buf_size); assert(dev->buf); From 3dd7d135e75cb37c8501ba02977332a2a487dd39 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 4 Mar 2022 12:10:38 -0500 Subject: [PATCH 654/773] tools/virtio: handle fallout from folio work just add a stub Signed-off-by: Michael S. Tsirkin --- tools/virtio/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/virtio/linux/mm_types.h diff --git a/tools/virtio/linux/mm_types.h b/tools/virtio/linux/mm_types.h new file mode 100644 index 000000000000..356ba4d496f6 --- /dev/null +++ b/tools/virtio/linux/mm_types.h @@ -0,0 +1,3 @@ +struct folio { + struct page page; +}; From ffb217a13a2eaf6d5bd974fc83036a53ca69f1e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Mar 2022 14:28:31 -0800 Subject: [PATCH 655/773] Linux 5.17-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index daeb5c88b50b..87f672473523 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Superb Owl # *DOCUMENTATION* From 48015b632f770c401f3816f144499a39f2884677 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 11 Feb 2022 17:32:37 +1100 Subject: [PATCH 656/773] powerpc: Fix STACKTRACE=n build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our skiroot_defconfig doesn't enable FTRACE, and so doesn't get STACKTRACE enabled either. That leads to a build failure since commit 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") made stacktrace.c build even when STACKTRACE=n. arch/powerpc/kernel/stacktrace.c: In function ‘handle_backtrace_ipi’: arch/powerpc/kernel/stacktrace.c:171:2: error: implicit declaration of function ‘nmi_cpu_backtrace’ 171 | nmi_cpu_backtrace(regs); | ^~~~~~~~~~~~~~~~~ arch/powerpc/kernel/stacktrace.c: In function ‘arch_trigger_cpumask_backtrace’: arch/powerpc/kernel/stacktrace.c:226:2: error: implicit declaration of function ‘nmi_trigger_cpumask_backtrace’ 226 | nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This happens because our headers haven't defined arch_trigger_cpumask_backtrace, which causes lib/nmi_backtrace.c not to build nmi_cpu_backtrace(). The code in question doesn't actually depend on STACKTRACE=y, that was just added because arch_trigger_cpumask_backtrace() lived in stacktrace.c for convenience. So drop the dependency on CONFIG_STACKTRACE, that causes lib/nmi_backtrace.c to build nmi_cpu_backtrace() etc. and fixes the build. Fixes: 1614b2b11fab ("arch: Make ARCH_STACKWALK independent of STACKTRACE") [mpe: Cherry pick of 5a72345e6a78 from next into fixes] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220212111349.2806972-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 160abcb8e9fa..ea0e487f87b1 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -9,7 +9,7 @@ long soft_nmi_interrupt(struct pt_regs *regs); static inline void arch_touch_nmi_watchdog(void) {} #endif -#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE) +#ifdef CONFIG_NMI_IPI extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace From 3777ea7bac3113005b7180e6b9dadf16d19a5827 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 657/773] xen/xenbus: don't let xenbus_grant_ring() remove grants in error case Letting xenbus_grant_ring() tear down grants in the error case is problematic, as the other side could already have used these grants. Calling gnttab_end_foreign_access_ref() without checking success is resulting in an unclear situation for any caller of xenbus_grant_ring() as in the error case the memory pages of the ring page might be partially mapped. Freeing them would risk unwanted foreign access to them, while not freeing them would leak memory. In order to remove the need to undo any gnttab_grant_foreign_access() calls, use gnttab_alloc_grant_references() to make sure no further error can occur in the loop granting access to the ring pages. It should be noted that this way of handling removes leaking of grant entries in the error case, too. This is CVE-2022-23040 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- drivers/xen/xenbus/xenbus_client.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index e8bed1cb76ba..df6890681231 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -379,7 +379,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, unsigned int nr_pages, grant_ref_t *grefs) { int err; - int i, j; + unsigned int i; + grant_ref_t gref_head; + + err = gnttab_alloc_grant_references(nr_pages, &gref_head); + if (err) { + xenbus_dev_fatal(dev, err, "granting access to ring page"); + return err; + } for (i = 0; i < nr_pages; i++) { unsigned long gfn; @@ -389,23 +396,14 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, else gfn = virt_to_gfn(vaddr); - err = gnttab_grant_foreign_access(dev->otherend_id, gfn, 0); - if (err < 0) { - xenbus_dev_fatal(dev, err, - "granting access to ring page"); - goto fail; - } - grefs[i] = err; + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); vaddr = vaddr + XEN_PAGE_SIZE; } return 0; - -fail: - for (j = 0; j < i; j++) - gnttab_end_foreign_access_ref(grefs[j], 0); - return err; } EXPORT_SYMBOL_GPL(xenbus_grant_ring); From 6b1775f26a2da2b05a6dc8ec2b5d14e9a4701a1a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 658/773] xen/grant-table: add gnttab_try_end_foreign_access() Add a new grant table function gnttab_try_end_foreign_access(), which will remove and free a grant if it is not in use. Its main use case is to either free a grant if it is no longer in use, or to take some other action if it is still in use. This other action can be an error exit, or (e.g. in the case of blkfront persistent grant feature) some special handling. This is CVE-2022-23036, CVE-2022-23038 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - new patch V4: - add comments to header (Jan Beulich) --- drivers/xen/grant-table.c | 14 ++++++++++++-- include/xen/grant_table.h | 12 ++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3729bea0c989..1b82e7a3722a 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -435,11 +435,21 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, what, ref, page ? page_to_pfn(page) : -1); } +int gnttab_try_end_foreign_access(grant_ref_t ref) +{ + int ret = _gnttab_end_foreign_access_ref(ref, 0); + + if (ret) + put_free_entry(ref); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); + if (gnttab_try_end_foreign_access(ref)) { if (page != 0) put_page(virt_to_page(page)); } else diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index cb854df031ce..358d2817741b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -104,10 +104,22 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. + * Note that the granted page might still be accessed (read or write) by the + * other side after gnttab_end_foreign_access() returns, so even if page was + * specified as 0 it is not allowed to just reuse the page for other + * purposes immediately. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); +/* + * End access through the given grant reference, iff the grant entry is + * no longer in use. In case of success ending foreign access, the + * grant reference is deallocated. + * Return 1 if the grant entry was freed, 0 if it is still in use. + */ +int gnttab_try_end_foreign_access(grant_ref_t ref); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); From abf1fd5919d6238ee3bc5eb4a9b6c3947caa6638 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 659/773] xen/blkfront: don't use gnttab_query_foreign_access() for mapped status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_end_foreign_access_ref() and check the success of that operation instead. For the ring allocation use alloc_pages_exact() in order to avoid high order pages in case of a multi-page ring. If a grant wasn't unmapped by the backend without persistent grants being used, set the device state to "error". This is CVE-2022-23036 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Roger Pau Monné --- V2: - use gnttab_try_end_foreign_access() V4: - use alloc_pages_exact() and free_pages_exact() - set state to error if backend didn't unmap (Roger Pau Monné) --- drivers/block/xen-blkfront.c | 63 +++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ca71a0585333..03b5fb341e58 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1288,7 +1288,8 @@ free_shadow: rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); + free_pages_exact(rinfo->ring.sring, + info->nr_ring_pages * XEN_PAGE_SIZE); rinfo->ring.sring = NULL; if (rinfo->irq) @@ -1372,9 +1373,15 @@ static int blkif_get_final_status(enum blk_req_status s1, return BLKIF_RSP_OKAY; } -static bool blkif_completion(unsigned long *id, - struct blkfront_ring_info *rinfo, - struct blkif_response *bret) +/* + * Return values: + * 1 response processed. + * 0 missing further responses. + * -1 error while processing. + */ +static int blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, + struct blkif_response *bret) { int i = 0; struct scatterlist *sg; @@ -1397,7 +1404,7 @@ static bool blkif_completion(unsigned long *id, /* Wait the second response if not yet here. */ if (s2->status < REQ_DONE) - return false; + return 0; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1448,42 +1455,43 @@ static bool blkif_completion(unsigned long *id, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < num_grant; i++) { - if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the * backend has chosen to make this grant persistent) * we add it at the head of the list, so it will be * reused first. */ - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->grants_used[i]->gref); + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + return -1; + } list_add(&s->grants_used[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { /* - * If the grant is not mapped by the backend we end the - * foreign access and add it to the tail of the list, - * so it will not be picked again unless we run out of - * persistent grants. + * If the grant is not mapped by the backend we add it + * to the tail of the list, so it will not be picked + * again unless we run out of persistent grants. */ - gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { - if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { - if (!info->feature_persistent) - pr_alert_ratelimited("backed has not unmapped grant: %u\n", - s->indirect_grants[i]->gref); + if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) { + pr_alert("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + return -1; + } list_add(&s->indirect_grants[i]->node, &rinfo->grants); rinfo->persistent_gnts_c++; } else { struct page *indirect_page; - gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); /* * Add the used indirect page back to the list of * available pages for indirect grefs. @@ -1498,7 +1506,7 @@ static bool blkif_completion(unsigned long *id, } } - return true; + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1564,12 +1572,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } if (bret.operation != BLKIF_OP_DISCARD) { + int ret; + /* * We may need to wait for an extra response if the * I/O request is split in 2 */ - if (!blkif_completion(&id, rinfo, &bret)) + ret = blkif_completion(&id, rinfo, &bret); + if (!ret) continue; + if (unlikely(ret < 0)) + goto err; } if (add_id_to_freelist(rinfo, id)) { @@ -1676,8 +1689,7 @@ static int setup_blkring(struct xenbus_device *dev, for (i = 0; i < info->nr_ring_pages; i++) rinfo->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, - get_order(ring_size)); + sring = alloc_pages_exact(ring_size, GFP_NOIO); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; @@ -1687,7 +1699,7 @@ static int setup_blkring(struct xenbus_device *dev, err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_pages((unsigned long)sring, get_order(ring_size)); + free_pages_exact(sring, ring_size); rinfo->ring.sring = NULL; goto fail; } @@ -2532,11 +2544,10 @@ static void purge_persistent_grants(struct blkfront_info *info) list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, node) { if (gnt_list_entry->gref == GRANT_INVALID_REF || - gnttab_query_foreign_access(gnt_list_entry->gref)) + !gnttab_try_end_foreign_access(gnt_list_entry->gref)) continue; list_del(&gnt_list_entry->node); - gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; gnt_list_entry->gref = GRANT_INVALID_REF; list_add_tail(&gnt_list_entry->node, &rinfo->grants); From 31185df7e2b1d2fa1de4900247a12d7b9c7087eb Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 660/773] xen/netfront: don't use gnttab_query_foreign_access() for mapped status It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_end_foreign_access_ref() and check the success of that operation instead. This is CVE-2022-23037 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - use gnttab_try_end_foreign_access() V3: - don't use gnttab_try_end_foreign_access() --- drivers/net/xen-netfront.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 7748f07e2cf1..13926d03418e 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -424,14 +424,12 @@ static bool xennet_tx_buf_gc(struct netfront_queue *queue) queue->tx_link[id] = TX_LINK_NONE; skb = queue->tx_skbs[id]; queue->tx_skbs[id] = NULL; - if (unlikely(gnttab_query_foreign_access( - queue->grant_tx_ref[id]) != 0)) { + if (unlikely(!gnttab_end_foreign_access_ref( + queue->grant_tx_ref[id], GNTMAP_readonly))) { dev_alert(dev, "Grant still in use by backend domain\n"); goto err; } - gnttab_end_foreign_access_ref( - queue->grant_tx_ref[id], GNTMAP_readonly); gnttab_release_grant_reference( &queue->gref_tx_head, queue->grant_tx_ref[id]); queue->grant_tx_ref[id] = GRANT_INVALID_REF; From 33172ab50a53578a95691310f49567c9266968b0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 661/773] xen/scsifront: don't use gnttab_query_foreign_access() for mapped status It isn't enough to check whether a grant is still being in use by calling gnttab_query_foreign_access(), as a mapping could be realized by the other side just after having called that function. In case the call was done in preparation of revoking a grant it is better to do so via gnttab_try_end_foreign_access() and check the success of that operation instead. This is CVE-2022-23038 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - use gnttab_try_end_foreign_access() --- drivers/scsi/xen-scsifront.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 12c10a5e3d93..7f421600cb66 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -233,12 +233,11 @@ static void scsifront_gnttab_done(struct vscsifrnt_info *info, return; for (i = 0; i < shadow->nr_grants; i++) { - if (unlikely(gnttab_query_foreign_access(shadow->gref[i]))) { + if (unlikely(!gnttab_try_end_foreign_access(shadow->gref[i]))) { shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME "grant still in use by backend\n"); BUG(); } - gnttab_end_foreign_access(shadow->gref[i], 0, 0UL); } kfree(shadow->sg); From d3b6372c5881cb54925212abb62c521df8ba4809 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 662/773] xen/gntalloc: don't use gnttab_query_foreign_access() Using gnttab_query_foreign_access() is unsafe, as it is racy by design. The use case in the gntalloc driver is not needed at all. While at it replace the call of gnttab_end_foreign_access_ref() with a call of gnttab_end_foreign_access(), which is what is really wanted there. In case the grant wasn't used due to an allocation failure, just free the grant via gnttab_free_grant_reference(). This is CVE-2022-23039 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V3: - fix __del_gref() (Jan Beulich) --- drivers/xen/gntalloc.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 3fa40c723e8e..edb0acd0b832 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -169,20 +169,14 @@ undo: __del_gref(gref); } - /* It's possible for the target domain to map the just-allocated grant - * references by blindly guessing their IDs; if this is done, then - * __del_gref will leave them in the queue_gref list. They need to be - * added to the global list so that we can free them when they are no - * longer referenced. - */ - if (unlikely(!list_empty(&queue_gref))) - list_splice_tail(&queue_gref, &gref_list); mutex_unlock(&gref_mutex); return rc; } static void __del_gref(struct gntalloc_gref *gref) { + unsigned long addr; + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { uint8_t *tmp = kmap(gref->page); tmp[gref->notify.pgoff] = 0; @@ -196,21 +190,16 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; if (gref->gref_id) { - if (gnttab_query_foreign_access(gref->gref_id)) - return; - - if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) - return; - - gnttab_free_grant_reference(gref->gref_id); + if (gref->page) { + addr = (unsigned long)page_to_virt(gref->page); + gnttab_end_foreign_access(gref->gref_id, 0, addr); + } else + gnttab_free_grant_reference(gref->gref_id); } gref_size--; list_del(&gref->next_gref); - if (gref->page) - __free_page(gref->page); - kfree(gref); } From 1dbd11ca75fe664d3e54607547771d021f531f59 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:54 +0100 Subject: [PATCH 663/773] xen: remove gnttab_query_foreign_access() Remove gnttab_query_foreign_access(), as it is unused and unsafe to use. All previous use cases assumed a grant would not be in use after gnttab_query_foreign_access() returned 0. This information is useless in best case, as it only refers to a situation in the past, which could have changed already. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- drivers/xen/grant-table.c | 25 ------------------------- include/xen/grant_table.h | 2 -- 2 files changed, 27 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1b82e7a3722a..e6548910e79f 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -133,13 +133,6 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); - /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. - * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. - */ - int (*query_foreign_access)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -284,22 +277,6 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & (GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; @@ -1427,7 +1404,6 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1439,7 +1415,6 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 358d2817741b..ab9e692a0ef4 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -125,8 +125,6 @@ int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); -int gnttab_query_foreign_access(grant_ref_t ref); - /* * operations on reserved batches of grant references */ From cd7bcfab4e73dcb3de92c2014c19f17af3864bfe Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: [PATCH 664/773] xen/usb: don't use gnttab_end_foreign_access() in xenhcd_gnttab_done() The usage of gnttab_end_foreign_access() in xenhcd_gnttab_done() is not safe against a malicious backend, as the backend could keep the I/O page mapped and modify it even after the granted memory page is being used for completely other purposes in the local system. So replace that use case with gnttab_try_end_foreign_access() and disable the PV host adapter in case the backend didn't stop using the granted page. In xenhcd_urb_request_done() immediately return in case of setting the device state to "error" instead of looking into further backend responses. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - use gnttab_try_end_foreign_access() --- drivers/usb/host/xen-hcd.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/usb/host/xen-hcd.c b/drivers/usb/host/xen-hcd.c index be09fd9bac58..19b8c7ed74cb 100644 --- a/drivers/usb/host/xen-hcd.c +++ b/drivers/usb/host/xen-hcd.c @@ -716,8 +716,9 @@ static int xenhcd_map_urb_for_request(struct xenhcd_info *info, struct urb *urb, return 0; } -static void xenhcd_gnttab_done(struct usb_shadow *shadow) +static void xenhcd_gnttab_done(struct xenhcd_info *info, unsigned int id) { + struct usb_shadow *shadow = info->shadow + id; int nr_segs = 0; int i; @@ -726,8 +727,10 @@ static void xenhcd_gnttab_done(struct usb_shadow *shadow) if (xenusb_pipeisoc(shadow->req.pipe)) nr_segs += shadow->req.u.isoc.nr_frame_desc_segs; - for (i = 0; i < nr_segs; i++) - gnttab_end_foreign_access(shadow->req.seg[i].gref, 0, 0UL); + for (i = 0; i < nr_segs; i++) { + if (!gnttab_try_end_foreign_access(shadow->req.seg[i].gref)) + xenhcd_set_error(info, "backend didn't release grant"); + } shadow->req.nr_buffer_segs = 0; shadow->req.u.isoc.nr_frame_desc_segs = 0; @@ -841,7 +844,9 @@ static void xenhcd_cancel_all_enqueued_urbs(struct xenhcd_info *info) list_for_each_entry_safe(urbp, tmp, &info->in_progress_list, list) { req_id = urbp->req_id; if (!urbp->unlinked) { - xenhcd_gnttab_done(&info->shadow[req_id]); + xenhcd_gnttab_done(info, req_id); + if (info->error) + return; if (urbp->urb->status == -EINPROGRESS) /* not dequeued */ xenhcd_giveback_urb(info, urbp->urb, @@ -942,8 +947,7 @@ static int xenhcd_urb_request_done(struct xenhcd_info *info) rp = info->urb_ring.sring->rsp_prod; if (RING_RESPONSE_PROD_OVERFLOW(&info->urb_ring, rp)) { xenhcd_set_error(info, "Illegal index on urb-ring"); - spin_unlock_irqrestore(&info->lock, flags); - return 0; + goto err; } rmb(); /* ensure we see queued responses up to "rp" */ @@ -952,11 +956,13 @@ static int xenhcd_urb_request_done(struct xenhcd_info *info) id = res.id; if (id >= XENUSB_URB_RING_SIZE) { xenhcd_set_error(info, "Illegal data on urb-ring"); - continue; + goto err; } if (likely(xenusb_pipesubmit(info->shadow[id].req.pipe))) { - xenhcd_gnttab_done(&info->shadow[id]); + xenhcd_gnttab_done(info, id); + if (info->error) + goto err; urb = info->shadow[id].urb; if (likely(urb)) { urb->actual_length = res.actual_length; @@ -978,6 +984,10 @@ static int xenhcd_urb_request_done(struct xenhcd_info *info) spin_unlock_irqrestore(&info->lock, flags); return more_to_do; + + err: + spin_unlock_irqrestore(&info->lock, flags); + return 0; } static int xenhcd_conn_notify(struct xenhcd_info *info) From 5cadd4bb1d7fc9ab201ac14620d1a478357e4ebd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: [PATCH 665/773] xen/9p: use alloc/free_pages_exact() Instead of __get_free_pages() and free_pages() use alloc_pages_exact() and free_pages_exact(). This is in preparation of a change of gnttab_end_foreign_access() which will prohibit use of high-order pages. By using the local variable "order" instead of ring->intf->ring_order in the error path of xen_9pfs_front_alloc_dataring() another bug is fixed, as the error path can be entered before ring->intf->ring_order is being set. By using alloc_pages_exact() the size in bytes is specified for the allocation, which fixes another bug for the case of order < (PAGE_SHIFT - XEN_PAGE_SHIFT). This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - new patch --- net/9p/trans_xen.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index eb9fb55280ef..01f8067994d6 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -281,9 +281,9 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) ref = priv->rings[i].intf->ref[j]; gnttab_end_foreign_access(ref, 0, 0); } - free_pages((unsigned long)priv->rings[i].data.in, - priv->rings[i].intf->ring_order - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(priv->rings[i].data.in, + 1UL << (priv->rings[i].intf->ring_order + + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); free_page((unsigned long)priv->rings[i].intf); @@ -322,8 +322,8 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, if (ret < 0) goto out; ring->ref = ret; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - order - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT), + GFP_KERNEL | __GFP_ZERO); if (!bytes) { ret = -ENOMEM; goto out; @@ -354,9 +354,7 @@ out: if (bytes) { for (i--; i >= 0; i--) gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); - free_pages((unsigned long)bytes, - ring->intf->ring_order - - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(ring->ref, 0, 0); free_page((unsigned long)ring->intf); From b0576cc9c6b843d99c6982888d59a56209341888 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: [PATCH 666/773] xen/pvcalls: use alloc/free_pages_exact() Instead of __get_free_pages() and free_pages() use alloc_pages_exact() and free_pages_exact(). This is in preparation of a change of gnttab_end_foreign_access() which will prohibit use of high-order pages. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - new patch --- drivers/xen/pvcalls-front.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 3c9ae156b597..0ca351f30a6d 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -337,8 +337,8 @@ static void free_active_ring(struct sock_mapping *map) if (!map->active.ring) return; - free_pages((unsigned long)map->active.data.in, - map->active.ring->ring_order); + free_pages_exact(map->active.data.in, + PAGE_SIZE << map->active.ring->ring_order); free_page((unsigned long)map->active.ring); } @@ -352,8 +352,8 @@ static int alloc_active_ring(struct sock_mapping *map) goto out; map->active.ring->ring_order = PVCALLS_RING_ORDER; - bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PVCALLS_RING_ORDER); + bytes = alloc_pages_exact(PAGE_SIZE << PVCALLS_RING_ORDER, + GFP_KERNEL | __GFP_ZERO); if (!bytes) goto out; From 42baefac638f06314298087394b982ead9ec444b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: [PATCH 667/773] xen/gnttab: fix gnttab_end_foreign_access() without page specified gnttab_end_foreign_access() is used to free a grant reference and optionally to free the associated page. In case the grant is still in use by the other side processing is being deferred. This leads to a problem in case no page to be freed is specified by the caller: the caller doesn't know that the page is still mapped by the other side and thus should not be used for other purposes. The correct way to handle this situation is to take an additional reference to the granted page in case handling is being deferred and to drop that reference when the grant reference could be freed finally. This requires that there are no users of gnttab_end_foreign_access() left directly repurposing the granted page after the call, as this might result in clobbered data or information leaks via the not yet freed grant reference. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V4: - expand comment in header V5: - get page ref in case of kmalloc() failure, too --- drivers/xen/grant-table.c | 36 +++++++++++++++++++++++++++++------- include/xen/grant_table.h | 7 ++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index e6548910e79f..5c83d41766c8 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -133,6 +133,10 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + /* + * Read the frame number related to a given grant reference. + */ + unsigned long (*read_frame)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -330,6 +334,16 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; @@ -359,12 +373,9 @@ static void gnttab_handle_deferred(struct timer_list *unused) spin_unlock_irqrestore(&gnttab_list_lock, flags); if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - put_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + put_page(entry->page); kfree(entry); entry = NULL; } else { @@ -389,9 +400,18 @@ static void gnttab_handle_deferred(struct timer_list *unused) static void gnttab_add_deferred(grant_ref_t ref, bool readonly, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; const char *what = KERN_WARNING "leaking"; + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } + if (entry) { unsigned long flags; @@ -1404,6 +1424,7 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .read_frame = gnttab_read_frame_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1415,6 +1436,7 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .read_frame = gnttab_read_frame_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index ab9e692a0ef4..c9fea9389ebe 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -107,7 +107,12 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * Note that the granted page might still be accessed (read or write) by the * other side after gnttab_end_foreign_access() returns, so even if page was * specified as 0 it is not allowed to just reuse the page for other - * purposes immediately. + * purposes immediately. gnttab_end_foreign_access() will take an additional + * reference to the granted page in this case, which is dropped only after + * the grant is no longer in use. + * This requires that multi page allocations for areas subject to + * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing + * via free_pages_exact()) in order to avoid high order pages. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); From 66e3531b33ee51dad17c463b4d9c9f52e341503d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 7 Mar 2022 09:48:55 +0100 Subject: [PATCH 668/773] xen/netfront: react properly to failing gnttab_end_foreign_access_ref() When calling gnttab_end_foreign_access_ref() the returned value must be tested and the reaction to that value should be appropriate. In case of failure in xennet_get_responses() the reaction should not be to crash the system, but to disable the network device. The calls in setup_netfront() can be replaced by calls of gnttab_end_foreign_access(). While at it avoid double free of ring pages and grant references via xennet_disconnect_backend() in this case. This is CVE-2022-23042 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - avoid double free V3: - remove pointless initializer (Jan Beulich) --- drivers/net/xen-netfront.c | 48 ++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 13926d03418e..daa4e6106aac 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -988,7 +988,6 @@ static int xennet_get_responses(struct netfront_queue *queue, struct device *dev = &queue->info->netdev->dev; struct bpf_prog *xdp_prog; struct xdp_buff xdp; - unsigned long ret; int slots = 1; int err = 0; u32 verdict; @@ -1030,8 +1029,13 @@ static int xennet_get_responses(struct netfront_queue *queue, goto next; } - ret = gnttab_end_foreign_access_ref(ref, 0); - BUG_ON(!ret); + if (!gnttab_end_foreign_access_ref(ref, 0)) { + dev_alert(dev, + "Grant still in use by backend domain\n"); + queue->info->broken = true; + dev_alert(dev, "Disabled for further use\n"); + return -EINVAL; + } gnttab_release_grant_reference(&queue->gref_rx_head, ref); @@ -1252,6 +1256,10 @@ static int xennet_poll(struct napi_struct *napi, int budget) &need_xdp_flush); if (unlikely(err)) { + if (queue->info->broken) { + spin_unlock(&queue->rx_lock); + return 0; + } err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); @@ -1916,7 +1924,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_queue *queue, unsigned int feature_split_evtchn) { struct xen_netif_tx_sring *txs; - struct xen_netif_rx_sring *rxs; + struct xen_netif_rx_sring *rxs = NULL; grant_ref_t gref; int err; @@ -1936,21 +1944,21 @@ static int setup_netfront(struct xenbus_device *dev, err = xenbus_grant_ring(dev, txs, 1, &gref); if (err < 0) - goto grant_tx_ring_fail; + goto fail; queue->tx_ring_ref = gref; rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); if (!rxs) { err = -ENOMEM; xenbus_dev_fatal(dev, err, "allocating rx ring page"); - goto alloc_rx_ring_fail; + goto fail; } SHARED_RING_INIT(rxs); FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE); err = xenbus_grant_ring(dev, rxs, 1, &gref); if (err < 0) - goto grant_rx_ring_fail; + goto fail; queue->rx_ring_ref = gref; if (feature_split_evtchn) @@ -1963,22 +1971,28 @@ static int setup_netfront(struct xenbus_device *dev, err = setup_netfront_single(queue); if (err) - goto alloc_evtchn_fail; + goto fail; return 0; /* If we fail to setup netfront, it is safe to just revoke access to * granted pages because backend is not accessing it at this point. */ -alloc_evtchn_fail: - gnttab_end_foreign_access_ref(queue->rx_ring_ref, 0); -grant_rx_ring_fail: - free_page((unsigned long)rxs); -alloc_rx_ring_fail: - gnttab_end_foreign_access_ref(queue->tx_ring_ref, 0); -grant_tx_ring_fail: - free_page((unsigned long)txs); -fail: + fail: + if (queue->rx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->rx_ring_ref, 0, + (unsigned long)rxs); + queue->rx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)rxs); + } + if (queue->tx_ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(queue->tx_ring_ref, 0, + (unsigned long)txs); + queue->tx_ring_ref = GRANT_INVALID_REF; + } else { + free_page((unsigned long)txs); + } return err; } From 1760fdb6fe9f796fbdb9b4106b3e0bbacc16b55c Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 4 Mar 2022 11:56:56 +0100 Subject: [PATCH 669/773] mmc: core: Restore (almost) the busy polling for MMC_SEND_OP_COND Commit 76bfc7ccc2fa ("mmc: core: adjust polling interval for CMD1"), significantly decreased the polling period from ~10-12ms into just a couple of us. The purpose was to decrease the total time spent in the busy polling loop, but unfortunate it has lead to problems, that causes eMMC cards to never gets out busy and thus fails to be initialized. To fix the problem, but also to try to keep some of the new improved behaviour, let's start by using a polling period of 1-2ms, which then increases for each loop, according to common polling loop in __mmc_poll_for_busy(). Reported-by: Jean Rene Dawin Reported-by: H. Nikolaus Schaller Cc: Huijin Park Fixes: 76bfc7ccc2fa ("mmc: core: adjust polling interval for CMD1") Signed-off-by: Ulf Hansson Tested-by: Jean Rene Dawin Tested-by: H. Nikolaus Schaller Link: https://lore.kernel.org/r/20220304105656.149281-1-ulf.hansson@linaro.org --- drivers/mmc/core/block.c | 2 +- drivers/mmc/core/mmc.c | 2 +- drivers/mmc/core/mmc_ops.c | 13 +++++++++---- drivers/mmc/core/mmc_ops.h | 3 ++- drivers/mmc/core/sd.c | 2 +- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 8d718aa56d33..4e67c1403cc9 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1908,7 +1908,7 @@ static int mmc_blk_card_busy(struct mmc_card *card, struct request *req) cb_data.card = card; cb_data.status = 0; - err = __mmc_poll_for_busy(card->host, MMC_BLK_TIMEOUT_MS, + err = __mmc_poll_for_busy(card->host, 0, MMC_BLK_TIMEOUT_MS, &mmc_blk_busy_cb, &cb_data); /* diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index bbbbcaf70a59..8421519c2a98 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1962,7 +1962,7 @@ static int mmc_sleep(struct mmc_host *host) goto out_release; } - err = __mmc_poll_for_busy(host, timeout_ms, &mmc_sleep_busy_cb, host); + err = __mmc_poll_for_busy(host, 0, timeout_ms, &mmc_sleep_busy_cb, host); out_release: mmc_retune_release(host); diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index d63d1c735335..180d7e9d3400 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -21,6 +21,8 @@ #define MMC_BKOPS_TIMEOUT_MS (120 * 1000) /* 120s */ #define MMC_SANITIZE_TIMEOUT_MS (240 * 1000) /* 240s */ +#define MMC_OP_COND_PERIOD_US (1 * 1000) /* 1ms */ +#define MMC_OP_COND_TIMEOUT_MS 1000 /* 1s */ static const u8 tuning_blk_pattern_4bit[] = { 0xff, 0x0f, 0xff, 0x00, 0xff, 0xcc, 0xc3, 0xcc, @@ -232,7 +234,9 @@ int mmc_send_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr) cmd.arg = mmc_host_is_spi(host) ? 0 : ocr; cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R3 | MMC_CMD_BCR; - err = __mmc_poll_for_busy(host, 1000, &__mmc_send_op_cond_cb, &cb_data); + err = __mmc_poll_for_busy(host, MMC_OP_COND_PERIOD_US, + MMC_OP_COND_TIMEOUT_MS, + &__mmc_send_op_cond_cb, &cb_data); if (err) return err; @@ -495,13 +499,14 @@ static int mmc_busy_cb(void *cb_data, bool *busy) return 0; } -int __mmc_poll_for_busy(struct mmc_host *host, unsigned int timeout_ms, +int __mmc_poll_for_busy(struct mmc_host *host, unsigned int period_us, + unsigned int timeout_ms, int (*busy_cb)(void *cb_data, bool *busy), void *cb_data) { int err; unsigned long timeout; - unsigned int udelay = 32, udelay_max = 32768; + unsigned int udelay = period_us ? period_us : 32, udelay_max = 32768; bool expired = false; bool busy = false; @@ -546,7 +551,7 @@ int mmc_poll_for_busy(struct mmc_card *card, unsigned int timeout_ms, cb_data.retry_crc_err = retry_crc_err; cb_data.busy_cmd = busy_cmd; - return __mmc_poll_for_busy(host, timeout_ms, &mmc_busy_cb, &cb_data); + return __mmc_poll_for_busy(host, 0, timeout_ms, &mmc_busy_cb, &cb_data); } EXPORT_SYMBOL_GPL(mmc_poll_for_busy); diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h index 9c813b851d0b..09ffbc00908b 100644 --- a/drivers/mmc/core/mmc_ops.h +++ b/drivers/mmc/core/mmc_ops.h @@ -41,7 +41,8 @@ int mmc_can_ext_csd(struct mmc_card *card); int mmc_switch_status(struct mmc_card *card, bool crc_err_fatal); bool mmc_prepare_busy_cmd(struct mmc_host *host, struct mmc_command *cmd, unsigned int timeout_ms); -int __mmc_poll_for_busy(struct mmc_host *host, unsigned int timeout_ms, +int __mmc_poll_for_busy(struct mmc_host *host, unsigned int period_us, + unsigned int timeout_ms, int (*busy_cb)(void *cb_data, bool *busy), void *cb_data); int mmc_poll_for_busy(struct mmc_card *card, unsigned int timeout_ms, diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index bd87012c220c..bfbfed30dc4d 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -1672,7 +1672,7 @@ static int sd_poweroff_notify(struct mmc_card *card) cb_data.card = card; cb_data.reg_buf = reg_buf; - err = __mmc_poll_for_busy(card->host, SD_POWEROFF_NOTIFY_TIMEOUT_MS, + err = __mmc_poll_for_busy(card->host, 0, SD_POWEROFF_NOTIFY_TIMEOUT_MS, &sd_busy_poweroff_notify_cb, &cb_data); out: From d0aeb0d4a3f7d2a0df7e9545892bbeede8f2ac7e Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 5 Mar 2022 00:58:16 -0800 Subject: [PATCH 670/773] isdn: hfcpci: check the return value of dma_set_mask() in setup_hw() The function dma_set_mask() in setup_hw() can fail, so its return value should be checked. Fixes: 1700fe1a10dc ("Add mISDN HFC PCI driver") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/hfcpci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index bd087cca1c1d..af17459c1a5c 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -2005,7 +2005,11 @@ setup_hw(struct hfc_pci *hc) } /* Allocate memory for FIFOS */ /* the memory needs to be on a 32k boundary within the first 4G */ - dma_set_mask(&hc->pdev->dev, 0xFFFF8000); + if (dma_set_mask(&hc->pdev->dev, 0xFFFF8000)) { + printk(KERN_WARNING + "HFC-PCI: No usable DMA configuration!\n"); + return -EIO; + } buffer = dma_alloc_coherent(&hc->pdev->dev, 0x8000, &hc->hw.dmahandle, GFP_KERNEL); /* We silently assume the address is okay if nonzero */ From e0058f0fa80f6e09c4d363779c241c45a3c56b94 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 5 Mar 2022 01:14:11 -0800 Subject: [PATCH 671/773] net: qlogic: check the return value of dma_alloc_coherent() in qed_vf_hw_prepare() The function dma_alloc_coherent() in qed_vf_hw_prepare() can fail, so its return value should be checked. Fixes: 1408cc1fa48c ("qed: Introduce VFs") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_vf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 597cd9cd57b5..7b0e390c0b07 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -513,6 +513,9 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) p_iov->bulletin.size, &p_iov->bulletin.phys, GFP_KERNEL); + if (!p_iov->bulletin.p_virt) + goto free_pf2vf_reply; + DP_VERBOSE(p_hwfn, QED_MSG_IOV, "VF's bulletin Board [%p virt 0x%llx phys 0x%08x bytes]\n", p_iov->bulletin.p_virt, @@ -552,6 +555,10 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) return rc; +free_pf2vf_reply: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union pfvf_tlvs), + p_iov->pf2vf_reply, p_iov->pf2vf_reply_phys); free_vf2pf_request: dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(union vfpf_tlvs), From dd830aed23c6e07cd8e2a163742bf3d63c9add08 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Sat, 5 Mar 2022 12:20:39 +0100 Subject: [PATCH 672/773] net: lantiq_xrx200: fix use after free bug The skb->len field is read after the packet is sent to the network stack. In the meantime, skb can be freed. This patch fixes this bug. Fixes: c3e6b2c35b34 ("net: lantiq_xrx200: add ingress SG DMA support") Reported-by: Eric Dumazet Signed-off-by: Aleksander Jan Bajkowski Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/ethernet/lantiq_xrx200.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/lantiq_xrx200.c b/drivers/net/ethernet/lantiq_xrx200.c index 41d11137cde0..5712c3e94be8 100644 --- a/drivers/net/ethernet/lantiq_xrx200.c +++ b/drivers/net/ethernet/lantiq_xrx200.c @@ -260,9 +260,9 @@ static int xrx200_hw_receive(struct xrx200_chan *ch) if (ctl & LTQ_DMA_EOP) { ch->skb_head->protocol = eth_type_trans(ch->skb_head, net_dev); - netif_receive_skb(ch->skb_head); net_dev->stats.rx_packets++; net_dev->stats.rx_bytes += ch->skb_head->len; + netif_receive_skb(ch->skb_head); ch->skb_head = NULL; ch->skb_tail = NULL; ret = XRX200_DMA_PACKET_COMPLETE; From bb77bd31c281f70ec77c9c4f584950a779e05cf8 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sat, 5 Mar 2022 14:55:04 +0000 Subject: [PATCH 673/773] ethernet: sun: Free the coherent when failing in probing When the driver fails to register net device, it should free the DMA region first, and then do other cleanup. Signed-off-by: Zheyu Ma Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sunhme.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index ad9029ae6848..77e5dffb558f 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -3146,7 +3146,7 @@ static int happy_meal_pci_probe(struct pci_dev *pdev, if (err) { printk(KERN_ERR "happymeal(PCI): Cannot register net device, " "aborting.\n"); - goto err_out_iounmap; + goto err_out_free_coherent; } pci_set_drvdata(pdev, hp); @@ -3179,6 +3179,10 @@ static int happy_meal_pci_probe(struct pci_dev *pdev, return 0; +err_out_free_coherent: + dma_free_coherent(hp->dma_dev, PAGE_SIZE, + hp->happy_block, hp->hblock_dvma); + err_out_iounmap: iounmap(hp->gregs); From ebe48d368e97d007bfeb76fcb065d6cfc4c96645 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: [PATCH 674/773] esp: Fix possible buffer overflow in ESP transformation The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 ++ net/ipv4/esp4.c | 5 +++++ net/ipv6/esp6.c | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..90cd02ff77ef 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e1b1d080e908..70e6c87fbe3d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,6 +446,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -455,6 +456,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7591160edce1..b0ffbcd5432d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -482,6 +482,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -490,6 +491,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; From 053c8fdf2c930efdff5496960842bbb5c34ad43a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:40 +0100 Subject: [PATCH 675/773] esp: Fix BEET mode inter address family tunneling on GSO The xfrm{4,6}_beet_gso_segment() functions did not correctly set the SKB_GSO_IPXIP4 and SKB_GSO_IPXIP6 gso types for the address family tunneling case. Fix this by setting these gso types. Fixes: 384a46ea7bdc7 ("esp4: add gso_segment for esp4 beet mode") Fixes: 7f9e40eb18a99 ("esp6: add gso_segment for esp6 beet mode") Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index d87f02a6e934..146d4d54830c 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -160,6 +160,9 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } + if (proto == IPPROTO_IPV6) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index ba5e81cd569c..e61172d50817 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -199,6 +199,9 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, ipv6_skip_exthdr(skb, 0, &proto, &frag); } + if (proto == IPPROTO_IPIP) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) From 23c7f8d7989e1646aac82f75761b7648c355cb8a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:41 +0100 Subject: [PATCH 676/773] net: Fix esp GSO on inter address family tunnels. The esp tunnel GSO handlers use skb_mac_gso_segment to push the inner packet to the segmentation handlers. However, skb_mac_gso_segment takes the Ethernet Protocol ID from 'skb->protocol' which is wrong for inter address family tunnels. We fix this by introducing a new skb_eth_gso_segment function. This function can be used if it is necessary to pass the Ethernet Protocol ID directly to the segmentation handler. First users of this function will be the esp4 and esp6 tunnel segmentation handlers. Fixes: c35fe4106b92 ("xfrm: Add mode handlers for IPsec on layer 2") Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 2 ++ net/core/gro.c | 25 +++++++++++++++++++++++++ net/ipv4/esp4_offload.c | 3 +-- net/ipv6/esp6_offload.c | 3 +-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8b5a314db167..f53ea7038441 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4602,6 +4602,8 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/net/core/gro.c b/net/core/gro.c index a11b286d1495..b7d2b0dc59a2 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -92,6 +92,31 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 146d4d54830c..935026f4c807 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -110,8 +110,7 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IP)); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index e61172d50817..3a293838a91d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -145,8 +145,7 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6)); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, From d9dc0c84ad2d4cc911ba252c973d1bf18d5eb9cf Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 5 Mar 2022 07:06:42 -0800 Subject: [PATCH 677/773] qed: return status of qed_iov_get_link Clang static analysis reports this issue qed_sriov.c:4727:19: warning: Assigned value is garbage or undefined ivi->max_tx_rate = tx_rate ? tx_rate : link.speed; ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ link is only sometimes set by the call to qed_iov_get_link() qed_iov_get_link fails without setting link or returning status. So change the decl to return status. Fixes: 73390ac9d82b ("qed*: support ndo_get_vf_config") Signed-off-by: Tom Rix Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 8ac38828ba45..48cf4355bc47 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3806,11 +3806,11 @@ bool qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *p_disabled_vfs) return found; } -static void qed_iov_get_link(struct qed_hwfn *p_hwfn, - u16 vfid, - struct qed_mcp_link_params *p_params, - struct qed_mcp_link_state *p_link, - struct qed_mcp_link_capabilities *p_caps) +static int qed_iov_get_link(struct qed_hwfn *p_hwfn, + u16 vfid, + struct qed_mcp_link_params *p_params, + struct qed_mcp_link_state *p_link, + struct qed_mcp_link_capabilities *p_caps) { struct qed_vf_info *p_vf = qed_iov_get_vf_info(p_hwfn, vfid, @@ -3818,7 +3818,7 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn, struct qed_bulletin_content *p_bulletin; if (!p_vf) - return; + return -EINVAL; p_bulletin = p_vf->bulletin.p_virt; @@ -3828,6 +3828,7 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn, __qed_vf_get_link_state(p_hwfn, p_link, p_bulletin); if (p_caps) __qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin); + return 0; } static int @@ -4686,6 +4687,7 @@ static int qed_get_vf_config(struct qed_dev *cdev, struct qed_public_vf_info *vf_info; struct qed_mcp_link_state link; u32 tx_rate; + int ret; /* Sanitize request */ if (IS_VF(cdev)) @@ -4699,7 +4701,9 @@ static int qed_get_vf_config(struct qed_dev *cdev, vf_info = qed_iov_get_public_vf_info(hwfn, vf_id, true); - qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL); + ret = qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL); + if (ret) + return ret; /* Fill information about VF */ ivi->vf = vf_id; From c70c453abcbf3ecbaadd4c3236a5119b8da365cf Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 5 Mar 2022 17:47:20 -0300 Subject: [PATCH 678/773] smsc95xx: Ignore -ENODEV errors when device is unplugged According to Documentation/driver-api/usb/URB.rst when a device is unplugged usb_submit_urb() returns -ENODEV. This error code propagates all the way up to usbnet_read_cmd() and usbnet_write_cmd() calls inside the smsc95xx.c driver during Ethernet cable unplug, unbind or reboot. This causes the following errors to be shown on reboot, for example: ci_hdrc ci_hdrc.1: remove, state 1 usb usb2: USB disconnect, device number 1 usb 2-1: USB disconnect, device number 2 usb 2-1.1: USB disconnect, device number 3 smsc95xx 2-1.1:1.0 eth1: unregister 'smsc95xx' usb-ci_hdrc.1-1.1, smsc95xx USB 2.0 Ethernet smsc95xx 2-1.1:1.0 eth1: Failed to read reg index 0x00000114: -19 smsc95xx 2-1.1:1.0 eth1: Error reading MII_ACCESS smsc95xx 2-1.1:1.0 eth1: __smsc95xx_mdio_read: MII is busy smsc95xx 2-1.1:1.0 eth1: Failed to read reg index 0x00000114: -19 smsc95xx 2-1.1:1.0 eth1: Error reading MII_ACCESS smsc95xx 2-1.1:1.0 eth1: __smsc95xx_mdio_read: MII is busy smsc95xx 2-1.1:1.0 eth1: hardware isn't capable of remote wakeup usb 2-1.4: USB disconnect, device number 4 ci_hdrc ci_hdrc.1: USB bus 2 deregistered ci_hdrc ci_hdrc.0: remove, state 4 usb usb1: USB disconnect, device number 1 ci_hdrc ci_hdrc.0: USB bus 1 deregistered imx2-wdt 30280000.watchdog: Device shutdown: Expect reboot! reboot: Restarting system Ignore the -ENODEV errors inside __smsc95xx_mdio_read() and __smsc95xx_phy_wait_not_busy() and do not print error messages when -ENODEV is returned. Fixes: a049a30fc27c ("net: usb: Correct PHY handling of smsc95xx") Signed-off-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index bc1e3dd67c04..a0f29482294d 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -84,9 +84,10 @@ static int __must_check __smsc95xx_read_reg(struct usbnet *dev, u32 index, ret = fn(dev, USB_VENDOR_REQUEST_READ_REGISTER, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, &buf, 4); - if (unlikely(ret < 0)) { - netdev_warn(dev->net, "Failed to read reg index 0x%08x: %d\n", - index, ret); + if (ret < 0) { + if (ret != -ENODEV) + netdev_warn(dev->net, "Failed to read reg index 0x%08x: %d\n", + index, ret); return ret; } @@ -116,7 +117,7 @@ static int __must_check __smsc95xx_write_reg(struct usbnet *dev, u32 index, ret = fn(dev, USB_VENDOR_REQUEST_WRITE_REGISTER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, &buf, 4); - if (unlikely(ret < 0)) + if (ret < 0 && ret != -ENODEV) netdev_warn(dev->net, "Failed to write reg index 0x%08x: %d\n", index, ret); @@ -159,6 +160,9 @@ static int __must_check __smsc95xx_phy_wait_not_busy(struct usbnet *dev, do { ret = __smsc95xx_read_reg(dev, MII_ADDR, &val, in_pm); if (ret < 0) { + /* Ignore -ENODEV error during disconnect() */ + if (ret == -ENODEV) + return 0; netdev_warn(dev->net, "Error reading MII_ACCESS\n"); return ret; } @@ -194,7 +198,8 @@ static int __smsc95xx_mdio_read(struct usbnet *dev, int phy_id, int idx, addr = mii_address_cmd(phy_id, idx, MII_READ_ | MII_BUSY_); ret = __smsc95xx_write_reg(dev, MII_ADDR, addr, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error writing MII_ADDR\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error writing MII_ADDR\n"); goto done; } @@ -206,7 +211,8 @@ static int __smsc95xx_mdio_read(struct usbnet *dev, int phy_id, int idx, ret = __smsc95xx_read_reg(dev, MII_DATA, &val, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error reading MII_DATA\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error reading MII_DATA\n"); goto done; } @@ -214,6 +220,10 @@ static int __smsc95xx_mdio_read(struct usbnet *dev, int phy_id, int idx, done: mutex_unlock(&dev->phy_mutex); + + /* Ignore -ENODEV error during disconnect() */ + if (ret == -ENODEV) + return 0; return ret; } @@ -235,7 +245,8 @@ static void __smsc95xx_mdio_write(struct usbnet *dev, int phy_id, val = regval; ret = __smsc95xx_write_reg(dev, MII_DATA, val, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error writing MII_DATA\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error writing MII_DATA\n"); goto done; } @@ -243,7 +254,8 @@ static void __smsc95xx_mdio_write(struct usbnet *dev, int phy_id, addr = mii_address_cmd(phy_id, idx, MII_WRITE_ | MII_BUSY_); ret = __smsc95xx_write_reg(dev, MII_ADDR, addr, in_pm); if (ret < 0) { - netdev_warn(dev->net, "Error writing MII_ADDR\n"); + if (ret != -ENODEV) + netdev_warn(dev->net, "Error writing MII_ADDR\n"); goto done; } From 5f84e73f9a8f14b95115b0eb2080da6d9fa7a82e Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 28 Feb 2022 21:04:05 +0530 Subject: [PATCH 679/773] gpio: tegra186: Add IRQ per bank for Tegra241 Add the number of interrupts per bank for Tegra241 (Grace) to fix the probe failure. Fixes: d1056b771ddb ("gpio: tegra186: Add support for Tegra241") Signed-off-by: Akhil R Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-tegra186.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index 8d298beffd86..031fe105b58e 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -1075,6 +1075,7 @@ static const struct tegra_gpio_soc tegra241_main_soc = { .ports = tegra241_main_ports, .name = "tegra241-gpio", .instance = 0, + .num_irqs_per_bank = 8, }; #define TEGRA241_AON_GPIO_PORT(_name, _bank, _port, _pins) \ @@ -1095,6 +1096,7 @@ static const struct tegra_gpio_soc tegra241_aon_soc = { .ports = tegra241_aon_ports, .name = "tegra241-gpio-aon", .instance = 1, + .num_irqs_per_bank = 8, }; static const struct of_device_id tegra186_gpio_of_match[] = { From fc328a7d1fcce263db0b046917a66f3aa6e68719 Mon Sep 17 00:00:00 2001 From: Marcelo Roberto Jimenez Date: Mon, 7 Mar 2022 10:57:24 +0100 Subject: [PATCH 680/773] gpio: Revert regression in sysfs-gpio (gpiolib.c) Some GPIO lines have stopped working after the patch commit 2ab73c6d8323f ("gpio: Support GPIO controllers without pin-ranges") And this has supposedly been fixed in the following patches commit 89ad556b7f96a ("gpio: Avoid using pin ranges with !PINCTRL") commit 6dbbf84603961 ("gpiolib: Don't free if pin ranges are not defined") But an erratic behavior where some GPIO lines work while others do not work has been introduced. This patch reverts those changes so that the sysfs-gpio interface works properly again. Signed-off-by: Marcelo Roberto Jimenez Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index a3d14277f17c..005642068c0b 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1701,11 +1701,6 @@ static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) */ int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset) { -#ifdef CONFIG_PINCTRL - if (list_empty(&gc->gpiodev->pin_ranges)) - return 0; -#endif - return pinctrl_gpio_request(gc->gpiodev->base + offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_request); @@ -1717,11 +1712,6 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_request); */ void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset) { -#ifdef CONFIG_PINCTRL - if (list_empty(&gc->gpiodev->pin_ranges)) - return; -#endif - pinctrl_gpio_free(gc->gpiodev->base + offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_free); From 660c619b9d7ccd28648ee3766cdbe94ec7b27402 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Mar 2022 13:56:23 +0200 Subject: [PATCH 681/773] gpiolib: acpi: Convert ACPI value of debounce to microseconds It appears that GPIO ACPI library uses ACPI debounce values directly. However, the GPIO library APIs expect the debounce timeout to be in microseconds. Convert ACPI value of debounce to microseconds. While at it, document this detail where it is appropriate. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215664 Reported-by: Kai-Heng Feng Fixes: 8dcb7a15a585 ("gpiolib: acpi: Take into account debounce settings") Signed-off-by: Andy Shevchenko Tested-by: Kai-Heng Feng Reviewed-by: Mika Westerberg Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 6 ++++-- drivers/gpio/gpiolib.c | 10 ++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index c0f6a25c3279..a5495ad31c9c 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -307,7 +307,8 @@ static struct gpio_desc *acpi_request_own_gpiod(struct gpio_chip *chip, if (IS_ERR(desc)) return desc; - ret = gpio_set_debounce_timeout(desc, agpio->debounce_timeout); + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, agpio->debounce_timeout * 10); if (ret) dev_warn(chip->parent, "Failed to set debounce-timeout for pin 0x%04X, err %d\n", @@ -1035,7 +1036,8 @@ int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int ind if (ret < 0) return ret; - ret = gpio_set_debounce_timeout(desc, info.debounce); + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, info.debounce * 10); if (ret) return ret; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 005642068c0b..defb7c464b87 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2217,6 +2217,16 @@ static int gpio_set_bias(struct gpio_desc *desc) return gpio_set_config_with_argument_optional(desc, bias, arg); } +/** + * gpio_set_debounce_timeout() - Set debounce timeout + * @desc: GPIO descriptor to set the debounce timeout + * @debounce: Debounce timeout in microseconds + * + * The function calls the certain GPIO driver to set debounce timeout + * in the hardware. + * + * Returns 0 on success, or negative error code otherwise. + */ int gpio_set_debounce_timeout(struct gpio_desc *desc, unsigned int debounce) { return gpio_set_config_with_argument_optional(desc, From 804f468853179b9b58af05c153c411931aa5b310 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Fri, 25 Feb 2022 09:02:28 +0200 Subject: [PATCH 682/773] drm/i915/psr: Set "SF Partial Frame Enable" also on full update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we are observing occasional screen flickering when PSR2 selective fetch is enabled. More specifically glitch seems to happen on full frame update when cursor moves to coords x = -1 or y = -1. According to Bspec SF Single full frame should not be set if SF Partial Frame Enable is not set. This happened to be true for ADLP as PSR2_MAN_TRK_CTL_ENABLE is always set and for ADL_P it's actually "SF Partial Frame Enable" (Bit 31). Setting "SF Partial Frame Enable" bit also on full update seems to fix screen flickering. Also make code more clear by setting PSR2_MAN_TRK_CTL_ENABLE only if not on ADL_P. Bit 31 has different meaning in ADL_P. Bspec: 49274 v2: Fix Mihai Harpau email address v3: Modify commit message and remove unnecessary comment Tested-by: Lyude Paul Fixes: 7f6002e58025 ("drm/i915/display: Enable PSR2 selective fetch by default") Reported-by: Lyude Paul Cc: Mihai Harpau Cc: José Roberto de Souza Cc: Ville Syrjälä Bugzilla: https://gitlab.freedesktop.org/drm/intel/-/issues/5077 Signed-off-by: Jouni Högander Reviewed-by: José Roberto de Souza Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20220225070228.855138-1-jouni.hogander@intel.com (cherry picked from commit 8d5516d18b323cf7274d1cf5fe76f4a691f879c6) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_psr.c | 16 ++++++++++++++-- drivers/gpu/drm/i915/i915_reg.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index a1a663f362e7..00279e8c2775 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1406,6 +1406,13 @@ static inline u32 man_trk_ctl_single_full_frame_bit_get(struct drm_i915_private PSR2_MAN_TRK_CTL_SF_SINGLE_FULL_FRAME; } +static inline u32 man_trk_ctl_partial_frame_bit_get(struct drm_i915_private *dev_priv) +{ + return IS_ALDERLAKE_P(dev_priv) ? + ADLP_PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE : + PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE; +} + static void psr_force_hw_tracking_exit(struct intel_dp *intel_dp) { struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); @@ -1510,7 +1517,13 @@ static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state, { struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc); struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); - u32 val = PSR2_MAN_TRK_CTL_ENABLE; + u32 val = 0; + + if (!IS_ALDERLAKE_P(dev_priv)) + val = PSR2_MAN_TRK_CTL_ENABLE; + + /* SF partial frame enable has to be set even on full update */ + val |= man_trk_ctl_partial_frame_bit_get(dev_priv); if (full_update) { /* @@ -1530,7 +1543,6 @@ static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state, } else { drm_WARN_ON(crtc_state->uapi.crtc->dev, clip->y1 % 4 || clip->y2 % 4); - val |= PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE; val |= PSR2_MAN_TRK_CTL_SU_REGION_START_ADDR(clip->y1 / 4 + 1); val |= PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR(clip->y2 / 4 + 1); } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index c2bb33febb68..902e4c802a12 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -4829,6 +4829,7 @@ enum { #define ADLP_PSR2_MAN_TRK_CTL_SU_REGION_START_ADDR(val) REG_FIELD_PREP(ADLP_PSR2_MAN_TRK_CTL_SU_REGION_START_ADDR_MASK, val) #define ADLP_PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR_MASK REG_GENMASK(12, 0) #define ADLP_PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR(val) REG_FIELD_PREP(ADLP_PSR2_MAN_TRK_CTL_SU_REGION_END_ADDR_MASK, val) +#define ADLP_PSR2_MAN_TRK_CTL_SF_PARTIAL_FRAME_UPDATE REG_BIT(31) #define ADLP_PSR2_MAN_TRK_CTL_SF_SINGLE_FULL_FRAME REG_BIT(14) #define ADLP_PSR2_MAN_TRK_CTL_SF_CONTINUOS_FULL_FRAME REG_BIT(13) From 0c4bcfdecb1ac0967619ee7ff44871d93c08c909 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Mar 2022 16:30:44 +0100 Subject: [PATCH 683/773] fuse: fix pipe buffer lifetime for direct_io In FOPEN_DIRECT_IO mode, fuse_file_write_iter() calls fuse_direct_write_iter(), which normally calls fuse_direct_io(), which then imports the write buffer with fuse_get_user_pages(), which uses iov_iter_get_pages() to grab references to userspace pages instead of actually copying memory. On the filesystem device side, these pages can then either be read to userspace (via fuse_dev_read()), or splice()d over into a pipe using fuse_dev_splice_read() as pipe buffers with &nosteal_pipe_buf_ops. This is wrong because after fuse_dev_do_read() unlocks the FUSE request, the userspace filesystem can mark the request as completed, causing write() to return. At that point, the userspace filesystem should no longer have access to the pipe buffer. Fix by copying pages coming from the user address space to new pipe buffers. Reported-by: Jann Horn Fixes: c3021629a0d8 ("fuse: support splice() reading from fuse device") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 12 +++++++++++- fs/fuse/file.c | 1 + fs/fuse/fuse_i.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cd54a529460d..592730fd6e42 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -941,7 +941,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 829094451774..0fc150c1c50b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1413,6 +1413,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = true; else diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e8e59fbdefeb..eac4984cc753 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,6 +256,7 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; From 42da5a4ba17070e9d99abf375a5bd70e85d2a6b8 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Sat, 19 Feb 2022 21:36:00 +0200 Subject: [PATCH 684/773] mtd: rawnand: omap2: Actually prevent invalid configuration and build error The root of the problem is that we are selecting symbols that have dependencies. This can cause random configurations that can fail. The cleanest solution is to avoid using select. This driver uses interfaces from the OMAP_GPMC driver so we have to depend on it instead. Fixes: 4cd335dae3cf ("mtd: rawnand: omap2: Prevent invalid configuration and build error") Signed-off-by: Roger Quadros Signed-off-by: Miquel Raynal Tested-by: Randy Dunlap Link: https://lore.kernel.org/linux-mtd/20220219193600.24892-1-rogerq@kernel.org --- drivers/mtd/nand/raw/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index d986ab4e4c35..820e5dc3bc9b 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -42,8 +42,7 @@ config MTD_NAND_OMAP2 tristate "OMAP2, OMAP3, OMAP4 and Keystone NAND controller" depends on ARCH_OMAP2PLUS || ARCH_KEYSTONE || ARCH_K3 || COMPILE_TEST depends on HAS_IOMEM - select MEMORY - select OMAP_GPMC + depends on OMAP_GPMC help Support for NAND flash on Texas Instruments OMAP2, OMAP3, OMAP4 and Keystone platforms. From 58c9a5060cb7cd529d49c93954cdafe81c1d642a Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 3 Mar 2022 16:53:56 +0000 Subject: [PATCH 685/773] arm64: proton-pack: Include unprivileged eBPF status in Spectre v2 mitigation reporting The mitigations for Spectre-BHB are only applied when an exception is taken from user-space. The mitigation status is reported via the spectre_v2 sysfs vulnerabilities file. When unprivileged eBPF is enabled the mitigation in the exception vectors can be avoided by an eBPF program. When unprivileged eBPF is enabled, print a warning and report vulnerable via the sysfs vulnerabilities file. Acked-by: Catalin Marinas Signed-off-by: James Morse --- arch/arm64/kernel/proton-pack.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index d3fbff00993d..6d45c63c6454 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -111,6 +112,15 @@ static const char *get_bhb_affected_string(enum mitigation_state bhb_state) } } +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false; +#endif +} + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { @@ -130,6 +140,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, v2_str = "CSV2"; fallthrough; case SPECTRE_MITIGATED: + if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str); case SPECTRE_VULNERABLE: fallthrough; @@ -1125,3 +1138,16 @@ void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt, *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); *updptr++ = cpu_to_le32(aarch64_insn_gen_nop()); } + +#ifdef CONFIG_BPF_SYSCALL +#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n" +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_state == SPECTRE_VULNERABLE || + spectre_bhb_state != SPECTRE_MITIGATED) + return; + + if (!new_state) + pr_err("WARNING: %s", EBPF_WARN); +} +#endif From 7401b49c50c2b032223de408e28e37cbd63f4c97 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 7 Mar 2022 18:59:09 +0100 Subject: [PATCH 686/773] ARM: tegra: Move Nyan FHD panels to AUX bus Similarly to what was earlier done for other Nyan variants, move the eDP panel on the FHD models to the AUX bus as well. Suggested-by: Dmitry Osipenko Fixes: ef6fb9875ce0 ("ARM: tegra: Add device-tree for 1080p version of Nyan Big") Signed-off-by: Thierry Reding --- arch/arm/boot/dts/tegra124-nyan-big-fhd.dts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts b/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts index d35fb79d2f51..4db43324dafa 100644 --- a/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts +++ b/arch/arm/boot/dts/tegra124-nyan-big-fhd.dts @@ -5,7 +5,13 @@ / { /* Version of Nyan Big with 1080p panel */ - panel { - compatible = "auo,b133htn01"; + host1x@50000000 { + dpaux@545c0000 { + aux-bus { + panel: panel { + compatible = "auo,b133htn01"; + }; + }; + }; }; }; From aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 5 Mar 2022 18:07:14 +0100 Subject: [PATCH 687/773] swiotlb: rework "fix info leak with DMA_FROM_DEVICE" Unfortunately, we ended up merging an old version of the patch "fix info leak with DMA_FROM_DEVICE" instead of merging the latest one. Christoph (the swiotlb maintainer), he asked me to create an incremental fix (after I have pointed this out the mix up, and asked him for guidance). So here we go. The main differences between what we got and what was agreed are: * swiotlb_sync_single_for_device is also required to do an extra bounce * We decided not to introduce DMA_ATTR_OVERWRITE until we have exploiters * The implantation of DMA_ATTR_OVERWRITE is flawed: DMA_ATTR_OVERWRITE must take precedence over DMA_ATTR_SKIP_CPU_SYNC Thus this patch removes DMA_ATTR_OVERWRITE, and makes swiotlb_sync_single_for_device() bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer. Let me note, that if the size used with dma_sync_* API is less than the size used with dma_[un]map_*, under certain circumstances we may still end up with swiotlb not being transparent. In that sense, this is no perfect fix either. To get this bullet proof, we would have to bounce the entire mapping/bounce buffer. For that we would have to figure out the starting address, and the size of the mapping in swiotlb_sync_single_for_device(). While this does seem possible, there seems to be no firm consensus on how things are supposed to work. Signed-off-by: Halil Pasic Fixes: ddbd89deb7d3 ("swiotlb: fix info leak with DMA_FROM_DEVICE") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/core-api/dma-attributes.rst | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 23 +++++++++++++++-------- 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 17706dc91ec9..1887d92e8e92 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,11 +130,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6150d11a607e..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,14 +61,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index bfc56cb21705..6db1c475ec82 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -627,10 +627,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -697,10 +701,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); + /* + * Unconditional bounce is necessary to avoid corruption on + * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite + * the whole lengt of the bounce buffer. + */ + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + BUG_ON(!valid_dma_direction(dir)); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, From 1860d30466366774055d993f9b31094ede8af415 Mon Sep 17 00:00:00 2001 From: Kuldeep Singh Date: Mon, 7 Mar 2022 23:50:59 +0530 Subject: [PATCH 688/773] MAINTAINERS: Update git tree for Broadcom iProc SoCs Current git tree for Broadcom iProc SoCs is pretty outdated as it has not updated for a long time. Fix the reference. Signed-off-by: Kuldeep Singh --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea3e6c914384..5d627156efd9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3900,7 +3900,7 @@ M: Scott Branden M: bcm-kernel-feedback-list@broadcom.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -T: git git://github.com/broadcom/cygnus-linux.git +T: git git://github.com/broadcom/stblinux.git F: arch/arm64/boot/dts/broadcom/northstar2/* F: arch/arm64/boot/dts/broadcom/stingray/* F: drivers/clk/bcm/clk-ns* From 5125091d757a251a128ec38d2397c9d160394eac Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Mar 2022 18:28:05 +0100 Subject: [PATCH 689/773] MAINTAINERS: update Krzysztof Kozlowski's email Use Krzysztof Kozlowski's @kernel.org account in maintainer entries. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220307172805.156760-1-krzysztof.kozlowski@canonical.com' Signed-off-by: Arnd Bergmann --- .mailmap | 1 + MAINTAINERS | 32 ++++++++++++++--------------- drivers/soc/samsung/exynos-chipid.c | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/.mailmap b/.mailmap index 10ee1103c823..b96b98a2d9de 100644 --- a/.mailmap +++ b/.mailmap @@ -216,6 +216,7 @@ Koushik Krishna Manikandan Krzysztof Kozlowski Krzysztof Kozlowski +Krzysztof Kozlowski Kuninori Morimoto Kuogee Hsieh Leonardo Bras diff --git a/MAINTAINERS b/MAINTAINERS index 1ba1e4af2cbc..a84bd0855d67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2572,7 +2572,7 @@ F: sound/soc/rockchip/ N: rockchip ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org @@ -11675,7 +11675,7 @@ F: drivers/iio/proximity/mb1232.c MAXIM MAX17040 FAMILY FUEL GAUGE DRIVERS R: Iskren Chernev -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Matheus Castello L: linux-pm@vger.kernel.org @@ -11685,7 +11685,7 @@ F: drivers/power/supply/max17040_battery.c MAXIM MAX17042 FAMILY FUEL GAUGE DRIVERS R: Hans de Goede -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Sebastian Krzyszkowiak R: Purism Kernel Team @@ -11730,7 +11730,7 @@ F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml F: drivers/power/supply/max77976_charger.c MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Bartlomiej Zolnierkiewicz L: linux-pm@vger.kernel.org S: Supported @@ -11739,7 +11739,7 @@ F: drivers/power/supply/max77693_charger.c MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS M: Chanwoo Choi -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Bartlomiej Zolnierkiewicz L: linux-kernel@vger.kernel.org S: Supported @@ -12428,7 +12428,7 @@ F: include/linux/memblock.h F: mm/memblock.c MEMORY CONTROLLER DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git @@ -13565,7 +13565,7 @@ F: include/uapi/linux/nexthop.h F: net/ipv4/nexthop.c NFC SUBSYSTEM -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-nfc@lists.01.org (subscribers-only) L: netdev@vger.kernel.org S: Maintained @@ -13879,7 +13879,7 @@ F: Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml F: drivers/regulator/pf8x00-regulator.c NXP PTN5150A CC LOGIC AND EXTCON DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/extcon/extcon-ptn5150.yaml @@ -15305,7 +15305,7 @@ F: drivers/pinctrl/renesas/ PIN CONTROLLER - SAMSUNG M: Tomasz Figa -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -16947,7 +16947,7 @@ W: http://www.ibm.com/developerworks/linux/linux390/ F: drivers/s390/scsi/zfcp_* S3C ADC BATTERY DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-samsung-soc@vger.kernel.org S: Odd Fixes F: drivers/power/supply/s3c_adc_battery.c @@ -16992,7 +16992,7 @@ F: Documentation/admin-guide/LSM/SafeSetID.rst F: security/safesetid/ SAMSUNG AUDIO (ASoC) DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported @@ -17000,7 +17000,7 @@ F: Documentation/devicetree/bindings/sound/samsung* F: sound/soc/samsung/ SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -17035,7 +17035,7 @@ S: Maintained F: drivers/platform/x86/samsung-laptop.c SAMSUNG MULTIFUNCTION PMIC DEVICE DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Bartlomiej Zolnierkiewicz L: linux-kernel@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -17061,7 +17061,7 @@ F: drivers/media/platform/s3c-camif/ F: include/media/drv-intf/s3c_camif.h SAMSUNG S3FWRN5 NFC DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Krzysztof Opasiak L: linux-nfc@lists.01.org (subscribers-only) S: Maintained @@ -17083,7 +17083,7 @@ S: Supported F: drivers/media/i2c/s5k5baf.c SAMSUNG S5P Security SubSystem (SSS) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Vladimir Zapolskiy L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -17118,7 +17118,7 @@ F: include/linux/clk/samsung.h F: include/linux/platform_data/clk-s3c2410.h SAMSUNG SPI DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Andi Shyti L: linux-spi@vger.kernel.org L: linux-samsung-soc@vger.kernel.org diff --git a/drivers/soc/samsung/exynos-chipid.c b/drivers/soc/samsung/exynos-chipid.c index 2746d05936d3..0fb3631e7346 100644 --- a/drivers/soc/samsung/exynos-chipid.c +++ b/drivers/soc/samsung/exynos-chipid.c @@ -204,7 +204,7 @@ module_platform_driver(exynos_chipid_driver); MODULE_DESCRIPTION("Samsung Exynos ChipID controller and ASV driver"); MODULE_AUTHOR("Bartlomiej Zolnierkiewicz "); -MODULE_AUTHOR("Krzysztof Kozlowski "); +MODULE_AUTHOR("Krzysztof Kozlowski "); MODULE_AUTHOR("Pankaj Dubey "); MODULE_AUTHOR("Sylwester Nawrocki "); MODULE_LICENSE("GPL"); From a9a5b720dc8227243f433141ba1343aa53ef57e4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Mar 2022 18:38:40 +0200 Subject: [PATCH 690/773] gpio: sim: Declare gpio_sim_hog_config_item_ops static Compiler is not happy: warning: symbol 'gpio_sim_hog_config_item_ops' was not declared. Should it be static? Fixes: cb8c474e79be ("gpio: sim: new testing module") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index 153fe79e1bf3..bb9bb595c1a8 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -1322,7 +1322,7 @@ static void gpio_sim_hog_config_item_release(struct config_item *item) kfree(hog); } -struct configfs_item_operations gpio_sim_hog_config_item_ops = { +static struct configfs_item_operations gpio_sim_hog_config_item_ops = { .release = gpio_sim_hog_config_item_release, }; From 6e2edd6371a497a6350bb735534c9bda2a31f43d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 3 Mar 2022 18:00:44 +0000 Subject: [PATCH 691/773] arm64: Ensure execute-only permissions are not allowed without EPAN Commit 18107f8a2df6 ("arm64: Support execute-only permissions with Enhanced PAN") re-introduced execute-only permissions when EPAN is available. When EPAN is not available, arch_filter_pgprot() is supposed to change a PAGE_EXECONLY permission into PAGE_READONLY_EXEC. However, if BTI or MTE are present, such check does not detect the execute-only pgprot in the presence of PTE_GP (BTI) or MT_NORMAL_TAGGED (MTE), allowing the user to request PROT_EXEC with PROT_BTI or PROT_MTE. Remove the arch_filter_pgprot() function, change the default VM_EXEC permissions to PAGE_READONLY_EXEC and update the protection_map[] array at core_initcall() if EPAN is detected. Signed-off-by: Catalin Marinas Fixes: 18107f8a2df6 ("arm64: Support execute-only permissions with Enhanced PAN") Cc: # 5.13.x Acked-by: Will Deacon Reviewed-by: Vladimir Murzin Tested-by: Vladimir Murzin --- arch/arm64/Kconfig | 3 --- arch/arm64/include/asm/pgtable-prot.h | 4 ++-- arch/arm64/include/asm/pgtable.h | 11 ----------- arch/arm64/mm/mmap.c | 17 +++++++++++++++++ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c7a474f71eb4..f711a8eee6ff 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1236,9 +1236,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_HAS_FILTER_PGPROT - def_bool y - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 7032f04c8ac6..b1e1b74d993c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -92,7 +92,7 @@ extern bool arm64_use_ng_mappings; #define __P001 PAGE_READONLY #define __P010 PAGE_READONLY #define __P011 PAGE_READONLY -#define __P100 PAGE_EXECONLY +#define __P100 PAGE_READONLY_EXEC /* PAGE_EXECONLY if Enhanced PAN */ #define __P101 PAGE_READONLY_EXEC #define __P110 PAGE_READONLY_EXEC #define __P111 PAGE_READONLY_EXEC @@ -101,7 +101,7 @@ extern bool arm64_use_ng_mappings; #define __S001 PAGE_READONLY #define __S010 PAGE_SHARED #define __S011 PAGE_SHARED -#define __S100 PAGE_EXECONLY +#define __S100 PAGE_READONLY_EXEC /* PAGE_EXECONLY if Enhanced PAN */ #define __S101 PAGE_READONLY_EXEC #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c4ba047a82d2..94e147e5456c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1017,17 +1017,6 @@ static inline bool arch_wants_old_prefaulted_pte(void) } #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - if (cpus_have_const_cap(ARM64_HAS_EPAN)) - return prot; - - if (pgprot_val(prot) != pgprot_val(PAGE_EXECONLY)) - return prot; - - return PAGE_READONLY_EXEC; -} - static inline bool pud_sect_supported(void) { return PAGE_SIZE == SZ_4K; diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index a38f54cd638c..77ada00280d9 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -7,8 +7,10 @@ #include #include +#include #include +#include #include /* @@ -38,3 +40,18 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { return !(((pfn << PAGE_SHIFT) + size) & ~PHYS_MASK); } + +static int __init adjust_protection_map(void) +{ + /* + * With Enhanced PAN we can honour the execute-only permissions as + * there is no PAN override with such mappings. + */ + if (cpus_have_const_cap(ARM64_HAS_EPAN)) { + protection_map[VM_EXEC] = PAGE_EXECONLY; + protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; + } + + return 0; +} +arch_initcall(adjust_protection_map); From 9470c29faa91c804aa04de4c10634bf02462bfa5 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Mon, 28 Feb 2022 19:14:36 +0100 Subject: [PATCH 692/773] drm/sun4i: mixer: Fix P010 and P210 format numbers It turns out that DE3 manual has inverted YUV and YVU format numbers for P010 and P210. Invert them. This was tested by playing video decoded to P010 and additionally confirmed by looking at BSP driver source. Fixes: 169ca4b38932 ("drm/sun4i: Add separate DE3 VI layer formats") Signed-off-by: Jernej Skrabec Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220228181436.1424550-1-jernej.skrabec@gmail.com --- drivers/gpu/drm/sun4i/sun8i_mixer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/sun4i/sun8i_mixer.h b/drivers/gpu/drm/sun4i/sun8i_mixer.h index 145833a9d82d..5b3fbee18671 100644 --- a/drivers/gpu/drm/sun4i/sun8i_mixer.h +++ b/drivers/gpu/drm/sun4i/sun8i_mixer.h @@ -111,10 +111,10 @@ /* format 13 is semi-planar YUV411 VUVU */ #define SUN8I_MIXER_FBFMT_YUV411 14 /* format 15 doesn't exist */ -/* format 16 is P010 YVU */ -#define SUN8I_MIXER_FBFMT_P010_YUV 17 -/* format 18 is P210 YVU */ -#define SUN8I_MIXER_FBFMT_P210_YUV 19 +#define SUN8I_MIXER_FBFMT_P010_YUV 16 +/* format 17 is P010 YVU */ +#define SUN8I_MIXER_FBFMT_P210_YUV 18 +/* format 19 is P210 YVU */ /* format 20 is packed YVU444 10-bit */ /* format 21 is packed YUV444 10-bit */ From e5417cbf7ab5df1632e68fe7d9e6331fc0e7dbd6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 7 Mar 2022 12:13:30 +0000 Subject: [PATCH 693/773] net: dsa: mt7530: fix incorrect test in mt753x_phylink_validate() Discussing one of the tests in mt753x_phylink_validate() with Landen Chao confirms that the "||" should be "&&". Fix this. Fixes: c288575f7810 ("net: dsa: mt7530: Add the support of MT7531 switch") Signed-off-by: Russell King (Oracle) Link: https://lore.kernel.org/r/E1nRCF0-00CiXD-7q@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index ff3c267d0f26..a251bc55727f 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2936,7 +2936,7 @@ mt753x_phylink_validate(struct dsa_switch *ds, int port, phylink_set_port_modes(mask); - if (state->interface != PHY_INTERFACE_MODE_TRGMII || + if (state->interface != PHY_INTERFACE_MODE_TRGMII && !phy_interface_mode_is_8023z(state->interface)) { phylink_set(mask, 10baseT_Half); phylink_set(mask, 10baseT_Full); From 1a4e53d2fc4f68aa654ad96d13ad042e1a8e8a7d Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 7 Mar 2022 18:48:43 +0000 Subject: [PATCH 694/773] spi: Fix invalid sgs value max_seg_size is unsigned int and it can have a value up to 2^32 (for eg:-RZ_DMAC driver sets dma_set_max_seg_size as U32_MAX) When this value is used in min_t() as an integer type, it becomes -1 and the value of sgs becomes 0. Fix this issue by replacing the 'int' data type with 'unsigned int' in min_t(). Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220307184843.9994-1-biju.das.jz@bp.renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 4599b121d744..d96082dc3340 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1019,10 +1019,10 @@ int spi_map_buf(struct spi_controller *ctlr, struct device *dev, int i, ret; if (vmalloced_buf || kmap_buf) { - desc_len = min_t(int, max_seg_size, PAGE_SIZE); + desc_len = min_t(unsigned int, max_seg_size, PAGE_SIZE); sgs = DIV_ROUND_UP(len + offset_in_page(buf), desc_len); } else if (virt_addr_valid(buf)) { - desc_len = min_t(int, max_seg_size, ctlr->max_dma_len); + desc_len = min_t(unsigned int, max_seg_size, ctlr->max_dma_len); sgs = DIV_ROUND_UP(len, desc_len); } else { return -EINVAL; From 2f6edb6bcb2f3f41d876e0eba2ba97f87a0296ea Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 8 Mar 2022 10:36:31 +1030 Subject: [PATCH 695/773] ARM: dts: aspeed: Fix AST2600 quad spi group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Requesting quad mode for the FMC resulted in an error: &fmc { status = "okay"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_fwqspi_default>' [ 0.742963] aspeed-g6-pinctrl 1e6e2000.syscon:pinctrl: invalid function FWQSPID in map table  This is because the quad mode pins are a group of pins, not a function. After applying this patch we can request the pins and the QSPI data lines are muxed: # cat /sys/kernel/debug/pinctrl/1e6e2000.syscon\:pinctrl-aspeed-g6-pinctrl/pinmux-pins |grep 1e620000.spi pin 196 (AE12): device 1e620000.spi function FWSPID group FWQSPID pin 197 (AF12): device 1e620000.spi function FWSPID group FWQSPID pin 240 (Y1): device 1e620000.spi function FWSPID group FWQSPID pin 241 (Y2): device 1e620000.spi function FWSPID group FWQSPID pin 242 (Y3): device 1e620000.spi function FWSPID group FWQSPID pin 243 (Y4): device 1e620000.spi function FWSPID group FWQSPID Fixes: f510f04c8c83 ("ARM: dts: aspeed: Add AST2600 pinmux nodes") Signed-off-by: Joel Stanley Reviewed-by: Andrew Jeffery Link: https://lore.kernel.org/r/20220304011010.974863-1-joel@jms.id.au Link: https://lore.kernel.org/r/20220304011010.974863-1-joel@jms.id.au' Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi index 6dde51c2aed3..e4775bbceecc 100644 --- a/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi +++ b/arch/arm/boot/dts/aspeed-g6-pinctrl.dtsi @@ -118,7 +118,7 @@ }; pinctrl_fwqspid_default: fwqspid_default { - function = "FWQSPID"; + function = "FWSPID"; groups = "FWQSPID"; }; From 5adf349439d29f92467e864f728dfc23180f3ef9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Mar 2022 12:23:23 +0100 Subject: [PATCH 696/773] x86/module: Fix the paravirt vs alternative order Ever since commit 4e6292114c74 ("x86/paravirt: Add new features for paravirt patching") there is an ordering dependency between patching paravirt ops and patching alternatives, the module loader still violates this. Fixes: 4e6292114c74 ("x86/paravirt: Add new features for paravirt patching") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Miroslav Benes Cc: Link: https://lore.kernel.org/r/20220303112825.068773913@infradead.org --- arch/x86/kernel/module.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 95fa745e310a..96d7c27b7093 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -273,6 +273,14 @@ int module_finalize(const Elf_Ehdr *hdr, retpolines = s; } + /* + * See alternative_instructions() for the ordering rules between the + * various patching types. + */ + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); @@ -290,11 +298,6 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - /* make jump label nops */ jump_label_apply_nops(me); From 979452fbc43028675b5a5da156f91928b739dea8 Mon Sep 17 00:00:00 2001 From: Robert Foss Date: Tue, 8 Mar 2022 10:49:10 +0100 Subject: [PATCH 697/773] dt-bindings: drm/bridge: anx7625: Revert DPI support Revert DPI support from binding. DPI support relies on the bus-type enum which does not yet support Mipi DPI, since no v4l2_fwnode_bus_type has been defined for this bus type. When DPI for anx7625 was initially added, it assumed that V4L2_FWNODE_BUS_TYPE_PARALLEL was the correct bus type for representing DPI, which it is not. In order to prevent adding this mis-usage to the ABI, let's revert the support. Signed-off-by: Robert Foss Reviewed-by: Laurent Pinchart Reviewed-by: Rob Herring Signed-off-by: Arnd Bergmann --- .../display/bridge/analogix,anx7625.yaml | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml index 1d3e88daca04..25b5ef3f759c 100644 --- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml +++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml @@ -91,22 +91,7 @@ properties: $ref: /schemas/graph.yaml#/$defs/port-base unevaluatedProperties: false description: - MIPI DSI/DPI input. - - properties: - endpoint: - $ref: /schemas/media/video-interfaces.yaml# - type: object - additionalProperties: false - - properties: - remote-endpoint: true - - bus-type: - enum: [1, 5] - default: 1 - - data-lanes: true + Video port for MIPI DSI input. port@1: $ref: /schemas/graph.yaml#/properties/port @@ -155,8 +140,6 @@ examples: reg = <0>; anx7625_in: endpoint { remote-endpoint = <&mipi_dsi>; - bus-type = <5>; - data-lanes = <0 1 2 3>; }; }; From d3258737afc0101f497745f83fc4038c963a6b81 Mon Sep 17 00:00:00 2001 From: Robert Foss Date: Tue, 8 Mar 2022 10:49:11 +0100 Subject: [PATCH 698/773] Revert "arm64: dts: mt8183: jacuzzi: Fix bus properties in anx's DSI endpoint" This reverts commit 32568ae37596b529628ac09b875f4874e614f63f. Signed-off-by: Robert Foss Reviewed-by: Chen-Yu Tsai Reviewed-by: Laurent Pinchart Acked-by: Matthias Brugger Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi index e8f133dc96b9..8f7bf33f607d 100644 --- a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi @@ -171,8 +171,6 @@ anx7625_in: endpoint { remote-endpoint = <&dsi_out>; - bus-type = <5>; - data-lanes = <0 1 2 3>; }; }; From 25875aa71dfefd1959f07e626c4d285b88b27ac2 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 7 Mar 2022 19:28:32 +0000 Subject: [PATCH 699/773] ARM: include unprivileged BPF status in Spectre V2 reporting The mitigations for Spectre-BHB are only applied when an exception is taken, but when unprivileged BPF is enabled, userspace can load BPF programs that can be used to exploit the problem. When unprivileged BPF is enabled, report the vulnerable status via the spectre_v2 sysfs file. Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/spectre.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index ade967f18d06..e7fea962d632 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -1,9 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include +static bool _unprivileged_ebpf_enabled(void) +{ +#ifdef CONFIG_BPF_SYSCALL + return !sysctl_unprivileged_bpf_disabled; +#else + return false +#endif +} + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { @@ -31,6 +41,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, if (spectre_v2_state != SPECTRE_MITIGATED) return sprintf(buf, "%s\n", "Vulnerable"); + if (_unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + switch (spectre_v2_methods) { case SPECTRE_V2_METHOD_BPIALL: method = "Branch predictor hardening"; From d986afd5a7b75b477ac347b222354cecd97edc87 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 4 Mar 2022 15:55:59 +0800 Subject: [PATCH 700/773] MAINTAINERS: Update Jisheng's email address I'm leaving synaptics. Update my email address to my korg mail address and add entries to .mailmap as well to map my work addresses to korg mail address. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/ce7213bd-28ac-6580-466e-875e755fe0ae@synaptics.com' Signed-off-by: Arnd Bergmann --- .mailmap | 2 ++ MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index b96b98a2d9de..97ccdf147111 100644 --- a/.mailmap +++ b/.mailmap @@ -187,6 +187,8 @@ Jiri Slaby Jiri Slaby Jiri Slaby Jiri Slaby +Jisheng Zhang +Jisheng Zhang Johan Hovold Johan Hovold John Paul Adrian Glaubitz diff --git a/MAINTAINERS b/MAINTAINERS index 6ec18c4bd242..d53eb153157a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2739,7 +2739,7 @@ N: stm32 N: stm ARM/Synaptics SoC support -M: Jisheng Zhang +M: Jisheng Zhang M: Sebastian Hesselbarth L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained From 7e807f4b081c5813df21da54e9a0491ea2ce16e7 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 3 Mar 2022 17:23:49 -0600 Subject: [PATCH 701/773] dt-bindings: mfd: Fix pinctrl node name warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent addition pinctrl.yaml in commit c09acbc499e8 ("dt-bindings: pinctrl: use pinctrl.yaml") resulted in some node name warnings: Documentation/devicetree/bindings/mfd/cirrus,lochnagar.example.dt.yaml: \ lochnagar-pinctrl: $nodename:0: 'lochnagar-pinctrl' does not match '^(pinctrl|pinmux)(@[0-9a-f]+)?$' Documentation/devicetree/bindings/mfd/cirrus,madera.example.dt.yaml: \ codec@1a: $nodename:0: 'codec@1a' does not match '^(pinctrl|pinmux)(@[0-9a-f]+)?$' Documentation/devicetree/bindings/mfd/brcm,cru.example.dt.yaml: \ pin-controller@1c0: $nodename:0: 'pin-controller@1c0' does not match '^(pinctrl|pinmux)(@[0-9a-f]+)?$' Fix the node names to the preferred 'pinctrl'. For cirrus,madera, nothing from pinctrl.yaml schema is used, so just drop the reference. Fixes: c09acbc499e8 ("dt-bindings: pinctrl: use pinctrl.yaml") Cc: Rafał Miłecki Signed-off-by: Rob Herring Acked-by: Charles Keepax Acked-by: Lee Jones Link: https://lore.kernel.org/r/20220303232350.2591143-1-robh@kernel.org --- Documentation/devicetree/bindings/mfd/brcm,cru.yaml | 4 ++-- Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml | 6 +++--- .../devicetree/bindings/pinctrl/cirrus,madera.yaml | 3 --- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/brcm,cru.yaml b/Documentation/devicetree/bindings/mfd/brcm,cru.yaml index be4a2df71c25..b85819fbb07c 100644 --- a/Documentation/devicetree/bindings/mfd/brcm,cru.yaml +++ b/Documentation/devicetree/bindings/mfd/brcm,cru.yaml @@ -39,7 +39,7 @@ patternProperties: '^phy@[a-f0-9]+$': $ref: ../phy/bcm-ns-usb2-phy.yaml - '^pin-controller@[a-f0-9]+$': + '^pinctrl@[a-f0-9]+$': $ref: ../pinctrl/brcm,ns-pinmux.yaml '^syscon@[a-f0-9]+$': @@ -94,7 +94,7 @@ examples: reg = <0x180 0x4>; }; - pin-controller@1c0 { + pinctrl@1c0 { compatible = "brcm,bcm4708-pinmux"; reg = <0x1c0 0x24>; reg-names = "cru_gpio_control"; diff --git a/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml b/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml index c00ad3e21c21..ad285cb480c9 100644 --- a/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml +++ b/Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml @@ -126,7 +126,7 @@ properties: clock-frequency: const: 12288000 - lochnagar-pinctrl: + pinctrl: type: object $ref: /schemas/pinctrl/cirrus,lochnagar.yaml# @@ -255,7 +255,7 @@ required: - reg - reset-gpios - lochnagar-clk - - lochnagar-pinctrl + - pinctrl additionalProperties: false @@ -293,7 +293,7 @@ examples: clock-frequency = <32768>; }; - lochnagar-pinctrl { + pinctrl { compatible = "cirrus,lochnagar-pinctrl"; gpio-controller; diff --git a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml index c85f759ae5a3..8a90d8273767 100644 --- a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml +++ b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml @@ -107,9 +107,6 @@ properties: additionalProperties: false -allOf: - - $ref: "pinctrl.yaml#" - required: - pinctrl-0 - pinctrl-names From f6eafa4022dd61e029205bea4d4147d26e69fef2 Mon Sep 17 00:00:00 2001 From: Aswath Govindraju Date: Thu, 16 Dec 2021 09:40:11 +0530 Subject: [PATCH 702/773] dt-bindings: phy: ti,tcan104x-can: Document mux-states property On some boards, for routing CAN signals from controller to transceivers, muxes might need to be set. This can be implemented using mux-states property. Therefore, document the same in the respective bindings. Signed-off-by: Aswath Govindraju Reviewed-by: Rob Herring Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211216041012.16892-2-a-govindraju@ti.com --- Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml b/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml index 6107880e5246..02b76f15e717 100644 --- a/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml +++ b/Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml @@ -37,6 +37,12 @@ properties: max bit rate supported in bps minimum: 1 + mux-states: + description: + mux controller node to route the signals from controller to + transceiver. + maxItems: 1 + required: - compatible - '#phy-cells' @@ -53,4 +59,5 @@ examples: max-bitrate = <5000000>; standby-gpios = <&wakeup_gpio1 16 GPIO_ACTIVE_LOW>; enable-gpios = <&main_gpio1 67 GPIO_ACTIVE_HIGH>; + mux-states = <&mux0 1>; }; From 330f4c53d3c2d8b11d86ec03a964b86dc81452f5 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Tue, 8 Mar 2022 20:18:20 +0100 Subject: [PATCH 703/773] ARM: fix build error when BPF_SYSCALL is disabled It was missing a semicolon. Signed-off-by: Emmanuel Gil Peyrot Reviewed-by: Nathan Chancellor Fixes: 25875aa71dfe ("ARM: include unprivileged BPF status in Spectre V2 reporting"). Signed-off-by: Linus Torvalds --- arch/arm/kernel/spectre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/spectre.c b/arch/arm/kernel/spectre.c index e7fea962d632..0dcefc36fb7a 100644 --- a/arch/arm/kernel/spectre.c +++ b/arch/arm/kernel/spectre.c @@ -10,7 +10,7 @@ static bool _unprivileged_ebpf_enabled(void) #ifdef CONFIG_BPF_SYSCALL return !sysctl_unprivileged_bpf_disabled; #else - return false + return false; #endif } From 2cf29e55894886965722e6625f6a03630b4db31d Mon Sep 17 00:00:00 2001 From: Michal Maloszewski Date: Mon, 24 Jan 2022 13:35:43 +0000 Subject: [PATCH 704/773] iavf: Fix handling of vlan strip virtual channel messages Modify netdev->features for vlan stripping based on virtual channel messages received from the PF. Change is needed to synchronize vlan strip status between PF sysfs and iavf ethtool. Fixes: 5951a2b9812d ("iavf: Fix VLAN feature flags after VFR") Signed-off-by: Norbert Ciosek Signed-off-by: Michal Maloszewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 88844d68e150..5263cefe46f5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1834,6 +1834,22 @@ void iavf_request_reset(struct iavf_adapter *adapter) adapter->current_op = VIRTCHNL_OP_UNKNOWN; } +/** + * iavf_netdev_features_vlan_strip_set - update vlan strip status + * @netdev: ptr to netdev being adjusted + * @enable: enable or disable vlan strip + * + * Helper function to change vlan strip status in netdev->features. + */ +static void iavf_netdev_features_vlan_strip_set(struct net_device *netdev, + const bool enable) +{ + if (enable) + netdev->features |= NETIF_F_HW_VLAN_CTAG_RX; + else + netdev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; +} + /** * iavf_virtchnl_completion * @adapter: adapter structure @@ -2057,8 +2073,18 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, } break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + dev_warn(&adapter->pdev->dev, "Changing VLAN Stripping is not allowed when Port VLAN is configured\n"); + /* Vlan stripping could not be enabled by ethtool. + * Disable it in netdev->features. + */ + iavf_netdev_features_vlan_strip_set(netdev, false); + break; case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: dev_warn(&adapter->pdev->dev, "Changing VLAN Stripping is not allowed when Port VLAN is configured\n"); + /* Vlan stripping could not be disabled by ethtool. + * Enable it in netdev->features. + */ + iavf_netdev_features_vlan_strip_set(netdev, true); break; default: dev_err(&adapter->pdev->dev, "PF returned error %d (%s) to our request %d\n", @@ -2312,6 +2338,20 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); } break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + /* PF enabled vlan strip on this VF. + * Update netdev->features if needed to be in sync with ethtool. + */ + if (!v_retval) + iavf_netdev_features_vlan_strip_set(netdev, true); + break; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + /* PF disabled vlan strip on this VF. + * Update netdev->features if needed to be in sync with ethtool. + */ + if (!v_retval) + iavf_netdev_features_vlan_strip_set(netdev, false); + break; default: if (adapter->current_op && (v_opcode != adapter->current_op)) dev_warn(&adapter->pdev->dev, "Expected response %d from PF, received %d\n", From 57d03f5608c34079f6f15031f4e8b1e2ae95dcb0 Mon Sep 17 00:00:00 2001 From: Michal Maloszewski Date: Wed, 2 Feb 2022 12:44:54 +0000 Subject: [PATCH 705/773] iavf: Fix adopting new combined setting In some cases overloaded flag IAVF_FLAG_REINIT_ITR_NEEDED which should indicate that interrupts need to be completely reinitialized during reset leads to RTNL deadlocks using ethtool -C while a reset is in progress. To fix, it was added a new flag IAVF_FLAG_REINIT_MSIX_NEEDED used to trigger MSI-X reinit. New combined setting is fixed adopt after VF reset. This has been implemented by call reinit interrupt scheme during VF reset. Without this fix new combined setting has never been adopted. Fixes: 209f2f9c7181 ("iavf: Add support for VIRTCHNL_VF_OFFLOAD_VLAN_V2 negotiation") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mitch Williams Signed-off-by: Michal Maloszewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 89423947ee65..4babe4705a55 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -288,6 +288,7 @@ struct iavf_adapter { #define IAVF_FLAG_REINIT_ITR_NEEDED BIT(16) #define IAVF_FLAG_QUEUES_DISABLED BIT(17) #define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) +#define IAVF_FLAG_REINIT_MSIX_NEEDED BIT(20) /* duplicates for common code */ #define IAVF_FLAG_DCB_ENABLED 0 /* flags for admin queue service task */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index dcf24264c7ea..8e644e9ed8da 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2120,7 +2120,7 @@ int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter) "Requested %d queues, but PF only gave us %d.\n", num_req_queues, adapter->vsi_res->num_queue_pairs); - adapter->flags |= IAVF_FLAG_REINIT_ITR_NEEDED; + adapter->flags |= IAVF_FLAG_REINIT_MSIX_NEEDED; adapter->num_req_queues = adapter->vsi_res->num_queue_pairs; iavf_schedule_reset(adapter); @@ -2727,7 +2727,8 @@ continue_reset: err); adapter->aq_required = 0; - if (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED) { + if ((adapter->flags & IAVF_FLAG_REINIT_MSIX_NEEDED) || + (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED)) { err = iavf_reinit_interrupt_scheme(adapter); if (err) goto reset_err; @@ -2799,12 +2800,13 @@ continue_reset: if (err) goto reset_err; - if (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED) { + if ((adapter->flags & IAVF_FLAG_REINIT_MSIX_NEEDED) || + (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED)) { err = iavf_request_traffic_irqs(adapter, netdev->name); if (err) goto reset_err; - adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; + adapter->flags &= ~IAVF_FLAG_REINIT_MSIX_NEEDED; } iavf_configure(adapter); @@ -2819,6 +2821,9 @@ continue_reset: iavf_change_state(adapter, __IAVF_DOWN); wake_up(&adapter->down_waitqueue); } + + adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; + mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); From 5710ab79166504013f7c0ae6a57e7d2fd26e5c43 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 16 Feb 2022 16:51:35 -0800 Subject: [PATCH 706/773] i40e: stop disabling VFs due to PF error responses The i40e_vc_send_msg_to_vf_ex (and its wrapper i40e_vc_send_msg_to_vf) function has logic to detect "failure" responses sent to the VF. If a VF is sent more than I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED, then the VF is marked as disabled. In either case, a dev_info message is printed stating that a VF opcode failed. This logic originates from the early implementation of VF support in commit 5c3c48ac6bf5 ("i40e: implement virtual device interface"). That commit did not go far enough. The "logic" for this behavior seems to be that error responses somehow indicate a malicious VF. This is not really true. The PF might be sending an error for any number of reasons such as lacking resources, an unsupported operation, etc. This does not indicate a malicious VF. We already have a separate robust malicious VF detection which relies on hardware logic to detect and prevent a variety of behaviors. There is no justification for this behavior in the original implementation. In fact, a later commit 18b7af57d9c1 ("i40e: Lower some message levels") reduced the opcode failure message from a dev_err to a dev_info. In addition, recent commit 01cbf50877e6 ("i40e: Fix to not show opcode msg on unsuccessful VF MAC change") changed the logic to allow quieting it for expected failures. That commit prevented this logic from kicking in for specific circumstances. This change did not go far enough. The behavior is not documented nor is it part of any requirement for our products. Other operating systems such as the FreeBSD implementation of our driver do not include this logic. It is clear this check does not make sense, and causes problems which led to ugly workarounds. Fix this by just removing the entire logic and the need for the i40e_vc_send_msg_to_vf_ex function. Fixes: 01cbf50877e6 ("i40e: Fix to not show opcode msg on unsuccessful VF MAC change") Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface") Signed-off-by: Jacob Keller Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/i40e/i40e_debugfs.c | 6 +- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 57 +++---------------- .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 5 -- 3 files changed, 9 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 1e57cc8c47d7..9db5001297c7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -742,10 +742,8 @@ static void i40e_dbg_dump_vf(struct i40e_pf *pf, int vf_id) vsi = pf->vsi[vf->lan_vsi_idx]; dev_info(&pf->pdev->dev, "vf %2d: VSI id=%d, seid=%d, qps=%d\n", vf_id, vf->lan_vsi_id, vsi->seid, vf->num_queue_pairs); - dev_info(&pf->pdev->dev, " num MDD=%lld, invalid msg=%lld, valid msg=%lld\n", - vf->num_mdd_events, - vf->num_invalid_msgs, - vf->num_valid_msgs); + dev_info(&pf->pdev->dev, " num MDD=%lld\n", + vf->num_mdd_events); } else { dev_info(&pf->pdev->dev, "invalid VF id %d\n", vf_id); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index dfdb6e786461..2606e8f0f19b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1917,19 +1917,17 @@ sriov_configure_out: /***********************virtual channel routines******************/ /** - * i40e_vc_send_msg_to_vf_ex + * i40e_vc_send_msg_to_vf * @vf: pointer to the VF info * @v_opcode: virtual channel opcode * @v_retval: virtual channel return value * @msg: pointer to the msg buffer * @msglen: msg length - * @is_quiet: true for not printing unsuccessful return values, false otherwise * * send msg to VF **/ -static int i40e_vc_send_msg_to_vf_ex(struct i40e_vf *vf, u32 v_opcode, - u32 v_retval, u8 *msg, u16 msglen, - bool is_quiet) +static int i40e_vc_send_msg_to_vf(struct i40e_vf *vf, u32 v_opcode, + u32 v_retval, u8 *msg, u16 msglen) { struct i40e_pf *pf; struct i40e_hw *hw; @@ -1944,25 +1942,6 @@ static int i40e_vc_send_msg_to_vf_ex(struct i40e_vf *vf, u32 v_opcode, hw = &pf->hw; abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; - /* single place to detect unsuccessful return values */ - if (v_retval && !is_quiet) { - vf->num_invalid_msgs++; - dev_info(&pf->pdev->dev, "VF %d failed opcode %d, retval: %d\n", - vf->vf_id, v_opcode, v_retval); - if (vf->num_invalid_msgs > - I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED) { - dev_err(&pf->pdev->dev, - "Number of invalid messages exceeded for VF %d\n", - vf->vf_id); - dev_err(&pf->pdev->dev, "Use PF Control I/F to enable the VF\n"); - set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); - } - } else { - vf->num_valid_msgs++; - /* reset the invalid counter, if a valid message is received. */ - vf->num_invalid_msgs = 0; - } - aq_ret = i40e_aq_send_msg_to_vf(hw, abs_vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret) { @@ -1975,23 +1954,6 @@ static int i40e_vc_send_msg_to_vf_ex(struct i40e_vf *vf, u32 v_opcode, return 0; } -/** - * i40e_vc_send_msg_to_vf - * @vf: pointer to the VF info - * @v_opcode: virtual channel opcode - * @v_retval: virtual channel return value - * @msg: pointer to the msg buffer - * @msglen: msg length - * - * send msg to VF - **/ -static int i40e_vc_send_msg_to_vf(struct i40e_vf *vf, u32 v_opcode, - u32 v_retval, u8 *msg, u16 msglen) -{ - return i40e_vc_send_msg_to_vf_ex(vf, v_opcode, v_retval, - msg, msglen, false); -} - /** * i40e_vc_send_resp_to_vf * @vf: pointer to the VF info @@ -2822,7 +2784,6 @@ error_param: * i40e_check_vf_permission * @vf: pointer to the VF info * @al: MAC address list from virtchnl - * @is_quiet: set true for printing msg without opcode info, false otherwise * * Check that the given list of MAC addresses is allowed. Will return -EPERM * if any address in the list is not valid. Checks the following conditions: @@ -2837,8 +2798,7 @@ error_param: * addresses might not be accurate. **/ static inline int i40e_check_vf_permission(struct i40e_vf *vf, - struct virtchnl_ether_addr_list *al, - bool *is_quiet) + struct virtchnl_ether_addr_list *al) { struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; @@ -2846,7 +2806,6 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, int mac2add_cnt = 0; int i; - *is_quiet = false; for (i = 0; i < al->num_elements; i++) { struct i40e_mac_filter *f; u8 *addr = al->list[i].addr; @@ -2870,7 +2829,6 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, !ether_addr_equal(addr, vf->default_lan_addr.addr)) { dev_err(&pf->pdev->dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); - *is_quiet = true; return -EPERM; } @@ -2921,7 +2879,6 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) (struct virtchnl_ether_addr_list *)msg; struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = NULL; - bool is_quiet = false; i40e_status ret = 0; int i; @@ -2938,7 +2895,7 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) */ spin_lock_bh(&vsi->mac_filter_hash_lock); - ret = i40e_check_vf_permission(vf, al, &is_quiet); + ret = i40e_check_vf_permission(vf, al); if (ret) { spin_unlock_bh(&vsi->mac_filter_hash_lock); goto error_param; @@ -2976,8 +2933,8 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) error_param: /* send the response to the VF */ - return i40e_vc_send_msg_to_vf_ex(vf, VIRTCHNL_OP_ADD_ETH_ADDR, - ret, NULL, 0, is_quiet); + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_ETH_ADDR, + ret, NULL, 0); } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 03c42fd0fea1..a554d0a0b09b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -10,8 +10,6 @@ #define I40E_VIRTCHNL_SUPPORTED_QTYPES 2 -#define I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED 10 - #define I40E_VLAN_PRIORITY_SHIFT 13 #define I40E_VLAN_MASK 0xFFF #define I40E_PRIORITY_MASK 0xE000 @@ -92,9 +90,6 @@ struct i40e_vf { u8 num_queue_pairs; /* num of qps assigned to VF vsis */ u8 num_req_queues; /* num of requested qps */ u64 num_mdd_events; /* num of mdd events detected */ - /* num of continuous malformed or invalid msgs detected */ - u64 num_invalid_msgs; - u64 num_valid_msgs; /* num of valid msgs detected */ unsigned long vf_caps; /* vf's adv. capabilities */ unsigned long vf_states; /* vf's runtime states */ From 79498d5af8e458102242d1667cf44df1f1564e63 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 16 Feb 2022 16:51:36 -0800 Subject: [PATCH 707/773] ice: stop disabling VFs due to PF error responses The ice_vc_send_msg_to_vf function has logic to detect "failure" responses being sent to a VF. If a VF is sent more than ICE_DFLT_NUM_INVAL_MSGS_ALLOWED then the VF is marked as disabled. Almost identical logic also existed in the i40e driver. This logic was added to the ice driver in commit 1071a8358a28 ("ice: Implement virtchnl commands for AVF support") which itself copied from the i40e implementation in commit 5c3c48ac6bf5 ("i40e: implement virtual device interface"). Neither commit provides a proper explanation or justification of the check. In fact, later commits to i40e changed the logic to allow bypassing the check in some specific instances. The "logic" for this seems to be that error responses somehow indicate a malicious VF. This is not really true. The PF might be sending an error for any number of reasons such as lack of resources, etc. Additionally, this causes the PF to log an info message for every failed VF response which may confuse users, and can spam the kernel log. This behavior is not documented as part of any requirement for our products and other operating system drivers such as the FreeBSD implementation of our drivers do not include this type of check. In fact, the change from dev_err to dev_info in i40e commit 18b7af57d9c1 ("i40e: Lower some message levels") explains that these messages typically don't actually indicate a real issue. It is quite likely that a user who hits this in practice will be very confused as the VF will be disabled without an obvious way to recover. We already have robust malicious driver detection logic using actual hardware detection mechanisms that detect and prevent invalid device usage. Remove the logic since its not a documented requirement and the behavior is not intuitive. Fixes: 1071a8358a28 ("ice: Implement virtchnl commands for AVF support") Signed-off-by: Jacob Keller Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_virtchnl_pf.c | 18 ------------------ .../net/ethernet/intel/ice/ice_virtchnl_pf.h | 3 --- 2 files changed, 21 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 408f78e3eb13..1be3cd4b2bef 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2182,24 +2182,6 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, dev = ice_pf_to_dev(pf); - /* single place to detect unsuccessful return values */ - if (v_retval) { - vf->num_inval_msgs++; - dev_info(dev, "VF %d failed opcode %d, retval: %d\n", vf->vf_id, - v_opcode, v_retval); - if (vf->num_inval_msgs > ICE_DFLT_NUM_INVAL_MSGS_ALLOWED) { - dev_err(dev, "Number of invalid messages exceeded for VF %d\n", - vf->vf_id); - dev_err(dev, "Use PF Control I/F to enable the VF\n"); - set_bit(ICE_VF_STATE_DIS, vf->vf_states); - return -EIO; - } - } else { - vf->num_valid_msgs++; - /* reset the invalid counter, if a valid message is received. */ - vf->num_inval_msgs = 0; - } - aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) { diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 752487a1bdd6..8f27255cc0cc 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -14,7 +14,6 @@ #define ICE_MAX_MACADDR_PER_VF 18 /* Malicious Driver Detection */ -#define ICE_DFLT_NUM_INVAL_MSGS_ALLOWED 10 #define ICE_MDD_EVENTS_THRESHOLD 30 /* Static VF transaction/status register def */ @@ -134,8 +133,6 @@ struct ice_vf { unsigned int max_tx_rate; /* Maximum Tx bandwidth limit in Mbps */ DECLARE_BITMAP(vf_states, ICE_VF_STATES_NBITS); /* VF runtime states */ - u64 num_inval_msgs; /* number of continuous invalid msgs */ - u64 num_valid_msgs; /* number of valid msgs detected */ unsigned long vf_caps; /* VF's adv. capabilities */ u8 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; From 97b0129146b1544bbb0773585327896da3bb4e0a Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Fri, 18 Feb 2022 12:39:25 -0800 Subject: [PATCH 708/773] ice: Fix error with handling of bonding MTU When a bonded interface is destroyed, .ndo_change_mtu can be called during the tear-down process while the RTNL lock is held. This is a problem since the auxiliary driver linked to the LAN driver needs to be notified of the MTU change, and this requires grabbing a device_lock on the auxiliary_device's dev. Currently this is being attempted in the same execution context as the call to .ndo_change_mtu which is causing a dead-lock. Move the notification of the changed MTU to a separate execution context (watchdog service task) and eliminate the "before" notification. Fixes: 348048e724a0e ("ice: Implement iidc operations") Signed-off-by: Dave Ertman Tested-by: Jonathan Toppins Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_main.c | 29 +++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 473b1f6be9de..3121f9b04f59 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -483,6 +483,7 @@ enum ice_pf_flags { ICE_FLAG_MDD_AUTO_RESET_VF, ICE_FLAG_LINK_LENIENT_MODE_ENA, ICE_FLAG_PLUG_AUX_DEV, + ICE_FLAG_MTU_CHANGED, ICE_PF_FLAGS_NBITS /* must be last */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f3c346e13b7a..6fc6514eb007 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2258,6 +2258,17 @@ static void ice_service_task(struct work_struct *work) if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) ice_plug_aux_dev(pf); + if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { + struct iidc_event *event; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (event) { + set_bit(IIDC_EVENT_AFTER_MTU_CHANGE, event->type); + ice_send_event_to_aux(pf, event); + kfree(event); + } + } + ice_clean_adminq_subtask(pf); ice_check_media_subtask(pf); ice_check_for_hang_subtask(pf); @@ -6822,7 +6833,6 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; - struct iidc_event *event; u8 count = 0; int err = 0; @@ -6857,14 +6867,6 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) return -EBUSY; } - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - set_bit(IIDC_EVENT_BEFORE_MTU_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - clear_bit(IIDC_EVENT_BEFORE_MTU_CHANGE, event->type); - netdev->mtu = (unsigned int)new_mtu; /* if VSI is up, bring it down and then back up */ @@ -6872,21 +6874,18 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu) err = ice_down(vsi); if (err) { netdev_err(netdev, "change MTU if_down err %d\n", err); - goto event_after; + return err; } err = ice_up(vsi); if (err) { netdev_err(netdev, "change MTU if_up err %d\n", err); - goto event_after; + return err; } } netdev_dbg(netdev, "changed MTU to %d\n", new_mtu); -event_after: - set_bit(IIDC_EVENT_AFTER_MTU_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - kfree(event); + set_bit(ICE_FLAG_MTU_CHANGED, pf->flags); return err; } From 3d97f1afd8d831e0c0dc1157418f94b8faa97b54 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 16 Jan 2022 19:46:20 +0100 Subject: [PATCH 709/773] ice: Don't use GFP_KERNEL in atomic context ice_misc_intr() is an irq handler. It should not sleep. Use GFP_ATOMIC instead of GFP_KERNEL when allocating some memory. Fixes: 348048e724a0 ("ice: Implement iidc operations") Signed-off-by: Christophe JAILLET Tested-by: Leszek Kaliszczuk Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6fc6514eb007..83e3e8aae6cf 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3034,7 +3034,7 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data) struct iidc_event *event; ena_mask &= ~ICE_AUX_CRIT_ERR; - event = kzalloc(sizeof(*event), GFP_KERNEL); + event = kzalloc(sizeof(*event), GFP_ATOMIC); if (event) { set_bit(IIDC_EVENT_CRIT_ERR, event->type); /* report the entire OICR value to AUX driver */ From ad35ffa252af67d4cc7c744b9377a2b577748e3f Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Tue, 22 Feb 2022 11:43:04 +0000 Subject: [PATCH 710/773] ice: Fix curr_link_speed advertised speed Change curr_link_speed advertised speed, due to link_info.link_speed is not equal phy.curr_user_speed_req. Without this patch it is impossible to set advertised speed to same as link_speed. Testing Hints: Try to set advertised speed to 25G only with 25G default link (use ethtool -s 0x80000000) Fixes: 48cb27f2fd18 ("ice: Implement handlers for ethtool PHY/link operations") Signed-off-by: Grzegorz Siwik Signed-off-by: Jedrzej Jagielski Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index e2e3ef7fba7f..a5dc9824e255 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -2298,7 +2298,7 @@ ice_set_link_ksettings(struct net_device *netdev, if (err) goto done; - curr_link_speed = pi->phy.link_info.link_speed; + curr_link_speed = pi->phy.curr_user_speed_req; adv_link_speed = ice_ksettings_find_adv_link_speed(ks); /* If speed didn't get set, set it to what it currently is. From 0a5aa8d161d19a1b12fd25b434b32f7c885c73bb Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 8 Mar 2022 17:09:15 +0900 Subject: [PATCH 711/773] block: fix blk_mq_attempt_bio_merge and rq_qos_throttle protection Commit 9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter") moved blk_mq_attempt_bio_merge and rq_qos_throttle calls out of q_usage_counter protection. However, these functions require q_usage_counter protection. The blk_mq_attempt_bio_merge call without the protection resulted in blktests block/005 failure with KASAN null- ptr-deref or use-after-free at bio merge. The rq_qos_throttle call without the protection caused kernel hang at qos throttle. To fix the failures, move the blk_mq_attempt_bio_merge and rq_qos_throttle calls back to q_usage_counter protection. Fixes: 9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20220308080915.3473689-1-shinichiro.kawasaki@wdc.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d69ca91fbc8b..9a9185a0a2d1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2718,7 +2718,8 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, @@ -2730,6 +2731,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (unlikely(bio_queue_enter(bio))) return NULL; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; + + rq_qos_throttle(q, bio); + if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2742,12 +2748,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); +queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio *bio) + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; @@ -2757,12 +2764,19 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + } + + rq_qos_throttle(q, *bio); + + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = bio->bi_opf; + rq->cmd_flags = (*bio)->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -2800,14 +2814,11 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) - return; - - rq_qos_throttle(q, bio); - - rq = blk_mq_get_cached_request(q, plug, bio); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { - rq = blk_mq_get_new_requests(q, plug, bio); + if (!bio) + return; + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; } From b19ab4b38b06aae12442b2de95ccf58b5dc53584 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 8 Mar 2022 02:47:49 +0000 Subject: [PATCH 712/773] ethernet: Fix error handling in xemaclite_of_probe This node pointer is returned by of_parse_phandle() with refcount incremented in this function. Calling of_node_put() to avoid the refcount leak. As the remove function do. Fixes: 5cdaaa12866e ("net: emaclite: adding MDIO and phy lib support") Signed-off-by: Miaoqian Lin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220308024751.2320-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 519599480b15..77fa2cb03aca 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1183,7 +1183,7 @@ static int xemaclite_of_probe(struct platform_device *ofdev) if (rc) { dev_err(dev, "Cannot register network device, aborting\n"); - goto error; + goto put_node; } dev_info(dev, @@ -1191,6 +1191,8 @@ static int xemaclite_of_probe(struct platform_device *ofdev) (unsigned long __force)ndev->mem_start, lp->base_addr, ndev->irq); return 0; +put_node: + of_node_put(lp->phy_node); error: free_netdev(ndev); return rc; From c79fcc27be90b308b3fa90811aefafdd4078668c Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 8 Mar 2022 02:11:59 +0000 Subject: [PATCH 713/773] tipc: fix incorrect order of state message data sanity check When receiving a state message, function tipc_link_validate_msg() is called to validate its header portion. Then, its data portion is validated before it can be accessed correctly. However, current data sanity check is done after the message header is accessed to update some link variables. This commit fixes this issue by moving the data sanity check to the beginning of state message handling and right after the header sanity check. Fixes: 9aa422ad3266 ("tipc: improve size validations for received domain records") Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Link: https://lore.kernel.org/r/20220308021200.9245-1-tung.q.nguyen@dektech.com.au Signed-off-by: Jakub Kicinski --- net/tipc/link.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 1e14d7f8f28f..e260c0d557f5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2286,6 +2286,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; case STATE_MSG: + /* Validate Gap ACK blocks, drop if invalid */ + glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); + if (glen > dlen) + break; + l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ @@ -2311,10 +2316,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; } - /* Receive Gap ACK blocks from peer if any */ - glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); - if(glen > dlen) - break; tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); From 7228918b34615ef6317edcd9a058a057bc54aa32 Mon Sep 17 00:00:00 2001 From: Ross Philipson Date: Wed, 23 Feb 2022 21:07:35 -0500 Subject: [PATCH 714/773] x86/boot: Fix memremap of setup_indirect structures As documented, the setup_indirect structure is nested inside the setup_data structures in the setup_data list. The code currently accesses the fields inside the setup_indirect structure but only the sizeof(struct setup_data) is being memremapped. No crash occurred but this is just due to how the area is remapped under the covers. Properly memremap both the setup_data and setup_indirect structures in these cases before accessing them. Fixes: b3c72fc9a78e ("x86/boot: Introduce setup_indirect") Signed-off-by: Ross Philipson Signed-off-by: Borislav Petkov Reviewed-by: Daniel Kiper Cc: Link: https://lore.kernel.org/r/1645668456-22036-2-git-send-email-ross.philipson@oracle.com --- arch/x86/kernel/e820.c | 41 ++++++++++++++------ arch/x86/kernel/kdebugfs.c | 35 +++++++++++++---- arch/x86/kernel/ksysfs.c | 77 ++++++++++++++++++++++++++++++-------- arch/x86/kernel/setup.c | 34 +++++++++++++---- arch/x86/mm/ioremap.c | 24 ++++++++++-- 5 files changed, 165 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index bc0657f0deed..f267205f2d5a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -995,8 +995,10 @@ early_param("memmap", parse_memmap_opt); */ void __init e820__reserve_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; if (!pa_data) @@ -1004,6 +1006,14 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("e820: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); /* @@ -1015,18 +1025,27 @@ void __init e820__reserve_setup_data(void) sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - e820__range_update(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("e820: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + e820__range_update(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + pa_data = pa_next; + early_memunmap(data, len); } e820__update_table(e820_table); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 64b6da95af98..e2e89bebcbc3 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -88,11 +88,13 @@ create_setup_data_node(struct dentry *parent, int no, static int __init create_setup_data_nodes(struct dentry *parent) { + struct setup_indirect *indirect; struct setup_data_node *node; struct setup_data *data; - int error; + u64 pa_data, pa_next; struct dentry *d; - u64 pa_data; + int error; + u32 len; int no = 0; d = debugfs_create_dir("setup_data", parent); @@ -112,12 +114,29 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = -ENOMEM; goto err_dir; } + pa_next = data->next; - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - node->paddr = ((struct setup_indirect *)data->data)->addr; - node->type = ((struct setup_indirect *)data->data)->type; - node->len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + node->paddr = indirect->addr; + node->type = indirect->type; + node->len = indirect->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } } else { node->paddr = pa_data; node->type = data->type; @@ -125,7 +144,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) } create_setup_data_node(d, no, node); - pa_data = data->next; + pa_data = pa_next; memunmap(data); no++; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index d0a19121c6a4..257892fcefa7 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -91,26 +91,41 @@ static int get_setup_data_paddr(int nr, u64 *paddr) static int __init get_setup_data_size(int nr, size_t *size) { - int i = 0; + u64 pa_data = boot_params.hdr.setup_data, pa_next; + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data = boot_params.hdr.setup_data; + int i = 0; + u32 len; while (pa_data) { data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; + pa_next = data->next; + if (nr == i) { - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - *size = ((struct setup_indirect *)data->data)->len; - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + *size = indirect->len; + else + *size = data->len; + } else { *size = data->len; + } memunmap(data); return 0; } - pa_data = data->next; + pa_data = pa_next; memunmap(data); i++; } @@ -120,9 +135,11 @@ static int __init get_setup_data_size(int nr, size_t *size) static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret; u64 paddr; - struct setup_data *data; + u32 len; ret = kobj_to_setup_data_nr(kobj, &nr); if (ret) @@ -135,10 +152,20 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT) - ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + ret = sprintf(buf, "0x%x\n", indirect->type); + } else { ret = sprintf(buf, "0x%x\n", data->type); + } + memunmap(data); return ret; } @@ -149,9 +176,10 @@ static ssize_t setup_data_data_read(struct file *fp, char *buf, loff_t off, size_t count) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret = 0; u64 paddr, len; - struct setup_data *data; void *p; ret = kobj_to_setup_data_nr(kobj, &nr); @@ -165,10 +193,27 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - paddr = ((struct setup_indirect *)data->data)->addr; - len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } else { + /* + * Even though this is technically undefined, return + * the data as though it is a normal setup_data struct. + * This will at least allow it to be inspected. + */ + paddr += sizeof(*data); + len = data->len; + } } else { paddr += sizeof(*data); len = data->len; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7a132eb794d..90d7e1788c91 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -369,21 +369,41 @@ static void __init parse_setup_data(void) static void __init memblock_x86_reserve_range_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + memblock_reserve(pa_data, sizeof(*data) + data->len); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - memblock_reserve(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + memblock_reserve(indirect->addr, indirect->len); + } + + pa_data = pa_next; + early_memunmap(data, len); } } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 026031b3b782..ab666c4cd357 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -615,6 +615,7 @@ static bool memremap_is_efi_data(resource_size_t phys_addr, static bool memremap_is_setup_data(resource_size_t phys_addr, unsigned long size) { + struct setup_indirect *indirect; struct setup_data *data; u64 paddr, paddr_next; @@ -627,6 +628,10 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, data = memremap(paddr, sizeof(*data), MEMREMAP_WB | MEMREMAP_DEC); + if (!data) { + pr_warn("failed to memremap setup_data entry\n"); + return false; + } paddr_next = data->next; len = data->len; @@ -636,10 +641,21 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, return true; } - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - paddr = ((struct setup_indirect *)data->data)->addr; - len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + memunmap(data); + data = memremap(paddr, sizeof(*data) + len, + MEMREMAP_WB | MEMREMAP_DEC); + if (!data) { + pr_warn("failed to memremap indirect setup_data\n"); + return false; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } } memunmap(data); From 445c1470b6ef96440e7cfc42dfc160f5004fd149 Mon Sep 17 00:00:00 2001 From: Ross Philipson Date: Wed, 23 Feb 2022 21:07:36 -0500 Subject: [PATCH 715/773] x86/boot: Add setup_indirect support in early_memremap_is_setup_data() The x86 boot documentation describes the setup_indirect structures and how they are used. Only one of the two functions in ioremap.c that needed to be modified to be aware of the introduction of setup_indirect functionality was updated. Adds comparable support to the other function where it was missing. Fixes: b3c72fc9a78e ("x86/boot: Introduce setup_indirect") Signed-off-by: Ross Philipson Signed-off-by: Borislav Petkov Reviewed-by: Daniel Kiper Cc: Link: https://lore.kernel.org/r/1645668456-22036-3-git-send-email-ross.philipson@oracle.com --- arch/x86/mm/ioremap.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index ab666c4cd357..17a492c27306 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -676,22 +676,51 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, unsigned long size) { + struct setup_indirect *indirect; struct setup_data *data; u64 paddr, paddr_next; paddr = boot_params.hdr.setup_data; while (paddr) { - unsigned int len; + unsigned int len, size; if (phys_addr == paddr) return true; data = early_memremap_decrypted(paddr, sizeof(*data)); + if (!data) { + pr_warn("failed to early memremap setup_data entry\n"); + return false; + } + + size = sizeof(*data); paddr_next = data->next; len = data->len; - early_memunmap(data, sizeof(*data)); + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + early_memunmap(data, sizeof(*data)); + return true; + } + + if (data->type == SETUP_INDIRECT) { + size += len; + early_memunmap(data, sizeof(*data)); + data = early_memremap_decrypted(paddr, size); + if (!data) { + pr_warn("failed to early memremap indirect setup_data\n"); + return false; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } + } + + early_memunmap(data, size); if ((phys_addr > paddr) && (phys_addr < (paddr + len))) return true; From 6babfc6e6fab068018c36e8f6605184b8c0b349d Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 8 Mar 2022 14:40:07 +0800 Subject: [PATCH 716/773] net: ethernet: ti: cpts: Handle error for clk_enable As the potential failure of the clk_enable(), it should be better to check it and return error if fails. Fixes: 8a2c9a5ab4b9 ("net: ethernet: ti: cpts: rework initialization/deinitialization") Signed-off-by: Jiasheng Jiang Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index dc70a6bfaa6a..92ca739fac01 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -568,7 +568,9 @@ int cpts_register(struct cpts *cpts) for (i = 0; i < CPTS_MAX_EVENTS; i++) list_add(&cpts->pool_data[i].list, &cpts->pool); - clk_enable(cpts->refclk); + err = clk_enable(cpts->refclk); + if (err) + return err; cpts_write32(cpts, CPTS_EN, control); cpts_write32(cpts, TS_PEND_EN, int_enable); From 2a760554dcba450d3ad61b32375b50ed6d59a87c Mon Sep 17 00:00:00 2001 From: "Minghao Chi (CGEL ZTE)" Date: Tue, 8 Mar 2022 06:43:09 +0000 Subject: [PATCH 717/773] net:mcf8390: Use platform_get_irq() to get the interrupt It is not recommened to use platform_get_resource(pdev, IORESOURCE_IRQ) for requesting IRQ's resources any more, as they can be not ready yet in case of DT-booting. platform_get_irq() instead is a recommended way for getting IRQ even if it was not retrieved earlier. It also makes code simpler because we're getting "int" value right away and no conversion from resource to int is required. Reported-by: Zeal Robot Signed-off-by: Minghao Chi (CGEL ZTE) Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/mcf8390.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/8390/mcf8390.c b/drivers/net/ethernet/8390/mcf8390.c index e320cccba61a..90cd7bdf06f5 100644 --- a/drivers/net/ethernet/8390/mcf8390.c +++ b/drivers/net/ethernet/8390/mcf8390.c @@ -405,12 +405,12 @@ static int mcf8390_init(struct net_device *dev) static int mcf8390_probe(struct platform_device *pdev) { struct net_device *dev; - struct resource *mem, *irq; + struct resource *mem; resource_size_t msize; - int ret; + int ret, irq; - irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (irq == NULL) { + irq = platform_get_irq(pdev, 0); + if (irq < 0) { dev_err(&pdev->dev, "no IRQ specified?\n"); return -ENXIO; } @@ -433,7 +433,7 @@ static int mcf8390_probe(struct platform_device *pdev) SET_NETDEV_DEV(dev, &pdev->dev); platform_set_drvdata(pdev, dev); - dev->irq = irq->start; + dev->irq = irq; dev->base_addr = mem->start; ret = mcf8390_init(dev); From 2169b79258c8be803d2595d6456b1e77129fe154 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Tue, 8 Mar 2022 14:57:39 +0800 Subject: [PATCH 718/773] net: ethernet: lpc_eth: Handle error for clk_enable As the potential failure of the clk_enable(), it should be better to check it and return error if fails. Fixes: b7370112f519 ("lpc32xx: Added ethernet driver") Signed-off-by: Jiasheng Jiang Signed-off-by: David S. Miller --- drivers/net/ethernet/nxp/lpc_eth.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index bc39558fe82b..756f97dce85b 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1471,6 +1471,7 @@ static int lpc_eth_drv_resume(struct platform_device *pdev) { struct net_device *ndev = platform_get_drvdata(pdev); struct netdata_local *pldat; + int ret; if (device_may_wakeup(&pdev->dev)) disable_irq_wake(ndev->irq); @@ -1480,7 +1481,9 @@ static int lpc_eth_drv_resume(struct platform_device *pdev) pldat = netdev_priv(ndev); /* Enable interface clock */ - clk_enable(pldat->clk); + ret = clk_enable(pldat->clk); + if (ret) + return ret; /* Reset and initialize */ __lpc_eth_reset(pldat); From c9ffa3e2bc451816ce0295e40063514fabf2bd36 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 8 Mar 2022 07:42:47 +0000 Subject: [PATCH 719/773] net: marvell: prestera: Add missing of_node_put() in prestera_switch_set_base_mac_addr This node pointer is returned by of_find_compatible_node() with refcount incremented. Calling of_node_put() to aovid the refcount leak. Fixes: 501ef3066c89 ("net: marvell: prestera: Add driver for Prestera family ASIC devices") Signed-off-by: Miaoqian Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index cad93f747d0c..73cd0a4b7291 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -554,6 +554,7 @@ static int prestera_switch_set_base_mac_addr(struct prestera_switch *sw) dev_info(prestera_dev(sw), "using random base mac address\n"); } of_node_put(base_mac_np); + of_node_put(np); return prestera_hw_switch_mac_set(sw, sw->base_mac); } From 71171ac8eb34ce7fe6b3267dce27c313ab3cb3ac Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 8 Mar 2022 16:12:23 +0800 Subject: [PATCH 720/773] ax25: Fix NULL pointer dereference in ax25_kill_by_device When two ax25 devices attempted to establish connection, the requester use ax25_create(), ax25_bind() and ax25_connect() to initiate connection. The receiver use ax25_rcv() to accept connection and use ax25_create_cb() in ax25_rcv() to create ax25_cb, but the ax25_cb->sk is NULL. When the receiver is detaching, a NULL pointer dereference bug caused by sock_hold(sk) in ax25_kill_by_device() will happen. The corresponding fail log is shown below: =============================================================== BUG: KASAN: null-ptr-deref in ax25_device_event+0xfd/0x290 Call Trace: ... ax25_device_event+0xfd/0x290 raw_notifier_call_chain+0x5e/0x70 dev_close_many+0x174/0x220 unregister_netdevice_many+0x1f7/0xa60 unregister_netdevice_queue+0x12f/0x170 unregister_netdev+0x13/0x20 mkiss_close+0xcd/0x140 tty_ldisc_release+0xc0/0x220 tty_release_struct+0x17/0xa0 tty_release+0x62d/0x670 ... This patch add condition check in ax25_kill_by_device(). If s->sk is NULL, it will goto if branch to kill device. Fixes: 4e0f718daf97 ("ax25: improve the incomplete fix to avoid UAF and NPD bugs") Reported-by: Thomas Osterried Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d53cbb4e2503..6bd097180772 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -87,6 +87,13 @@ again: ax25_for_each(s, &ax25_list) { if (s->ax25_dev == ax25_dev) { sk = s->sk; + if (!sk) { + spin_unlock_bh(&ax25_list_lock); + s->ax25_dev = NULL; + ax25_disconnect(s, ENETUNREACH); + spin_lock_bh(&ax25_list_lock); + goto again; + } sock_hold(sk); spin_unlock_bh(&ax25_list_lock); lock_sock(sk); From b859ebedd1e730bbda69142fca87af4e712649a1 Mon Sep 17 00:00:00 2001 From: Paul Semel Date: Tue, 8 Mar 2022 10:30:58 +0100 Subject: [PATCH 721/773] arm64: kasan: fix include error in MTE functions Fix `error: expected string literal in 'asm'`. This happens when compiling an ebpf object file that includes `net/net_namespace.h` from linux kernel headers. Include trace: include/net/net_namespace.h:10 include/linux/workqueue.h:9 include/linux/timer.h:8 include/linux/debugobjects.h:6 include/linux/spinlock.h:90 include/linux/workqueue.h:9 arch/arm64/include/asm/spinlock.h:9 arch/arm64/include/generated/asm/qrwlock.h:1 include/asm-generic/qrwlock.h:14 arch/arm64/include/asm/processor.h:33 arch/arm64/include/asm/kasan.h:9 arch/arm64/include/asm/mte-kasan.h:45 arch/arm64/include/asm/mte-def.h:14 Signed-off-by: Paul Semel Fixes: 2cb34276427a ("arm64: kasan: simplify and inline MTE functions") Cc: # 5.12.x Link: https://lore.kernel.org/r/bacb5387-2992-97e4-0c48-1ed925905bee@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mte-kasan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index e4704a403237..a857bcacf0fe 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -5,6 +5,7 @@ #ifndef __ASM_MTE_KASAN_H #define __ASM_MTE_KASAN_H +#include #include #ifndef __ASSEMBLY__ From f0cfe17bcc1dd2f0872966b554a148e888833ee9 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 9 Mar 2022 14:13:02 +0100 Subject: [PATCH 722/773] tracing/osnoise: Do not unregister events twice Nicolas reported that using: # trace-cmd record -e all -M 10 -p osnoise --poll Resulted in the following kernel warning: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1217 at kernel/tracepoint.c:404 tracepoint_probe_unregister+0x280/0x370 [...] CPU: 0 PID: 1217 Comm: trace-cmd Not tainted 5.17.0-rc6-next-20220307-nico+ #19 RIP: 0010:tracepoint_probe_unregister+0x280/0x370 [...] CR2: 00007ff919b29497 CR3: 0000000109da4005 CR4: 0000000000170ef0 Call Trace: osnoise_workload_stop+0x36/0x90 tracing_set_tracer+0x108/0x260 tracing_set_trace_write+0x94/0xd0 ? __check_object_size.part.0+0x10a/0x150 ? selinux_file_permission+0x104/0x150 vfs_write+0xb5/0x290 ksys_write+0x5f/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7ff919a18127 [...] ---[ end trace 0000000000000000 ]--- The warning complains about an attempt to unregister an unregistered tracepoint. This happens on trace-cmd because it first stops tracing, and then switches the tracer to nop. Which is equivalent to: # cd /sys/kernel/tracing/ # echo osnoise > current_tracer # echo 0 > tracing_on # echo nop > current_tracer The osnoise tracer stops the workload when no trace instance is actually collecting data. This can be caused both by disabling tracing or disabling the tracer itself. To avoid unregistering events twice, use the existing trace_osnoise_callback_enabled variable to check if the events (and the workload) are actually active before trying to deactivate them. Link: https://lore.kernel.org/all/c898d1911f7f9303b7e14726e7cc9678fbfb4a0e.camel@redhat.com/ Link: https://lkml.kernel.org/r/938765e17d5a781c2df429a98f0b2e7cc317b022.1646823913.git.bristot@kernel.org Cc: stable@vger.kernel.org Cc: Marcelo Tosatti Fixes: 2fac8d6486d5 ("tracing/osnoise: Allow multiple instances of the same tracer") Reported-by: Nicolas Saenz Julienne Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index cfddb30e65ab..2aa3efdca755 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2200,6 +2200,17 @@ static void osnoise_workload_stop(void) if (osnoise_has_registered_instances()) return; + /* + * If callbacks were already disabled in a previous stop + * call, there is no need to disable then again. + * + * For instance, this happens when tracing is stopped via: + * echo 0 > tracing_on + * echo nop > current_tracer. + */ + if (!trace_osnoise_callback_enabled) + return; + trace_osnoise_callback_enabled = false; /* * Make sure that ftrace_nmi_enter/exit() see From caf4c86bf136845982c5103b2661751b40c474c0 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 7 Mar 2022 19:07:40 +0100 Subject: [PATCH 723/773] tracing/osnoise: Force quiescent states while tracing At the moment running osnoise on a nohz_full CPU or uncontested FIFO priority and a PREEMPT_RCU kernel might have the side effect of extending grace periods too much. This will entice RCU to force a context switch on the wayward CPU to end the grace period, all while introducing unwarranted noise into the tracer. This behaviour is unavoidable as overly extending grace periods might exhaust the system's memory. This same exact problem is what extended quiescent states (EQS) were created for, conversely, rcu_momentary_dyntick_idle() emulates them by performing a zero duration EQS. So let's make use of it. In the common case rcu_momentary_dyntick_idle() is fairly inexpensive: atomically incrementing a local per-CPU counter and doing a store. So it shouldn't affect osnoise's measurements (which has a 1us granularity), so we'll call it unanimously. The uncommon case involve calling rcu_momentary_dyntick_idle() after having the osnoise process: - Receive an expedited quiescent state IPI with preemption disabled or during an RCU critical section. (activates rdp->cpu_no_qs.b.exp code-path). - Being preempted within in an RCU critical section and having the subsequent outermost rcu_read_unlock() called with interrupts disabled. (t->rcu_read_unlock_special.b.blocked code-path). Neither of those are possible at the moment, and are unlikely to be in the future given the osnoise's loop design. On top of this, the noise generated by the situations described above is unavoidable, and if not exposed by rcu_momentary_dyntick_idle() will be eventually seen in subsequent rcu_read_unlock() calls or schedule operations. Link: https://lkml.kernel.org/r/20220307180740.577607-1-nsaenzju@redhat.com Cc: stable@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Signed-off-by: Nicolas Saenz Julienne Acked-by: Paul E. McKenney Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 2aa3efdca755..5e3c62a08fc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1386,6 +1386,26 @@ static int run_osnoise(void) osnoise_stop_tracing(); } + /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU has no way to account for QSs. + * This will eventually cause unwarranted noise as PREEMPT_RCU + * will force preemption as the means of ending the current + * grace period. We avoid this problem by calling + * rcu_momentary_dyntick_idle(), which performs a zero duration + * EQS allowing PREEMPT_RCU to end the current grace period. + * This call shouldn't be wrapped inside an RCU critical + * section. + * + * Note that in non PREEMPT_RCU kernels QSs are handled through + * cond_resched() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + local_irq_disable(); + rcu_momentary_dyntick_idle(); + local_irq_enable(); + } + /* * For the non-preemptive kernel config: let threads runs, if * they so wish. From 78cbc6513217b00be6a9904415ef7ff3619eb035 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 7 Mar 2022 08:43:03 +0800 Subject: [PATCH 724/773] ftrace: Fix some W=1 warnings in kernel doc comments Clean up the following clang-w1 warning: kernel/trace/ftrace.c:7827: warning: Function parameter or member 'ops' not described in 'unregister_ftrace_function'. kernel/trace/ftrace.c:7805: warning: Function parameter or member 'ops' not described in 'register_ftrace_function'. Link: https://lkml.kernel.org/r/20220307004303.26399-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a4b462b6f944..6105b7036482 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7790,7 +7790,7 @@ int ftrace_is_dead(void) /** * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. + * @ops: ops structure that holds the function for profiling. * * Register a function to be called by all functions in the * kernel. @@ -7817,7 +7817,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. - * @ops - ops structure that holds the function to unregister + * @ops: ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. */ From ac77998b7ac3044f0509b097da9637184598980d Mon Sep 17 00:00:00 2001 From: Mohammad Kabat Date: Thu, 25 Mar 2021 14:38:55 +0200 Subject: [PATCH 725/773] net/mlx5: Fix size field in bufferx_reg struct According to HW spec the field "size" should be 16 bits in bufferx register. Fixes: e281682bf294 ("net/mlx5_core: HW data structs/types definitions cleanup") Signed-off-by: Mohammad Kabat Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..5743f5b3414b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9900,8 +9900,8 @@ struct mlx5_ifc_bufferx_reg_bits { u8 reserved_at_0[0x6]; u8 lossy[0x1]; u8 epsb[0x1]; - u8 reserved_at_8[0xc]; - u8 size[0xc]; + u8 reserved_at_8[0x8]; + u8 size[0x10]; u8 xoff_threshold[0x10]; u8 xon_threshold[0x10]; From 063bd355595428750803d8736a9bb7c8db67d42d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 4 Feb 2022 11:47:44 +0200 Subject: [PATCH 726/773] net/mlx5: Fix a race on command flush flow Fix a refcount use after free warning due to a race on command entry. Such race occurs when one of the commands releases its last refcount and frees its index and entry while another process running command flush flow takes refcount to this command entry. The process which handles commands flush may see this command as needed to be flushed if the other process released its refcount but didn't release the index yet. Fix it by adding the needed spin lock. It fixes the following warning trace: refcount_t: addition on 0; use-after-free. WARNING: CPU: 11 PID: 540311 at lib/refcount.c:25 refcount_warn_saturate+0x80/0xe0 ... RIP: 0010:refcount_warn_saturate+0x80/0xe0 ... Call Trace: mlx5_cmd_trigger_completions+0x293/0x340 [mlx5_core] mlx5_cmd_flush+0x3a/0xf0 [mlx5_core] enter_error_state+0x44/0x80 [mlx5_core] mlx5_fw_fatal_reporter_err_work+0x37/0xe0 [mlx5_core] process_one_work+0x1be/0x390 worker_thread+0x4d/0x3d0 ? rescuer_thread+0x350/0x350 kthread+0x141/0x160 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Fixes: 50b2412b7e78 ("net/mlx5: Avoid possible free of command entry while timeout comp handler") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 17fe05809653..3eacd8739929 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -131,11 +131,8 @@ static int cmd_alloc_index(struct mlx5_cmd *cmd) static void cmd_free_index(struct mlx5_cmd *cmd, int idx) { - unsigned long flags; - - spin_lock_irqsave(&cmd->alloc_lock, flags); + lockdep_assert_held(&cmd->alloc_lock); set_bit(idx, &cmd->bitmask); - spin_unlock_irqrestore(&cmd->alloc_lock, flags); } static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) @@ -145,17 +142,21 @@ static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) { + struct mlx5_cmd *cmd = ent->cmd; + unsigned long flags; + + spin_lock_irqsave(&cmd->alloc_lock, flags); if (!refcount_dec_and_test(&ent->refcnt)) - return; + goto out; if (ent->idx >= 0) { - struct mlx5_cmd *cmd = ent->cmd; - cmd_free_index(cmd, ent->idx); up(ent->page_queue ? &cmd->pages_sem : &cmd->sem); } cmd_free_ent(ent); +out: + spin_unlock_irqrestore(&cmd->alloc_lock, flags); } static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) From 39bab83b119faac4bf7f07173a42ed35be95147e Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Mon, 17 Jan 2022 15:32:16 +0200 Subject: [PATCH 727/773] net/mlx5: Fix offloading with ESWITCH_IPV4_TTL_MODIFY_ENABLE Only prio 1 is supported for nic mode when there is no ignore flow level support in firmware. But for switchdev mode, which supports fixed number of statically pre-allocated prios, this restriction is not relevant so it can be relaxed. Fixes: d671e109bd85 ("net/mlx5: Fix tc max supported prio for nic mode") Signed-off-by: Dima Chumak Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index 1e8ec4f236b2..df58cba37930 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -121,9 +121,6 @@ u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains) u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains) { - if (!mlx5_chains_prios_supported(chains)) - return 1; - if (mlx5_chains_ignore_flow_level_supported(chains)) return UINT_MAX; From ad11c4f1d8fd1f03639460e425a36f7fd0ea83f5 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 16 Feb 2022 13:56:57 +0200 Subject: [PATCH 728/773] net/mlx5e: Lag, Only handle events from highest priority multipath entry There could be multiple multipath entries but changing the port affinity for each one doesn't make much sense and there should be a default one. So only track the entry with lowest priority value. The commit doesn't affect existing users with a single entry. Fixes: 544fe7c2e654 ("net/mlx5e: Activate HW multipath and handle port affinity based on FIB events") Signed-off-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index 1ca01a5b6cdd..626aa60b6099 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -126,6 +126,10 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, return; } + /* Handle multipath entry with lower priority value */ + if (mp->mfi && mp->mfi != fi && fi->fib_priority >= mp->mfi->fib_priority) + return; + /* Handle add/replace event */ nhs = fib_info_num_path(fi); if (nhs == 1) { @@ -135,12 +139,13 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev); if (i < 0) - i = MLX5_LAG_NORMAL_AFFINITY; - else - ++i; + return; + i++; mlx5_lag_set_port_affinity(ldev, i); } + + mp->mfi = fi; return; } From 99a2b9be077ae3a5d97fbf5f7782e0f2e9812978 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 2 Mar 2022 17:07:08 +0200 Subject: [PATCH 729/773] net/mlx5e: SHAMPO, reduce TIR indication SHAMPO is an RQ / WQ feature, an indication was added to the TIR in the first place to enforce suitability between connected TIR and RQ, this enforcement does not exist in current the Firmware implementation and was redundant in the first place. Fixes: 83439f3c37aa ("net/mlx5e: Add HW-GRO offload") Signed-off-by: Ben Ben-Ishay Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tir.c | 3 --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +-- include/linux/mlx5/mlx5_ifc.h | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c index da169b816665..d4239e3b3c88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c @@ -88,9 +88,6 @@ void mlx5e_tir_builder_build_packet_merge(struct mlx5e_tir_builder *builder, (MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ - rough_max_l2_l3_hdr_sz) >> 8); MLX5_SET(tirc, tirc, lro_timeout_period_usecs, pkt_merge_param->timeout); break; - case MLX5E_PACKET_MERGE_SHAMPO: - MLX5_SET(tirc, tirc, packet_merge_mask, MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO); - break; default: break; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bf80fb612449..3667f5ef5990 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3616,8 +3616,7 @@ static int set_feature_hw_gro(struct net_device *netdev, bool enable) goto out; } - err = mlx5e_safe_switch_params(priv, &new_params, - mlx5e_modify_tirs_packet_merge_ctx, NULL, reset); + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, reset); out: mutex_unlock(&priv->state_lock); return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5743f5b3414b..49a48d7709ac 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3434,7 +3434,6 @@ enum { enum { MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO = BIT(0), MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO = BIT(1), - MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO = BIT(2), }; enum { From 33970b031dc4653cc9dc80f2886976706c4c8ef1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 9 Mar 2022 19:08:42 +0000 Subject: [PATCH 730/773] ARM: fix co-processor register typo In the recent Spectre BHB patches, there was a typo that is only exposed in certain configurations: mcr p15,0,XX,c7,r5,4 should have been mcr p15,0,XX,c7,c5,4 Reported-by: kernel test robot Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Signed-off-by: Russell King (Oracle) Acked-by: Catalin Marinas Signed-off-by: Linus Torvalds --- arch/arm/include/asm/assembler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index b50cc0169d5c..aee73ef5b3dc 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -113,7 +113,7 @@ .endm .macro isb, args - mcr p15, 0, r0, c7, r5, 4 + mcr p15, 0, r0, c7, c5, 4 .endm #endif From 52c9f93a9c482251cb0d224faa602ba26d462be8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 9 Mar 2022 12:16:34 -0700 Subject: [PATCH 731/773] arm64: Do not include __READ_ONCE() block in assembly files When building arm64 defconfig + CONFIG_LTO_CLANG_{FULL,THIN}=y after commit 558c303c9734 ("arm64: Mitigate spectre style branch history side channels"), the following error occurs: :4:2: error: invalid fixup for movz/movk instruction mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 ^ Marc figured out that moving "#include " in include/linux/arm-smccc.h into a !__ASSEMBLY__ block resolves it. The full include chain with CONFIG_LTO=y from include/linux/arm-smccc.h: include/linux/init.h include/linux/compiler.h arch/arm64/include/asm/rwonce.h arch/arm64/include/asm/alternative-macros.h arch/arm64/include/asm/assembler.h The asm/alternative-macros.h include in asm/rwonce.h only happens when CONFIG_LTO is set, which ultimately casues asm/assembler.h to be included before the definition of ARM_SMCCC_ARCH_WORKAROUND_3. As a result, the preprocessor does not expand ARM_SMCCC_ARCH_WORKAROUND_3 in __mitigate_spectre_bhb_fw, which results in the error above. Avoid this problem by just avoiding the CONFIG_LTO=y __READ_ONCE() block in asm/rwonce.h with assembly files, as nothing in that block is useful to assembly files, which allows ARM_SMCCC_ARCH_WORKAROUND_3 to be properly expanded with CONFIG_LTO=y builds. Fixes: e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire when CONFIG_LTO=y") Cc: # 5.11.x Link: https://lore.kernel.org/r/20220309155716.3988480-1-maz@kernel.org/ Reported-by: Marc Zyngier Acked-by: James Morse Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20220309191633.2307110-1-nathan@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rwonce.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h index 1bce62fa908a..56f7b1d4d54b 100644 --- a/arch/arm64/include/asm/rwonce.h +++ b/arch/arm64/include/asm/rwonce.h @@ -5,7 +5,7 @@ #ifndef __ASM_RWONCE_H #define __ASM_RWONCE_H -#ifdef CONFIG_LTO +#if defined(CONFIG_LTO) && !defined(__ASSEMBLY__) #include #include @@ -66,7 +66,7 @@ }) #endif /* !BUILD_VDSO */ -#endif /* CONFIG_LTO */ +#endif /* CONFIG_LTO && !__ASSEMBLY__ */ #include From 36168e387fa7d0f1fe0cd5cf76c8cea7aee714fa Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 9 Mar 2022 15:07:27 -0700 Subject: [PATCH 732/773] ARM: Do not use NOCROSSREFS directive with ld.lld ld.lld does not support the NOCROSSREFS directive at the moment, which breaks the build after commit b9baf5c8c5c3 ("ARM: Spectre-BHB workaround"): ld.lld: error: ./arch/arm/kernel/vmlinux.lds:34: AT expected, but got NOCROSSREFS Support for this directive will eventually be implemented, at which point a version check can be added. To avoid breaking the build in the meantime, just define NOCROSSREFS to nothing when using ld.lld, with a link to the issue for tracking. Cc: stable@vger.kernel.org Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Link: https://github.com/ClangBuiltLinux/linux/issues/1609 Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- arch/arm/include/asm/vmlinux.lds.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h index 0ef21bfae9f6..fad45c884e98 100644 --- a/arch/arm/include/asm/vmlinux.lds.h +++ b/arch/arm/include/asm/vmlinux.lds.h @@ -26,6 +26,14 @@ #define ARM_MMU_DISCARD(x) x #endif +/* + * ld.lld does not support NOCROSSREFS: + * https://github.com/ClangBuiltLinux/linux/issues/1609 + */ +#ifdef CONFIG_LD_IS_LLD +#define NOCROSSREFS +#endif + /* Set start/end symbol names to the LMA for the section */ #define ARM_LMA(sym, section) \ sym##_start = LOADADDR(section); \ From f80cfe2f26581f188429c12bd937eb905ad3ac7b Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 8 Mar 2022 21:50:07 +0300 Subject: [PATCH 733/773] NFC: port100: fix use-after-free in port100_send_complete Syzbot reported UAF in port100_send_complete(). The root case is in missing usb_kill_urb() calls on error handling path of ->probe function. port100_send_complete() accesses devm allocated memory which will be freed on probe failure. We should kill this urbs before returning an error from probe function to prevent reported use-after-free Fail log: BUG: KASAN: use-after-free in port100_send_complete+0x16e/0x1a0 drivers/nfc/port100.c:935 Read of size 1 at addr ffff88801bb59540 by task ksoftirqd/2/26 ... Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x303 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 port100_send_complete+0x16e/0x1a0 drivers/nfc/port100.c:935 __usb_hcd_giveback_urb+0x2b0/0x5c0 drivers/usb/core/hcd.c:1670 ... Allocated by task 1255: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:45 [inline] set_alloc_info mm/kasan/common.c:436 [inline] ____kasan_kmalloc mm/kasan/common.c:515 [inline] ____kasan_kmalloc mm/kasan/common.c:474 [inline] __kasan_kmalloc+0xa6/0xd0 mm/kasan/common.c:524 alloc_dr drivers/base/devres.c:116 [inline] devm_kmalloc+0x96/0x1d0 drivers/base/devres.c:823 devm_kzalloc include/linux/device.h:209 [inline] port100_probe+0x8a/0x1320 drivers/nfc/port100.c:1502 Freed by task 1255: kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:45 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free+0xff/0x140 mm/kasan/common.c:328 kasan_slab_free include/linux/kasan.h:236 [inline] __cache_free mm/slab.c:3437 [inline] kfree+0xf8/0x2b0 mm/slab.c:3794 release_nodes+0x112/0x1a0 drivers/base/devres.c:501 devres_release_all+0x114/0x190 drivers/base/devres.c:530 really_probe+0x626/0xcc0 drivers/base/dd.c:670 Reported-and-tested-by: syzbot+16bcb127fb73baeecb14@syzkaller.appspotmail.com Fixes: 0347a6ab300a ("NFC: port100: Commands mechanism implementation") Signed-off-by: Pavel Skripkin Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220308185007.6987-1-paskripkin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/nfc/port100.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c index d7db1a0e6be1..00d8ea6dcb5d 100644 --- a/drivers/nfc/port100.c +++ b/drivers/nfc/port100.c @@ -1612,7 +1612,9 @@ free_nfc_dev: nfc_digital_free_device(dev->nfc_digital_dev); error: + usb_kill_urb(dev->in_urb); usb_free_urb(dev->in_urb); + usb_kill_urb(dev->out_urb); usb_free_urb(dev->out_urb); usb_put_dev(dev->udev); From 18dfc667550fe9c032a6dcc3402b50e691e18029 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 8 Mar 2022 23:15:00 +0100 Subject: [PATCH 734/773] selftests: pmtu.sh: Kill tcpdump processes launched by subshell. The cleanup() function takes care of killing processes launched by the test functions. It relies on variables like ${tcpdump_pids} to get the relevant PIDs. But tests are run in their own subshell, so updated *_pids values are invisible to other shells. Therefore cleanup() never sees any process to kill: $ ./tools/testing/selftests/net/pmtu.sh -t pmtu_ipv4_exception TEST: ipv4: PMTU exceptions [ OK ] TEST: ipv4: PMTU exceptions - nexthop objects [ OK ] $ pgrep -af tcpdump 6084 tcpdump -s 0 -i veth_A-R1 -w pmtu_ipv4_exception_veth_A-R1.pcap 6085 tcpdump -s 0 -i veth_R1-A -w pmtu_ipv4_exception_veth_R1-A.pcap 6086 tcpdump -s 0 -i veth_R1-B -w pmtu_ipv4_exception_veth_R1-B.pcap 6087 tcpdump -s 0 -i veth_B-R1 -w pmtu_ipv4_exception_veth_B-R1.pcap 6088 tcpdump -s 0 -i veth_A-R2 -w pmtu_ipv4_exception_veth_A-R2.pcap 6089 tcpdump -s 0 -i veth_R2-A -w pmtu_ipv4_exception_veth_R2-A.pcap 6090 tcpdump -s 0 -i veth_R2-B -w pmtu_ipv4_exception_veth_R2-B.pcap 6091 tcpdump -s 0 -i veth_B-R2 -w pmtu_ipv4_exception_veth_B-R2.pcap 6228 tcpdump -s 0 -i veth_A-R1 -w pmtu_ipv4_exception_veth_A-R1.pcap 6229 tcpdump -s 0 -i veth_R1-A -w pmtu_ipv4_exception_veth_R1-A.pcap 6230 tcpdump -s 0 -i veth_R1-B -w pmtu_ipv4_exception_veth_R1-B.pcap 6231 tcpdump -s 0 -i veth_B-R1 -w pmtu_ipv4_exception_veth_B-R1.pcap 6232 tcpdump -s 0 -i veth_A-R2 -w pmtu_ipv4_exception_veth_A-R2.pcap 6233 tcpdump -s 0 -i veth_R2-A -w pmtu_ipv4_exception_veth_R2-A.pcap 6234 tcpdump -s 0 -i veth_R2-B -w pmtu_ipv4_exception_veth_R2-B.pcap 6235 tcpdump -s 0 -i veth_B-R2 -w pmtu_ipv4_exception_veth_B-R2.pcap Fix this by running cleanup() in the context of the test subshell. Now that each test cleans the environment after completion, there's no need for calling cleanup() again when the next test starts. So let's drop it from the setup() function. This is okay because cleanup() is also called when pmtu.sh starts, so even the first test starts in a clean environment. Also, use tcpdump's immediate mode. Otherwise it might not have time to process buffered packets, resulting in missing packets or even empty pcap files for short tests. Note: PAUSE_ON_FAIL is still evaluated before cleanup(), so one can still inspect the test environment upon failure when using -p. Fixes: a92a0a7b8e7c ("selftests: pmtu: Simplify cleanup and namespace names") Signed-off-by: Guillaume Nault Reviewed-by: Shuah Khan Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 543ad7513a8e..2e8972573d91 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -865,7 +865,6 @@ setup_ovs_bridge() { setup() { [ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip - cleanup for arg do eval setup_${arg} || { echo " ${arg} not supported"; return 1; } done @@ -876,7 +875,7 @@ trace() { for arg do [ "${ns_cmd}" = "" ] && ns_cmd="${arg}" && continue - ${ns_cmd} tcpdump -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null & + ${ns_cmd} tcpdump --immediate-mode -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null & tcpdump_pids="${tcpdump_pids} $!" ns_cmd= done @@ -1836,6 +1835,10 @@ run_test() { unset IFS + # Since cleanup() relies on variables modified by this subshell, it + # has to run in this context. + trap cleanup EXIT + if [ "$VERBOSE" = "1" ]; then printf "\n##########################################################################\n\n" fi From 94a4a4fe4c696413932eed8bdec46574de9576b8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 8 Mar 2022 23:15:03 +0100 Subject: [PATCH 735/773] selftests: pmtu.sh: Kill nettest processes launched in subshell. When using "run_cmd &", then "$!" refers to the PID of the subshell used to run , not the command itself. Therefore nettest_pids actually doesn't contain the list of the nettest commands running in the background. So cleanup() can't kill them and the nettest processes run until completion (fortunately they have a 5s timeout). Fix this by defining a new command for running processes in the background, for which "$!" really refers to the PID of the command run. Also, double quote variables on the modified lines, to avoid shellcheck warnings. Fixes: ece1278a9b81 ("selftests: net: add ESP-in-UDP PMTU test") Signed-off-by: Guillaume Nault Reviewed-by: Shuah Khan Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 2e8972573d91..694732e4b344 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -374,6 +374,16 @@ run_cmd() { return $rc } +run_cmd_bg() { + cmd="$*" + + if [ "$VERBOSE" = "1" ]; then + printf " COMMAND: %s &\n" "${cmd}" + fi + + $cmd 2>&1 & +} + # Find the auto-generated name for this namespace nsname() { eval echo \$NS_$1 @@ -670,10 +680,10 @@ setup_nettest_xfrm() { [ ${1} -eq 6 ] && proto="-6" || proto="" port=${2} - run_cmd ${ns_a} nettest ${proto} -q -D -s -x -p ${port} -t 5 & + run_cmd_bg "${ns_a}" nettest "${proto}" -q -D -s -x -p "${port}" -t 5 nettest_pids="${nettest_pids} $!" - run_cmd ${ns_b} nettest ${proto} -q -D -s -x -p ${port} -t 5 & + run_cmd_bg "${ns_b}" nettest "${proto}" -q -D -s -x -p "${port}" -t 5 nettest_pids="${nettest_pids} $!" } From 03fe003547975680fdb9ff5ab0e41cb68276c4f2 Mon Sep 17 00:00:00 2001 From: Mark Featherston Date: Wed, 9 Mar 2022 17:16:16 -0800 Subject: [PATCH 736/773] gpio: ts4900: Do not set DAT and OE together This works around an issue with the hardware where both OE and DAT are exposed in the same register. If both are updated simultaneously, the harware makes no guarantees that OE or DAT will actually change in any given order and may result in a glitch of a few ns on a GPIO pin when changing direction and value in a single write. Setting direction to input now only affects OE bit. Setting direction to output updates DAT first, then OE. Fixes: 9c6686322d74 ("gpio: add Technologic I2C-FPGA gpio support") Signed-off-by: Mark Featherston Signed-off-by: Kris Bahnsen Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-ts4900.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-ts4900.c b/drivers/gpio/gpio-ts4900.c index d885032cf814..d918d2df4de2 100644 --- a/drivers/gpio/gpio-ts4900.c +++ b/drivers/gpio/gpio-ts4900.c @@ -1,7 +1,7 @@ /* * Digital I/O driver for Technologic Systems I2C FPGA Core * - * Copyright (C) 2015 Technologic Systems + * Copyright (C) 2015, 2018 Technologic Systems * Copyright (C) 2016 Savoir-Faire Linux * * This program is free software; you can redistribute it and/or @@ -55,19 +55,33 @@ static int ts4900_gpio_direction_input(struct gpio_chip *chip, { struct ts4900_gpio_priv *priv = gpiochip_get_data(chip); - /* - * This will clear the output enable bit, the other bits are - * dontcare when this is cleared + /* Only clear the OE bit here, requires a RMW. Prevents potential issue + * with OE and data getting to the physical pin at different times. */ - return regmap_write(priv->regmap, offset, 0); + return regmap_update_bits(priv->regmap, offset, TS4900_GPIO_OE, 0); } static int ts4900_gpio_direction_output(struct gpio_chip *chip, unsigned int offset, int value) { struct ts4900_gpio_priv *priv = gpiochip_get_data(chip); + unsigned int reg; int ret; + /* If changing from an input to an output, we need to first set the + * proper data bit to what is requested and then set OE bit. This + * prevents a glitch that can occur on the IO line + */ + regmap_read(priv->regmap, offset, ®); + if (!(reg & TS4900_GPIO_OE)) { + if (value) + reg = TS4900_GPIO_OUT; + else + reg &= ~TS4900_GPIO_OUT; + + regmap_write(priv->regmap, offset, reg); + } + if (value) ret = regmap_write(priv->regmap, offset, TS4900_GPIO_OE | TS4900_GPIO_OUT); From 55d01c98a88b346e217eaa931b32e7baea905c9a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 8 Mar 2022 09:44:54 +0100 Subject: [PATCH 737/773] gpio: sim: fix a typo Just noticed this when applying Andy's patch. s/childred/children/ Fixes: cb8c474e79be ("gpio: sim: new testing module") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- drivers/gpio/gpio-sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index bb9bb595c1a8..8e5d87984a48 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -547,7 +547,7 @@ struct gpio_sim_bank { * * So we need to store the pointer to the parent struct here. We can * dereference it anywhere we need with no checks and no locking as - * it's guaranteed to survive the childred and protected by configfs + * it's guaranteed to survive the children and protected by configfs * locks. * * Same for other structures. From b1a384d2cbccb1eb3f84765020d25e2c1929706e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 10 Mar 2022 10:22:14 +0000 Subject: [PATCH 738/773] ARM: fix build warning in proc-v7-bugs.c The kernel test robot discovered that building without HARDEN_BRANCH_PREDICTOR issues a warning due to a missing argument to pr_info(). Add the missing argument. Reported-by: kernel test robot Fixes: 9dd78194a372 ("ARM: report Spectre v2 status through sysfs") Signed-off-by: Russell King (Oracle) Signed-off-by: Linus Torvalds --- arch/arm/mm/proc-v7-bugs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c index c226feab2457..06dbfb968182 100644 --- a/arch/arm/mm/proc-v7-bugs.c +++ b/arch/arm/mm/proc-v7-bugs.c @@ -108,7 +108,8 @@ static unsigned int spectre_v2_install_workaround(unsigned int method) #else static unsigned int spectre_v2_install_workaround(unsigned int method) { - pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n"); + pr_info("CPU%u: Spectre V2: workarounds disabled by configuration\n", + smp_processor_id()); return SPECTRE_VULNERABLE; } From a1cc1697bb56cdf880ad4d17b79a39ef2c294bc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 10 Mar 2022 11:39:23 +0100 Subject: [PATCH 739/773] arm64: dts: marvell: armada-37xx: Remap IO space to bus address 0x0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy and old PCI I/O based cards do not support 32-bit I/O addressing. Since commit 64f160e19e92 ("PCI: aardvark: Configure PCIe resources from 'ranges' DT property") kernel can set different PCIe address on CPU and different on the bus for the one A37xx address mapping without any firmware support in case the bus address does not conflict with other A37xx mapping. So remap I/O space to the bus address 0x0 to enable support for old legacy I/O port based cards which have hardcoded I/O ports in low address space. Note that DDR on A37xx is mapped to bus address 0x0. And mapping of I/O space can be set to address 0x0 too because MEM space and I/O space are separate and so do not conflict. Remapping IO space on Turris Mox to different address is not possible to due bootloader bug. Signed-off-by: Pali Rohár Reported-by: Arnd Bergmann Fixes: 76f6386b25cc ("arm64: dts: marvell: Add Aardvark PCIe support for Armada 3700") Cc: stable@vger.kernel.org # 64f160e19e92 ("PCI: aardvark: Configure PCIe resources from 'ranges' DT property") Cc: stable@vger.kernel.org # 514ef1e62d65 ("arm64: dts: marvell: armada-37xx: Extend PCIe MEM space") Reviewed-by: Arnd Bergmann Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts | 7 ++++++- arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts index 04da07ae4420..4b377fe807e0 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts @@ -138,7 +138,9 @@ /* * U-Boot port for Turris Mox has a bug which always expects that "ranges" DT property * contains exactly 2 ranges with 3 (child) address cells, 2 (parent) address cells and - * 2 size cells and also expects that the second range starts at 16 MB offset. If these + * 2 size cells and also expects that the second range starts at 16 MB offset. Also it + * expects that first range uses same address for PCI (child) and CPU (parent) cells (so + * no remapping) and that this address is the lowest from all specified ranges. If these * conditions are not met then U-Boot crashes during loading kernel DTB file. PCIe address * space is 128 MB long, so the best split between MEM and IO is to use fixed 16 MB window * for IO and the rest 112 MB (64+32+16) for MEM, despite that maximal IO size is just 64 kB. @@ -147,6 +149,9 @@ * https://source.denx.de/u-boot/u-boot/-/commit/cb2ddb291ee6fcbddd6d8f4ff49089dfe580f5d7 * https://source.denx.de/u-boot/u-boot/-/commit/c64ac3b3185aeb3846297ad7391fc6df8ecd73bf * https://source.denx.de/u-boot/u-boot/-/commit/4a82fca8e330157081fc132a591ebd99ba02ee33 + * Bug related to requirement of same child and parent addresses for first range is fixed + * in U-Boot version 2022.04 by following commit: + * https://source.denx.de/u-boot/u-boot/-/commit/1fd54253bca7d43d046bba4853fe5fafd034bc17 */ #address-cells = <3>; #size-cells = <2>; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index 673f4906eef9..fb78ef613b29 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -499,7 +499,7 @@ * (totaling 127 MiB) for MEM. */ ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x07f00000 /* Port 0 MEM */ - 0x81000000 0 0xefff0000 0 0xefff0000 0 0x00010000>; /* Port 0 IO */ + 0x81000000 0 0x00000000 0 0xefff0000 0 0x00010000>; /* Port 0 IO */ interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &pcie_intc 0>, <0 0 0 2 &pcie_intc 1>, From c80ee64a8020ef1a6a92109798080786829b8994 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 11 Feb 2022 00:49:43 +0800 Subject: [PATCH 740/773] riscv: alternative only works on !XIP_KERNEL The alternative mechanism needs runtime code patching, it can't work on XIP_KERNEL. And the errata workarounds are implemented via the alternative mechanism. So add !XIP_KERNEL dependency for alternative and erratas. Signed-off-by: Jisheng Zhang Fixes: 44c922572952 ("RISC-V: enable XIP") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 1 + arch/riscv/Kconfig.socs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index b44d6ecdb46e..0aacd7052585 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -2,6 +2,7 @@ menu "CPU errata selection" config RISCV_ERRATA_ALTERNATIVE bool "RISC-V alternative scheme" + depends on !XIP_KERNEL default y help This Kconfig allows the kernel to automatically patch the diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 6ec44a22278a..c112ab2a9052 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -14,8 +14,8 @@ config SOC_SIFIVE select CLK_SIFIVE select CLK_SIFIVE_PRCI select SIFIVE_PLIC - select RISCV_ERRATA_ALTERNATIVE - select ERRATA_SIFIVE + select RISCV_ERRATA_ALTERNATIVE if !XIP_KERNEL + select ERRATA_SIFIVE if !XIP_KERNEL help This enables support for SiFive SoC platform hardware. From fe673d3f5bf1fc50cdc4b754831db91a2ec10126 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 8 Mar 2022 11:55:48 -0800 Subject: [PATCH 741/773] mm: gup: make fault_in_safe_writeable() use fixup_user_fault() Instead of using GUP, make fault_in_safe_writeable() actually force a 'handle_mm_fault()' using the same fixup_user_fault() machinery that futexes already use. Using the GUP machinery meant that fault_in_safe_writeable() did not do everything that a real fault would do, ranging from not auto-expanding the stack segment, to not updating accessed or dirty flags in the page tables (GUP sets those flags on the pages themselves). The latter causes problems on architectures (like s390) that do accessed bit handling in software, which meant that fault_in_safe_writeable() didn't actually do all the fault handling it needed to, and trying to access the user address afterwards would still cause faults. Reported-and-tested-by: Andreas Gruenbacher Fixes: cdd591fc86e3 ("iov_iter: Introduce fault_in_iov_iter_writeable") Link: https://lore.kernel.org/all/CAHc6FU5nP+nziNGG0JAF1FUx-GV7kKFvM7aZuU_XD2_1v4vnvg@mail.gmail.com/ Acked-by: David Hildenbrand Signed-off-by: Linus Torvalds --- mm/gup.c | 63 ++++++++++++++++++++------------------------------------ 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a9d4d724aef7..7bc1ba9ce440 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1729,11 +1729,11 @@ EXPORT_SYMBOL(fault_in_writeable); * @uaddr: start of address range * @size: length of address range * - * Faults in an address range using get_user_pages, i.e., without triggering - * hardware page faults. This is primarily useful when we already know that - * some or all of the pages in the address range aren't in memory. + * Faults in an address range for writing. This is primarily useful when we + * already know that some or all of the pages in the address range aren't in + * memory. * - * Other than fault_in_writeable(), this function is non-destructive. + * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of @@ -1744,46 +1744,27 @@ EXPORT_SYMBOL(fault_in_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)untagged_addr(uaddr); - unsigned long end, nstart, nend; + unsigned long start = (unsigned long)uaddr, end; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - int locked = 0; + bool unlocked = false; - nstart = start & PAGE_MASK; - end = PAGE_ALIGN(start + size); - if (end < nstart) - end = 0; - for (; nstart != end; nstart = nend) { - unsigned long nr_pages; - long ret; - - if (!locked) { - locked = 1; - mmap_read_lock(mm); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - nend = end ? min(end, vma->vm_end) : vma->vm_end; - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - nr_pages = (nend - nstart) / PAGE_SIZE; - ret = __get_user_pages_locked(mm, nstart, nr_pages, - NULL, NULL, &locked, - FOLL_TOUCH | FOLL_WRITE); - if (ret <= 0) - break; - nend = nstart + ret * PAGE_SIZE; - } - if (locked) - mmap_read_unlock(mm); - if (nstart == end) + if (unlikely(size == 0)) return 0; - return size - min_t(size_t, nstart - start, size); + end = PAGE_ALIGN(start + size); + if (end < start) + end = 0; + + mmap_read_lock(mm); + do { + if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + break; + start = (start + PAGE_SIZE) & PAGE_MASK; + } while (start != end); + mmap_read_unlock(mm); + + if (size > (unsigned long)uaddr - start) + return size - ((unsigned long)uaddr - start); + return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); From 2ac5b58e645c66932438bb021cb5b52097ce70b0 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 10 Mar 2022 01:53:13 +0000 Subject: [PATCH 742/773] gianfar: ethtool: Fix refcount leak in gfar_get_ts_info The of_find_compatible_node() function returns a node pointer with refcount incremented, We should use of_node_put() on it when done Add the missing of_node_put() to release the refcount. Fixes: 7349a74ea75c ("net: ethernet: gianfar_ethtool: get phc index through drvdata") Signed-off-by: Miaoqian Lin Reviewed-by: Jesse Brandeburg Reviewed-by: Claudiu Manoil Link: https://lore.kernel.org/r/20220310015313.14938-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/gianfar_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index ff756265d58f..9a2c16d69e2c 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -1464,6 +1464,7 @@ static int gfar_get_ts_info(struct net_device *dev, ptp_node = of_find_compatible_node(NULL, NULL, "fsl,etsec-ptp"); if (ptp_node) { ptp_dev = of_find_device_by_node(ptp_node); + of_node_put(ptp_node); if (ptp_dev) ptp = platform_get_drvdata(ptp_dev); } From 37c9d66c95564c85a001d8a035354f0220a1e1c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Wed, 9 Mar 2022 15:22:28 +0100 Subject: [PATCH 743/773] net: phy: DP83822: clear MISR2 register to disable interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MISR1 was cleared twice but the original author intention was probably to clear MISR1 & MISR2 to completely disable interrupts. Fix it to clear MISR2. Fixes: 87461f7a58ab ("net: phy: DP83822 initial driver submission") Signed-off-by: Clément Léger Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220309142228.761153-1-clement.leger@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 211b5476a6f5..ce17b2af3218 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -274,7 +274,7 @@ static int dp83822_config_intr(struct phy_device *phydev) if (err < 0) return err; - err = phy_write(phydev, MII_DP83822_MISR1, 0); + err = phy_write(phydev, MII_DP83822_MISR2, 0); if (err < 0) return err; From 26183cfe478c1d1d5cd1e3920a4b2c5b1980849d Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Tue, 8 Mar 2022 22:25:44 -0800 Subject: [PATCH 744/773] net: phy: correct spelling error of media in documentation The header file incorrectly referenced "median-independant interface" instead of media. Correct this typo. Signed-off-by: Colin Foster Fixes: 4069a572d423 ("net: phy: Document core PHY structures") Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220309062544.3073-1-colin.foster@in-advantage.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/phy.h b/include/linux/phy.h index 6de8d7a90d78..8fa70ba063a5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -87,8 +87,8 @@ extern const int phy_10gbit_features_array[1]; * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined - * @PHY_INTERFACE_MODE_MII: Median-independent interface - * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface + * @PHY_INTERFACE_MODE_MII: Media-independent interface + * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface From 633593a808980f82d251d0ca89730d8bb8b0220c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Mar 2022 16:11:45 -0800 Subject: [PATCH 745/773] sctp: fix kernel-infoleak for SCTP sockets syzbot reported a kernel infoleak [1] of 4 bytes. After analysis, it turned out r->idiag_expires is not initialized if inet_sctp_diag_fill() calls inet_diag_msg_common_fill() Make sure to clear idiag_timer/idiag_retrans/idiag_expires and let inet_diag_msg_sctpasoc_fill() fill them again if needed. [1] BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:121 [inline] BUG: KMSAN: kernel-infoleak in copyout lib/iov_iter.c:154 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x6ef/0x25a0 lib/iov_iter.c:668 instrument_copy_to_user include/linux/instrumented.h:121 [inline] copyout lib/iov_iter.c:154 [inline] _copy_to_iter+0x6ef/0x25a0 lib/iov_iter.c:668 copy_to_iter include/linux/uio.h:162 [inline] simple_copy_to_iter+0xf3/0x140 net/core/datagram.c:519 __skb_datagram_iter+0x2d5/0x11b0 net/core/datagram.c:425 skb_copy_datagram_iter+0xdc/0x270 net/core/datagram.c:533 skb_copy_datagram_msg include/linux/skbuff.h:3696 [inline] netlink_recvmsg+0x669/0x1c80 net/netlink/af_netlink.c:1977 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] __sys_recvfrom+0x795/0xa10 net/socket.c:2097 __do_sys_recvfrom net/socket.c:2115 [inline] __se_sys_recvfrom net/socket.c:2111 [inline] __x64_sys_recvfrom+0x19d/0x210 net/socket.c:2111 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:737 [inline] slab_alloc_node mm/slub.c:3247 [inline] __kmalloc_node_track_caller+0xe0c/0x1510 mm/slub.c:4975 kmalloc_reserve net/core/skbuff.c:354 [inline] __alloc_skb+0x545/0xf90 net/core/skbuff.c:426 alloc_skb include/linux/skbuff.h:1158 [inline] netlink_dump+0x3e5/0x16c0 net/netlink/af_netlink.c:2248 __netlink_dump_start+0xcf8/0xe90 net/netlink/af_netlink.c:2373 netlink_dump_start include/linux/netlink.h:254 [inline] inet_diag_handler_cmd+0x2e7/0x400 net/ipv4/inet_diag.c:1341 sock_diag_rcv_msg+0x24a/0x620 netlink_rcv_skb+0x40c/0x7e0 net/netlink/af_netlink.c:2494 sock_diag_rcv+0x63/0x80 net/core/sock_diag.c:277 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x1093/0x1360 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x14d9/0x1720 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] sock_write_iter+0x594/0x690 net/socket.c:1061 do_iter_readv_writev+0xa7f/0xc70 do_iter_write+0x52c/0x1500 fs/read_write.c:851 vfs_writev fs/read_write.c:924 [inline] do_writev+0x645/0xe00 fs/read_write.c:967 __do_sys_writev fs/read_write.c:1040 [inline] __se_sys_writev fs/read_write.c:1037 [inline] __x64_sys_writev+0xe5/0x120 fs/read_write.c:1037 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Bytes 68-71 of 2508 are uninitialized Memory access of size 2508 starts at ffff888114f9b000 Data copied to user address 00007f7fe09ff2e0 CPU: 1 PID: 3478 Comm: syz-executor306 Not tainted 5.17.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Vlad Yasevich Cc: Neil Horman Cc: Marcelo Ricardo Leitner Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20220310001145.297371-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/diag.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 034e2c74497d..d9c6d8f30f09 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -61,10 +61,6 @@ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; r->idiag_retrans = asoc->rtx_data_chunks; r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); - } else { - r->idiag_timer = 0; - r->idiag_retrans = 0; - r->idiag_expires = 0; } } @@ -144,13 +140,14 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); + r->idiag_timer = 0; + r->idiag_retrans = 0; + r->idiag_expires = 0; if (asoc) { inet_diag_msg_sctpasoc_fill(r, sk, asoc); } else { inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; - r->idiag_timer = 0; - r->idiag_retrans = 0; } if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) From bc0e610a6eb0d46e4123fafdbe5e6141d9fff3be Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Wed, 9 Mar 2022 20:18:24 +0800 Subject: [PATCH 746/773] net: arc_emac: Fix use after free in arc_mdio_probe() If bus->state is equal to MDIOBUS_ALLOCATED, mdiobus_free(bus) will free the "bus". But bus->name is still used in the next line, which will lead to a use after free. We can fix it by putting the name in a local variable and make the bus->name point to the rodata section "name",then use the name in the error message without referring to bus to avoid the uaf. Fixes: 95b5fc03c189 ("net: arc_emac: Make use of the helper function dev_err_probe()") Signed-off-by: Jianglei Nie Link: https://lore.kernel.org/r/20220309121824.36529-1-niejianglei2021@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/arc/emac_mdio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/arc/emac_mdio.c b/drivers/net/ethernet/arc/emac_mdio.c index 9acf589b1178..87f40c2ba904 100644 --- a/drivers/net/ethernet/arc/emac_mdio.c +++ b/drivers/net/ethernet/arc/emac_mdio.c @@ -132,6 +132,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) { struct arc_emac_mdio_bus_data *data = &priv->bus_data; struct device_node *np = priv->dev->of_node; + const char *name = "Synopsys MII Bus"; struct mii_bus *bus; int error; @@ -142,7 +143,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) priv->bus = bus; bus->priv = priv; bus->parent = priv->dev; - bus->name = "Synopsys MII Bus"; + bus->name = name; bus->read = &arc_mdio_read; bus->write = &arc_mdio_write; bus->reset = &arc_mdio_reset; @@ -167,7 +168,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv) if (error) { mdiobus_free(bus); return dev_err_probe(priv->dev, error, - "cannot register MDIO bus %s\n", bus->name); + "cannot register MDIO bus %s\n", name); } return 0; From 00b022f8f876a3a036b0df7f971001bef6398605 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 9 Mar 2022 22:55:35 -0600 Subject: [PATCH 747/773] net: bcmgenet: Don't claim WOL when its not available Some of the bcmgenet platforms don't correctly support WOL, yet ethtool returns: "Supports Wake-on: gsf" which is false. Ideally if there isn't a wol_irq, or there is something else that keeps the device from being able to wakeup it should display: "Supports Wake-on: d" This patch checks whether the device can wakup, before using the hard-coded supported flags. This corrects the ethtool reporting, as well as the WOL configuration because ethtool verifies that the mode is supported before attempting it. Fixes: c51de7f3976b ("net: bcmgenet: add Wake-on-LAN support code") Signed-off-by: Jeremy Linton Tested-by: Peter Robinson Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20220310045535.224450-1-jeremy.linton@arm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index e31a5a397f11..f55d9d9c01a8 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -40,6 +40,13 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); + struct device *kdev = &priv->pdev->dev; + + if (!device_can_wakeup(kdev)) { + wol->supported = 0; + wol->wolopts = 0; + return; + } wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; wol->wolopts = priv->wolopts; From 2c87c6f9fbddc5b84d67b2fa3f432fcac6d99d93 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Mar 2022 22:04:47 +0100 Subject: [PATCH 748/773] net: phy: meson-gxl: improve link-up behavior Sometimes the link comes up but no data flows. This patch fixes this behavior. It's not clear what's the root cause of the issue. According to the tests one other link-up issue remains. In very rare cases the link isn't even reported as up. Fixes: 84c8f773d2dc ("net: phy: meson-gxl: remove the use of .ack_callback()") Tested-by: Erico Nunes Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/e3473452-a1f9-efcf-5fdd-02b6f44c3fcd@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/meson-gxl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index c49062ad72c6..73f7962a37d3 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -243,7 +243,13 @@ static irqreturn_t meson_gxl_handle_interrupt(struct phy_device *phydev) irq_status == INTSRC_ENERGY_DETECT) return IRQ_HANDLED; - phy_trigger_machine(phydev); + /* Give PHY some time before MAC starts sending data. This works + * around an issue where network doesn't come up properly. + */ + if (!(irq_status & INTSRC_LINK_DOWN)) + phy_queue_state_machine(phydev, msecs_to_jiffies(100)); + else + phy_trigger_machine(phydev); return IRQ_HANDLED; } From 5cb1ebdbc4342b1c2ce89516e19808d64417bdbc Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 10 Mar 2022 18:16:41 +0100 Subject: [PATCH 749/773] ice: Fix race condition during interface enslave Commit 5dbbbd01cbba83 ("ice: Avoid RTNL lock when re-creating auxiliary device") changes a process of re-creation of aux device so ice_plug_aux_dev() is called from ice_service_task() context. This unfortunately opens a race window that can result in dead-lock when interface has left LAG and immediately enters LAG again. Reproducer: ``` #!/bin/sh ip link add lag0 type bond mode 1 miimon 100 ip link set lag0 for n in {1..10}; do echo Cycle: $n ip link set ens7f0 master lag0 sleep 1 ip link set ens7f0 nomaster done ``` This results in: [20976.208697] Workqueue: ice ice_service_task [ice] [20976.213422] Call Trace: [20976.215871] __schedule+0x2d1/0x830 [20976.219364] schedule+0x35/0xa0 [20976.222510] schedule_preempt_disabled+0xa/0x10 [20976.227043] __mutex_lock.isra.7+0x310/0x420 [20976.235071] enum_all_gids_of_dev_cb+0x1c/0x100 [ib_core] [20976.251215] ib_enum_roce_netdev+0xa4/0xe0 [ib_core] [20976.256192] ib_cache_setup_one+0x33/0xa0 [ib_core] [20976.261079] ib_register_device+0x40d/0x580 [ib_core] [20976.266139] irdma_ib_register_device+0x129/0x250 [irdma] [20976.281409] irdma_probe+0x2c1/0x360 [irdma] [20976.285691] auxiliary_bus_probe+0x45/0x70 [20976.289790] really_probe+0x1f2/0x480 [20976.298509] driver_probe_device+0x49/0xc0 [20976.302609] bus_for_each_drv+0x79/0xc0 [20976.306448] __device_attach+0xdc/0x160 [20976.310286] bus_probe_device+0x9d/0xb0 [20976.314128] device_add+0x43c/0x890 [20976.321287] __auxiliary_device_add+0x43/0x60 [20976.325644] ice_plug_aux_dev+0xb2/0x100 [ice] [20976.330109] ice_service_task+0xd0c/0xed0 [ice] [20976.342591] process_one_work+0x1a7/0x360 [20976.350536] worker_thread+0x30/0x390 [20976.358128] kthread+0x10a/0x120 [20976.365547] ret_from_fork+0x1f/0x40 ... [20976.438030] task:ip state:D stack: 0 pid:213658 ppid:213627 flags:0x00004084 [20976.446469] Call Trace: [20976.448921] __schedule+0x2d1/0x830 [20976.452414] schedule+0x35/0xa0 [20976.455559] schedule_preempt_disabled+0xa/0x10 [20976.460090] __mutex_lock.isra.7+0x310/0x420 [20976.464364] device_del+0x36/0x3c0 [20976.467772] ice_unplug_aux_dev+0x1a/0x40 [ice] [20976.472313] ice_lag_event_handler+0x2a2/0x520 [ice] [20976.477288] notifier_call_chain+0x47/0x70 [20976.481386] __netdev_upper_dev_link+0x18b/0x280 [20976.489845] bond_enslave+0xe05/0x1790 [bonding] [20976.494475] do_setlink+0x336/0xf50 [20976.502517] __rtnl_newlink+0x529/0x8b0 [20976.543441] rtnl_newlink+0x43/0x60 [20976.546934] rtnetlink_rcv_msg+0x2b1/0x360 [20976.559238] netlink_rcv_skb+0x4c/0x120 [20976.563079] netlink_unicast+0x196/0x230 [20976.567005] netlink_sendmsg+0x204/0x3d0 [20976.570930] sock_sendmsg+0x4c/0x50 [20976.574423] ____sys_sendmsg+0x1eb/0x250 [20976.586807] ___sys_sendmsg+0x7c/0xc0 [20976.606353] __sys_sendmsg+0x57/0xa0 [20976.609930] do_syscall_64+0x5b/0x1a0 [20976.613598] entry_SYSCALL_64_after_hwframe+0x65/0xca 1. Command 'ip link ... set nomaster' causes that ice_plug_aux_dev() is called from ice_service_task() context, aux device is created and associated device->lock is taken. 2. Command 'ip link ... set master...' calls ice's notifier under RTNL lock and that notifier calls ice_unplug_aux_dev(). That function tries to take aux device->lock but this is already taken by ice_plug_aux_dev() in step 1 3. Later ice_plug_aux_dev() tries to take RTNL lock but this is already taken in step 2 4. Dead-lock The patch fixes this issue by following changes: - Bit ICE_FLAG_PLUG_AUX_DEV is kept to be set during ice_plug_aux_dev() call in ice_service_task() - The bit is checked in ice_clear_rdma_cap() and only if it is not set then ice_unplug_aux_dev() is called. If it is set (in other words plugging of aux device was requested and ice_plug_aux_dev() is potentially running) then the function only clears the bit - Once ice_plug_aux_dev() call (in ice_service_task) is finished the bit ICE_FLAG_PLUG_AUX_DEV is cleared but it is also checked whether it was already cleared by ice_clear_rdma_cap(). If so then aux device is unplugged. Signed-off-by: Ivan Vecera Co-developed-by: Petr Oros Signed-off-by: Petr Oros Reviewed-by: Dave Ertman Link: https://lore.kernel.org/r/20220310171641.3863659-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice.h | 11 ++++++++++- drivers/net/ethernet/intel/ice/ice_main.c | 12 +++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 3121f9b04f59..bea1d1e39fa2 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -898,7 +898,16 @@ static inline void ice_set_rdma_cap(struct ice_pf *pf) */ static inline void ice_clear_rdma_cap(struct ice_pf *pf) { - ice_unplug_aux_dev(pf); + /* We can directly unplug aux device here only if the flag bit + * ICE_FLAG_PLUG_AUX_DEV is not set because ice_unplug_aux_dev() + * could race with ice_plug_aux_dev() called from + * ice_service_task(). In this case we only clear that bit now and + * aux device will be unplugged later once ice_plug_aux_device() + * called from ice_service_task() finishes (see ice_service_task()). + */ + if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + ice_unplug_aux_dev(pf); + clear_bit(ICE_FLAG_RDMA_ENA, pf->flags); clear_bit(ICE_FLAG_AUX_ENA, pf->flags); } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 83e3e8aae6cf..493942e910be 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2255,9 +2255,19 @@ static void ice_service_task(struct work_struct *work) return; } - if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + if (test_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) { + /* Plug aux device per request */ ice_plug_aux_dev(pf); + /* Mark plugging as done but check whether unplug was + * requested during ice_plug_aux_dev() call + * (e.g. from ice_clear_rdma_cap()) and if so then + * plug aux device. + */ + if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + ice_unplug_aux_dev(pf); + } + if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { struct iidc_event *event; From e0ae713023a9d09d6e1b454bdc8e8c1dd32c586e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 9 Mar 2022 23:13:45 +0100 Subject: [PATCH 750/773] xdp: xdp_mem_allocator can be NULL in trace_mem_connect(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the commit mentioned below __xdp_reg_mem_model() can return a NULL pointer. This pointer is dereferenced in trace_mem_connect() which leads to segfault. The trace points (mem_connect + mem_disconnect) were put in place to pair connect/disconnect using the IDs. The ID is only assigned if __xdp_reg_mem_model() does not return NULL. That connect trace point is of no use if there is no ID. Skip that connect trace point if xdp_alloc is NULL. [ Toke Høiland-Jørgensen delivered the reasoning for skipping the trace point ] Fixes: 4a48ef70b93b8 ("xdp: Allow registering memory model without rxq reference") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/YikmmXsffE+QajTB@linutronix.de Signed-off-by: Jakub Kicinski --- net/core/xdp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 7aba35504986..73fae16264e1 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -357,7 +357,8 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); - trace_mem_connect(xdp_alloc, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) + trace_mem_connect(xdp_alloc, xdp_rxq); return 0; } From 0966d385830de3470b7131db8e86c0c5bc9c52dc Mon Sep 17 00:00:00 2001 From: Emil Renner Berthing Date: Wed, 23 Feb 2022 20:12:57 +0100 Subject: [PATCH 751/773] riscv: Fix auipc+jalr relocation range checks RISC-V can do PC-relative jumps with a 32bit range using the following two instructions: auipc t0, imm20 ; t0 = PC + imm20 * 2^12 jalr ra, t0, imm12 ; ra = PC + 4, PC = t0 + imm12 Crucially both the 20bit immediate imm20 and the 12bit immediate imm12 are treated as two's-complement signed values. For this reason the immediates are usually calculated like this: imm20 = (offset + 0x800) >> 12 imm12 = offset & 0xfff ..where offset is the signed offset from the auipc instruction. When the 11th bit of offset is 0 the addition of 0x800 doesn't change the top 20 bits and imm12 considered positive. When the 11th bit is 1 the carry of the addition by 0x800 means imm20 is one higher, but since imm12 is then considered negative the two's complement representation means it all cancels out nicely. However, this addition by 0x800 (2^11) means an offset greater than or equal to 2^31 - 2^11 would overflow so imm20 is considered negative and result in a backwards jump. Similarly the lower range of offset is also moved down by 2^11 and hence the true 32bit range is [-2^31 - 2^11, 2^31 - 2^11) Signed-off-by: Emil Renner Berthing Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 68a9e3d1fe16..4a48287513c3 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -13,6 +13,19 @@ #include #include +/* + * The auipc+jalr instruction pair can reach any PC-relative offset + * in the range [-2^31 - 2^11, 2^31 - 2^11) + */ +static bool riscv_insn_valid_32bit_offset(ptrdiff_t val) +{ +#ifdef CONFIG_32BIT + return true; +#else + return (-(1L << 31) - (1L << 11)) <= val && val < ((1L << 31) - (1L << 11)); +#endif +} + static int apply_r_riscv_32_rela(struct module *me, u32 *location, Elf_Addr v) { if (v != (u32)v) { @@ -95,7 +108,7 @@ static int apply_r_riscv_pcrel_hi20_rela(struct module *me, u32 *location, ptrdiff_t offset = (void *)v - (void *)location; s32 hi20; - if (offset != (s32)offset) { + if (!riscv_insn_valid_32bit_offset(offset)) { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", me->name, (long long)v, location); @@ -197,10 +210,9 @@ static int apply_r_riscv_call_plt_rela(struct module *me, u32 *location, Elf_Addr v) { ptrdiff_t offset = (void *)v - (void *)location; - s32 fill_v = offset; u32 hi20, lo12; - if (offset != fill_v) { + if (!riscv_insn_valid_32bit_offset(offset)) { /* Only emit the plt entry if offset over 32-bit range */ if (IS_ENABLED(CONFIG_MODULE_SECTIONS)) { offset = module_emit_plt_entry(me, v); @@ -224,10 +236,9 @@ static int apply_r_riscv_call_rela(struct module *me, u32 *location, Elf_Addr v) { ptrdiff_t offset = (void *)v - (void *)location; - s32 fill_v = offset; u32 hi20, lo12; - if (offset != fill_v) { + if (!riscv_insn_valid_32bit_offset(offset)) { pr_err( "%s: target %016llx can not be addressed by the 32-bit offset from PC = %p\n", me->name, (long long)v, location); From c993ee0f9f81caf5767a50d1faeba39a0dc82af2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:31 +0000 Subject: [PATCH 752/773] watch_queue: Fix filter limit check In watch_queue_set_filter(), there are a couple of places where we check that the filter type value does not exceed what the type_filter bitmap can hold. One place calculates the number of bits by: if (tf[i].type >= sizeof(wfilter->type_filter) * 8) which is fine, but the second does: if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) which is not. This can lead to a couple of out-of-bounds writes due to a too-large type: (1) __set_bit() on wfilter->type_filter (2) Writing more elements in wfilter->filters[] than we allocated. Fix this by just using the proper WATCH_TYPE__NR instead, which is the number of types we actually know about. The bug may cause an oops looking something like: BUG: KASAN: slab-out-of-bounds in watch_queue_set_filter+0x659/0x740 Write of size 4 at addr ffff88800d2c66bc by task watch_queue_oob/611 ... Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x150 ... kasan_report.cold+0x7f/0x11b ... watch_queue_set_filter+0x659/0x740 ... __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 611: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 watch_queue_set_filter+0x23a/0x740 __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88800d2c66a0 which belongs to the cache kmalloc-32 of size 32 The buggy address is located 28 bytes inside of 32-byte region [ffff88800d2c66a0, ffff88800d2c66c0) Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- include/linux/watch_queue.h | 3 ++- kernel/watch_queue.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index c994d1b2cdba..3b9a40ae8bdb 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -28,7 +28,8 @@ struct watch_type_filter { struct watch_filter { union { struct rcu_head rcu; - unsigned long type_filter[2]; /* Bitmask of accepted types */ + /* Bitmask of accepted types */ + DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c9eb20dd2c5..427b0318e303 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -320,7 +320,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ - if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } @@ -336,7 +336,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { - if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; From db8facfc9fafacefe8a835416a6b77c838088f8b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:38 +0000 Subject: [PATCH 753/773] watch_queue, pipe: Free watchqueue state after clearing pipe ring In free_pipe_info(), free the watchqueue state after clearing the pipe ring as each pipe ring descriptor has a release function, and in the case of a notification message, this is watch_queue_pipe_buf_release() which tries to mark the allocation bitmap that was previously released. Fix this by moving the put of the pipe's ref on the watch queue to after the ring has been cleared. We still need to call watch_queue_clear() before doing that to make sure that the pipe is disconnected from any notification sources first. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/pipe.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index cc28623a67b6..4eb88bc138bb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -831,10 +831,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) int i; #ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); - put_watch_queue(pipe->watch_queue); - } #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); @@ -844,6 +842,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (buf->ops) pipe_buf_release(pipe, buf); } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + put_watch_queue(pipe->watch_queue); +#endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); From c1853fbadcba1497f4907971e7107888e0714c81 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:46 +0000 Subject: [PATCH 754/773] watch_queue: Fix to release page in ->release() When a pipe ring descriptor points to a notification message, the refcount on the backing page is incremented by the generic get function, but the release function, which marks the bitmap, doesn't drop the page ref. Fix this by calling generic_pipe_buf_release() at the end of watch_queue_pipe_buf_release(). Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 427b0318e303..dfb3a7e28280 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit += page->index; set_bit(bit, wqueue->notes_bitmap); + generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing From 96a4d8912b28451cd62825fd7caa0e66e091d938 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:08 +0000 Subject: [PATCH 755/773] watch_queue: Fix to always request a pow-of-2 pipe ring size The pipe ring size must always be a power of 2 as the head and tail pointers are masked off by AND'ing with the size of the ring - 1. watch_queue_set_size(), however, lets you specify any number of notes between 1 and 511. This number is passed through to pipe_resize_ring() without checking/forcing its alignment. Fix this by rounding the number of slots required up to the nearest power of two. The request is meant to guarantee that at least that many notifications can be generated before the queue is full, so rounding down isn't an option, but, alternatively, it may be better to give an error if we aren't allowed to allocate that much ring space. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index dfb3a7e28280..4bcd400984a7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -244,7 +244,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } - ret = pipe_resize_ring(pipe, nr_notes); + ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; From a66bd7575b5f449ee0ba20cfd21c3bc5b04ef361 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 11 Mar 2022 13:24:15 +0000 Subject: [PATCH 756/773] watch_queue: Use the bitmap API when applicable Use bitmap_alloc() to simplify code, improve the semantic and reduce some open-coded arithmetic in allocator arguments. Also change a memset(0xff) into an equivalent bitmap_fill() to keep consistency. Signed-off-by: Christophe JAILLET Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 4bcd400984a7..5b516eb2c7cc 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -220,7 +220,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) struct page **pages; unsigned long *bitmap; unsigned long user_bufs; - unsigned int bmsize; int ret, i, nr_pages; if (!wqueue) @@ -259,13 +258,11 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } - bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; - bmsize *= sizeof(unsigned long); - bitmap = kmalloc(bmsize, GFP_KERNEL); + bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; - memset(bitmap, 0xff, bmsize); + bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; From 3b4c0371928c17af03e8397ac842346624017ce6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:22 +0000 Subject: [PATCH 757/773] watch_queue: Fix the alloc bitmap size to reflect notes allocated Currently, watch_queue_set_size() sets the number of notes available in wqueue->nr_notes according to the number of notes allocated, but sets the size of the bitmap to the unrounded number of notes originally asked for. Fix this by setting the bitmap size to the number of notes we're actually going to make available (ie. the number allocated). Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5b516eb2c7cc..9c476d2cbac0 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -243,6 +243,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } + nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; @@ -266,7 +267,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; - wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + wqueue->nr_notes = nr_notes; return 0; error_p: From 7ea1a0124b6da246b5bc8c66cddaafd36acf3ecb Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:29 +0000 Subject: [PATCH 758/773] watch_queue: Free the alloc bitmap when the watch_queue is torn down Free the watch_queue note allocation bitmap when the watch_queue is destroyed. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c476d2cbac0..c12267ccc70e 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -370,6 +370,7 @@ static void __put_watch_queue(struct kref *kref) for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); + bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) From 2ed147f015af2b48f41c6f0b6746aa9ea85c19f3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:36 +0000 Subject: [PATCH 759/773] watch_queue: Fix lack of barrier/sync/lock between post and read There's nothing to synchronise post_one_notification() versus pipe_read(). Whilst posting is done under pipe->rd_wait.lock, the reader only takes pipe->mutex which cannot bar notification posting as that may need to be made from contexts that cannot sleep. Fix this by setting pipe->head with a barrier in post_one_notification() and reading pipe->head with a barrier in pipe_read(). If that's not sufficient, the rd_wait.lock will need to be taken, possibly in a ->confirm() op so that it only applies to notifications. The lock would, however, have to be dropped before copy_page_to_iter() is invoked. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/pipe.c | 3 ++- kernel/watch_queue.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 4eb88bc138bb..2667db9506e2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - unsigned int head = pipe->head; + /* Read ->head with a barrier vs post_one_notification() */ + unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index c12267ccc70e..37bcd900fd77 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -113,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; - pipe->head = head + 1; + smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); From 4edc0760412b0c4ecefc7e02cb855b310b122825 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:24:47 +0000 Subject: [PATCH 760/773] watch_queue: Make comment about setting ->defunct more accurate watch_queue_clear() has a comment stating that setting ->defunct to true preventing new additions as well as preventing notifications. Whilst the latter is true, the first bit is superfluous since at the time this function is called, the pipe cannot be accessed to add new event sources. Remove the "new additions" bit from the comment. Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 37bcd900fd77..00703444a219 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -566,7 +566,7 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new additions and prevent notifications from happening */ + /* Prevent new notifications from being stored. */ wqueue->defunct = true; while (!hlist_empty(&wqueue->watches)) { From a365a65f9ca1ceb9cf1ac29db4a4f51df7c507ad Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Thu, 10 Mar 2022 20:09:15 +0800 Subject: [PATCH 761/773] x86/traps: Mark do_int3() NOKPROBE_SYMBOL Since kprobe_int3_handler() is called in do_int3(), probing do_int3() can cause a breakpoint recursion and crash the kernel. Therefore, do_int3() should be marked as NOKPROBE_SYMBOL. Fixes: 21e28290b317 ("x86/traps: Split int3 handler up") Signed-off-by: Li Huafei Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Cc: Link: https://lore.kernel.org/r/20220310120915.63349-1-lihuafei1@huawei.com --- arch/x86/kernel/traps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c9d566dcf89a..8143693a7ea6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -659,6 +659,7 @@ static bool do_int3(struct pt_regs *regs) return res == NOTIFY_STOP; } +NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { From 173ce1ca47c489135b2799f70f550e1319ba36d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 15:58:21 +0000 Subject: [PATCH 762/773] afs: Fix potential thrashing in afs writeback In afs_writepages_region(), if the dirty page we find is undergoing writeback or write to cache, but the sync_mode is WB_SYNC_NONE, we go round the loop trying the same page again and again with no pausing or waiting unless and until another thread manages to clear the writeback and fscache flags. Fix this with three measures: (1) Advance start to after the page we found. (2) Break out of the loop and return if rescheduling is requested. (3) Arbitrarily give up after a maximum of 5 skips. Fixes: 31143d5d515e ("AFS: implement basic file write support") Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne Acked-by: Marc Dionne Link: https://lore.kernel.org/r/164692725757.2097000.2060513769492301854.stgit@warthog.procyon.org.uk/ # v1 Signed-off-by: Linus Torvalds --- fs/afs/write.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 5e9157d0da29..f447c902318d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -703,7 +703,7 @@ static int afs_writepages_region(struct address_space *mapping, struct folio *folio; struct page *head_page; ssize_t ret; - int n; + int n, skips = 0; _enter("%llx,%llx,", start, end); @@ -754,8 +754,15 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif + } else { + start += folio_size(folio); } folio_put(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) + break; + skips++; + } continue; } From 413a4a6b0b5553f2423d210f65e98c211b99c3f8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 16:02:18 +0000 Subject: [PATCH 763/773] cachefiles: Fix volume coherency attribute A network filesystem may set coherency data on a volume cookie, and if given, cachefiles will store this in an xattr on the directory in the cache corresponding to the volume. The function that sets the xattr just stores the contents of the volume coherency buffer directly into the xattr, with nothing added; the checking function, on the other hand, has a cut'n'paste error whereby it tries to interpret the xattr contents as would be the xattr on an ordinary file (using the cachefiles_xattr struct). This results in a failure to match the coherency data because the buffer ends up being shifted by 18 bytes. Fix this by defining a structure specifically for the volume xattr and making both the setting and checking functions use it. Since the volume coherency doesn't work if used, take the opportunity to insert a reserved field for future use, set it to 0 and check that it is 0. Log mismatch through the appropriate tracepoint. Note that this only affects cifs; 9p, afs, ceph and nfs don't use the volume coherency data at the moment. Fixes: 32e150037dce ("fscache, cachefiles: Store the volume coherency data") Reported-by: Rohith Surabattula Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Steve French cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Signed-off-by: Linus Torvalds --- fs/cachefiles/xattr.c | 23 ++++++++++++++++++++--- include/trace/events/cachefiles.h | 2 ++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 83f41bd0c3a9..35465109d9c4 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -28,6 +28,11 @@ struct cachefiles_xattr { static const char cachefiles_xattr_cache[] = XATTR_USER_PREFIX "CacheFiles.cache"; +struct cachefiles_vol_xattr { + __be32 reserved; /* Reserved, should be 0 */ + __u8 data[]; /* netfs volume coherency data */ +} __packed; + /* * set the state xattr on a cache file */ @@ -185,6 +190,7 @@ void cachefiles_prepare_to_write(struct fscache_cookie *cookie) */ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) { + struct cachefiles_vol_xattr *buf; unsigned int len = volume->vcookie->coherency_len; const void *p = volume->vcookie->coherency; struct dentry *dentry = volume->dentry; @@ -192,10 +198,17 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) _enter("%x,#%d", volume->vcookie->debug_id, len); + len += sizeof(*buf); + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + buf->reserved = cpu_to_be32(0); + memcpy(buf->data, p, len); + ret = cachefiles_inject_write_error(); if (ret == 0) ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, - p, len, 0); + buf, len, 0); if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, cachefiles_trace_setxattr_error); @@ -209,6 +222,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) cachefiles_coherency_vol_set_ok); } + kfree(buf); _leave(" = %d", ret); return ret == 0; } @@ -218,7 +232,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) */ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) { - struct cachefiles_xattr *buf; + struct cachefiles_vol_xattr *buf; struct dentry *dentry = volume->dentry; unsigned int len = volume->vcookie->coherency_len; const void *p = volume->vcookie->coherency; @@ -228,6 +242,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) _enter(""); + len += sizeof(*buf); buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -245,7 +260,9 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) "Failed to read xattr with error %zd", xlen); } why = cachefiles_coherency_vol_check_xattr; - } else if (memcmp(buf->data, p, len) != 0) { + } else if (buf->reserved != cpu_to_be32(0)) { + why = cachefiles_coherency_vol_check_resv; + } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) { why = cachefiles_coherency_vol_check_cmp; } else { why = cachefiles_coherency_vol_check_ok; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index c6f5aa74db89..2c530637e10a 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -56,6 +56,7 @@ enum cachefiles_coherency_trace { cachefiles_coherency_set_ok, cachefiles_coherency_vol_check_cmp, cachefiles_coherency_vol_check_ok, + cachefiles_coherency_vol_check_resv, cachefiles_coherency_vol_check_xattr, cachefiles_coherency_vol_set_fail, cachefiles_coherency_vol_set_ok, @@ -139,6 +140,7 @@ enum cachefiles_error_trace { EM(cachefiles_coherency_set_ok, "SET ok ") \ EM(cachefiles_coherency_vol_check_cmp, "VOL BAD cmp ") \ EM(cachefiles_coherency_vol_check_ok, "VOL OK ") \ + EM(cachefiles_coherency_vol_check_resv, "VOL BAD resv") \ EM(cachefiles_coherency_vol_check_xattr,"VOL BAD xatt") \ EM(cachefiles_coherency_vol_set_fail, "VOL SET fail") \ E_(cachefiles_coherency_vol_set_ok, "VOL SET ok ") From 08999b2489b4c9b939d7483dbd03702ee4576d96 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 4 Mar 2022 00:38:58 +0200 Subject: [PATCH 764/773] x86/sgx: Free backing memory after faulting the enclave page There is a limited amount of SGX memory (EPC) on each system. When that memory is used up, SGX has its own swapping mechanism which is similar in concept but totally separate from the core mm/* code. Instead of swapping to disk, SGX swaps from EPC to normal RAM. That normal RAM comes from a shared memory pseudo-file and can itself be swapped by the core mm code. There is a hierarchy like this: EPC <-> shmem <-> disk After data is swapped back in from shmem to EPC, the shmem backing storage needs to be freed. Currently, the backing shmem is not freed. This effectively wastes the shmem while the enclave is running. The memory is recovered when the enclave is destroyed and the backing storage freed. Sort this out by freeing memory with shmem_truncate_range(), as soon as a page is faulted back to the EPC. In addition, free the memory for PCMD pages as soon as all PCMD's in a page have been marked as unused by zeroing its contents. Cc: stable@vger.kernel.org Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer") Reported-by: Dave Hansen Signed-off-by: Jarkko Sakkinen Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220303223859.273187-1-jarkko@kernel.org --- arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 48afe96ae0f0..7c63a1911fae 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,30 @@ #include "encls.h" #include "sgx.h" +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. + */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + /* * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC * Pages" in the SDM. @@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, { unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; struct sgx_pageinfo pginfo; struct sgx_backing b; - pgoff_t page_index; + bool pcmd_page_empty; + u8 *pcmd_page; int ret; if (secs_page) @@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + ret = sgx_encl_get_backing(encl, page_index, &b); if (ret) return ret; pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + - b.pcmd_offset; + pcmd_page = kmap_atomic(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); @@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = -EFAULT; } - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + + /* + * The area for the PCMD in the page was zeroed above. Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); sgx_encl_put_backing(&b, false); + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty) + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + return ret; } @@ -579,7 +620,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); struct page *contents; struct page *pcmd; @@ -587,7 +628,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, if (IS_ERR(contents)) return PTR_ERR(contents); - pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); if (IS_ERR(pcmd)) { put_page(contents); return PTR_ERR(pcmd); @@ -596,9 +637,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; - backing->pcmd_offset = - (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * - sizeof(struct sgx_pcmd); + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); return 0; } From 6c7cb60bff7aec24b834343ff433125f469886a3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Mar 2022 17:13:17 +0000 Subject: [PATCH 765/773] ARM: fix Thumb2 regression with Spectre BHB When building for Thumb2, the vectors make use of a local label. Sadly, the Spectre BHB code also uses a local label with the same number which results in the Thumb2 reference pointing at the wrong place. Fix this by changing the number used for the Spectre BHB local label. Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Tested-by: Nathan Chancellor Signed-off-by: Russell King (Oracle) Signed-off-by: Linus Torvalds --- arch/arm/kernel/entry-armv.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 676703cbfe4b..ee3f7a599181 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1040,9 +1040,9 @@ vector_bhb_loop8_\name: @ bhb workaround mov r0, #8 -1: b . + 4 +3: b . + 4 subs r0, r0, #1 - bne 1b + bne 3b dsb isb b 2b From 68453767131a5deec1e8f9ac92a9042f929e585d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 11 Mar 2022 11:49:12 -0800 Subject: [PATCH 766/773] ARM: Spectre-BHB: provide empty stub for non-config When CONFIG_GENERIC_CPU_VULNERABILITIES is not set, references to spectre_v2_update_state() cause a build error, so provide an empty stub for that function when the Kconfig option is not set. Fixes this build error: arm-linux-gnueabi-ld: arch/arm/mm/proc-v7-bugs.o: in function `cpu_v7_bugs_init': proc-v7-bugs.c:(.text+0x52): undefined reference to `spectre_v2_update_state' arm-linux-gnueabi-ld: proc-v7-bugs.c:(.text+0x82): undefined reference to `spectre_v2_update_state' Fixes: b9baf5c8c5c3 ("ARM: Spectre-BHB workaround") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Russell King Cc: Catalin Marinas Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Acked-by: Russell King (Oracle) Signed-off-by: Linus Torvalds --- arch/arm/include/asm/spectre.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/include/asm/spectre.h b/arch/arm/include/asm/spectre.h index d1fa5607d3aa..85f9e538fb32 100644 --- a/arch/arm/include/asm/spectre.h +++ b/arch/arm/include/asm/spectre.h @@ -25,7 +25,13 @@ enum { SPECTRE_V2_METHOD_LOOP8 = BIT(__SPECTRE_V2_METHOD_LOOP8), }; +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES void spectre_v2_update_state(unsigned int state, unsigned int methods); +#else +static inline void spectre_v2_update_state(unsigned int state, + unsigned int methods) +{} +#endif int spectre_bhb_update_vectors(unsigned int method); From 3755d35ee1d2454b20b8a1e20d790e56201678a4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 3 Feb 2022 10:39:22 +0100 Subject: [PATCH 767/773] drm/panel: Select DRM_DP_HELPER for DRM_PANEL_EDP As reported in [1], DRM_PANEL_EDP depends on DRM_DP_HELPER. Select the option to fix the build failure. The error message is shown below. arm-linux-gnueabihf-ld: drivers/gpu/drm/panel/panel-edp.o: in function `panel_edp_probe': panel-edp.c:(.text+0xb74): undefined reference to `drm_panel_dp_aux_backlight' make[1]: *** [/builds/linux/Makefile:1222: vmlinux] Error 1 The issue has been reported before, when DisplayPort helpers were hidden behind the option CONFIG_DRM_KMS_HELPER. [2] v2: * fix and expand commit description (Arnd) Signed-off-by: Thomas Zimmermann Fixes: 9d6366e743f3 ("drm: fb_helper: improve CONFIG_FB dependency") Reported-by: Naresh Kamboju Reported-by: Linux Kernel Functional Testing Reviewed-by: Lyude Paul Acked-by: Sam Ravnborg Link: https://lore.kernel.org/dri-devel/CA+G9fYvN0NyaVkRQmA1O6rX7H8PPaZrUAD7=RDy33QY9rUU-9g@mail.gmail.com/ # [1] Link: https://lore.kernel.org/all/20211117062704.14671-1-rdunlap@infradead.org/ # [2] Cc: Thomas Zimmermann Cc: Lyude Paul Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: dri-devel@lists.freedesktop.org Link: https://patchwork.freedesktop.org/patch/msgid/20220203093922.20754-1-tzimmermann@suse.de Signed-off-by: Dave Airlie --- drivers/gpu/drm/panel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index 434c2861bb40..0aec5a10b064 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -106,6 +106,7 @@ config DRM_PANEL_EDP depends on PM select VIDEOMODE_HELPERS select DRM_DP_AUX_BUS + select DRM_DP_HELPER help DRM panel driver for dumb eDP panels that need at most a regulator and a GPIO to be powered up. Optionally a backlight can be attached so From 3ec94eeaff9ad58a76be2232068b4a2546b2f6bb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 12:53:44 -0300 Subject: [PATCH 768/773] tools kvm headers arm64: Update KVM headers from the kernel sources To pick the changes from: a5905d6af492ee6a ("KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated") That don't causes any changes in tooling (when built on x86), only addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm64/include/uapi/asm/kvm.h' diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: James Morse Link: https://lore.kernel.org/lkml/YiyhAK6sVPc83FaI@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/kvm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index b3edde68bc3e..323e251ed37b 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -281,6 +281,11 @@ struct kvm_arm_copy_mte_tags { #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 KVM_REG_ARM_FW_REG(3) +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL 0 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL 1 +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED 2 + /* SVE registers */ #define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT) From ec9d50ace39925f7fd0302bf0fad640e2c9826ea Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: [PATCH 769/773] tools headers cpufeatures: Sync with the kernel sources To pick the changes from: d45476d983240937 ("x86/speculation: Rename RETPOLINE_AMD to RETPOLINE_LFENCE") Its just a comment fixup. This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Borislav Petkov Cc: Peter Zijlstra (Intel) Link: https://lore.kernel.org/lkml/YiyiHatGaJQM7l/Y@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index ab4e53715d8a..65d147974f8d 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -204,7 +204,7 @@ /* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCEs for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ From a7a72631f62445e3671b7cab5ad01f856c1aa90d Mon Sep 17 00:00:00 2001 From: Weiguo Li Date: Fri, 11 Mar 2022 21:06:57 +0800 Subject: [PATCH 770/773] perf parse-events: Fix NULL check against wrong variable We did a null check after "tmp->symbol = strdup(...)", but we checked "list->symbol" other than "tmp->symbol". Reviewed-by: John Garry Signed-off-by: Weiguo Li Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/tencent_DF39269807EC9425E24787E6DB632441A405@qq.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9739b05b999e..dfb50a5f83d0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2193,7 +2193,7 @@ int perf_pmu__test_parse_init(void) for (i = 0; i < ARRAY_SIZE(symbols); i++, tmp++) { tmp->type = symbols[i].type; tmp->symbol = strdup(symbols[i].symbol); - if (!list->symbol) + if (!tmp->symbol) goto err_free; } From 073a15c3512f6b8d36c0c05992cf31e845f4dfe0 Mon Sep 17 00:00:00 2001 From: Weiguo Li Date: Fri, 11 Mar 2022 21:07:16 +0800 Subject: [PATCH 771/773] perf bench: Fix NULL check against wrong variable We did a NULL check after "epollfdp = calloc(...)", but we checked "epollfd" instead of "epollfdp". Signed-off-by: Weiguo Li Acked-by: Davidlohr Bueso Cc: Alexander Shishkin Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/tencent_B5D64530EB9C7DBB8D2C88A0C790F1489D0A@qq.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/epoll-ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 740ae764537e..134612bde0cb 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -106,7 +106,7 @@ static void nest_epollfd(void) printinfo("Nesting level(s): %d\n", nested); epollfdp = calloc(nested, sizeof(int)); - if (!epollfd) + if (!epollfdp) err(EXIT_FAILURE, "calloc"); for (i = 0; i < nested; i++) { From 91c9923a473a694eb1c5c01ab778a77114969707 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Mon, 7 Mar 2022 23:16:27 +0800 Subject: [PATCH 772/773] perf parse: Fix event parser error for hybrid systems This bug happened on hybrid systems when both cpu_core and cpu_atom have the same event name such as "UOPS_RETIRED.MS" while their event terms are different, then during perf stat, the event for cpu_atom will parse fail and then no output for cpu_atom. UOPS_RETIRED.MS -> cpu_core/period=0x1e8483,umask=0x4,event=0xc2,frontend=0x8/ UOPS_RETIRED.MS -> cpu_atom/period=0x1e8483,umask=0x1,event=0xc2/ It is because event terms in the "head" of parse_events_multi_pmu_add will be changed to event terms for cpu_core after parsing UOPS_RETIRED.MS for cpu_core, then when parsing the same event for cpu_atom, it still uses the event terms for cpu_core, but event terms for cpu_atom are different with cpu_core, the event parses for cpu_atom will fail. This patch fixes it, the event terms should be parsed from the original event. This patch can work for the hybrid systems that have the same event in more than 2 PMUs. It also can work in non-hybrid systems. Before: # perf stat -v -e UOPS_RETIRED.MS -a sleep 1 Using CPUID GenuineIntel-6-97-1 UOPS_RETIRED.MS -> cpu_core/period=0x1e8483,umask=0x4,event=0xc2,frontend=0x8/ Control descriptor is not initialized UOPS_RETIRED.MS: 2737845 16068518485 16068518485 Performance counter stats for 'system wide': 2,737,845 cpu_core/UOPS_RETIRED.MS/ 1.002553850 seconds time elapsed After: # perf stat -v -e UOPS_RETIRED.MS -a sleep 1 Using CPUID GenuineIntel-6-97-1 UOPS_RETIRED.MS -> cpu_core/period=0x1e8483,umask=0x4,event=0xc2,frontend=0x8/ UOPS_RETIRED.MS -> cpu_atom/period=0x1e8483,umask=0x1,event=0xc2/ Control descriptor is not initialized UOPS_RETIRED.MS: 1977555 16076950711 16076950711 UOPS_RETIRED.MS: 568684 8038694234 8038694234 Performance counter stats for 'system wide': 1,977,555 cpu_core/UOPS_RETIRED.MS/ 568,684 cpu_atom/UOPS_RETIRED.MS/ 1.004758259 seconds time elapsed Fixes: fb0811535e92c6c1 ("perf parse-events: Allow config on kernel PMU events") Reviewed-by: Kan Liang Signed-off-by: Zhengjun Xing Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20220307151627.30049-1-zhengjun.xing@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index dfb50a5f83d0..24997925ae00 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1648,6 +1648,7 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, { struct parse_events_term *term; struct list_head *list = NULL; + struct list_head *orig_head = NULL; struct perf_pmu *pmu = NULL; int ok = 0; char *config; @@ -1674,7 +1675,6 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } list_add_tail(&term->list, head); - /* Add it for all PMUs that support the alias */ list = malloc(sizeof(struct list_head)); if (!list) @@ -1687,13 +1687,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, list_for_each_entry(alias, &pmu->aliases, list) { if (!strcasecmp(alias->name, str)) { + parse_events_copy_term_list(head, &orig_head); if (!parse_events_add_pmu(parse_state, list, - pmu->name, head, + pmu->name, orig_head, true, true)) { pr_debug("%s -> %s/%s/\n", str, pmu->name, alias->str); ok++; } + parse_events_terms__delete(orig_head); } } } From 09688c0166e76ce2fb85e86b9d99be8b0084cdf9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Mar 2022 13:23:37 -0700 Subject: [PATCH 773/773] Linux 5.17-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 87f672473523..55a30ca69350 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Superb Owl # *DOCUMENTATION*