From 1111b7836c80ec4094318e1dfb3a5abe6df95afb Mon Sep 17 00:00:00 2001 From: Xie Yisheng Date: Thu, 31 May 2018 19:11:15 +0800 Subject: [PATCH 01/12] cpufreq: intel_pstate: use match_string() helper match_string() returns the index of an array for a matching string, which can be used instead of open coded variant. Reviewed-by: Andy Shevchenko Signed-off-by: Yisheng Xie Acked-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index ece120da3353..a5c368425e36 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -657,21 +657,18 @@ static ssize_t store_energy_performance_preference( { struct cpudata *cpu_data = all_cpu_data[policy->cpu]; char str_preference[21]; - int ret, i = 0; + int ret; ret = sscanf(buf, "%20s", str_preference); if (ret != 1) return -EINVAL; - while (energy_perf_strings[i] != NULL) { - if (!strcmp(str_preference, energy_perf_strings[i])) { - intel_pstate_set_energy_pref_index(cpu_data, i); - return count; - } - ++i; - } + ret = match_string(energy_perf_strings, -1, str_preference); + if (ret < 0) + return ret; - return -EINVAL; + intel_pstate_set_energy_pref_index(cpu_data, ret); + return count; } static ssize_t show_energy_performance_preference( From a1d0015423920672c3f32e69bd9feef2ec629b57 Mon Sep 17 00:00:00 2001 From: Bastian Stender Date: Fri, 8 Jun 2018 11:06:39 +0200 Subject: [PATCH 02/12] cpufreq: imx6q/thermal: imx: register cooling device depending on OF The cooling device should be part of the i.MX cpufreq driver, but it cannot be removed for the sake of DT stability. So turn the cooling device registration into a separate function and perform the registration only if the CPU OF node does not have the #cooling-cells property. 
Use of_cpufreq_power_cooling_register in imx_thermal code to link the cooling device to the device tree node provided. This makes it possible to bind the cpufreq cooling device to a custom thermal zone via a cooling-maps entry like: cooling-maps { map0 { trip = <&board_alert>; cooling-device = <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; }; }; Assuming a cpu node exists with label "cpu0" and #cooling-cells property. Signed-off-by: Bastian Stender Reviewed-by: Lucas Stach Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/imx6q-cpufreq.c | 21 +++++++++++++++++++++ drivers/thermal/imx_thermal.c | 28 ++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index 8b3c2a79ad6c..b2ff423ad7f8 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ static struct clk_bulk_data clks[] = { }; static struct device *cpu_dev; +static struct thermal_cooling_device *cdev; static bool free_opp; static struct cpufreq_frequency_table *freq_table; static unsigned int max_freq; @@ -191,6 +193,16 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index) return 0; } +static void imx6q_cpufreq_ready(struct cpufreq_policy *policy) +{ + cdev = of_cpufreq_cooling_register(policy); + + if (!cdev) + dev_err(cpu_dev, + "running cpufreq without cooling device: %ld\n", + PTR_ERR(cdev)); +} + static int imx6q_cpufreq_init(struct cpufreq_policy *policy) { int ret; @@ -202,13 +214,22 @@ static int imx6q_cpufreq_init(struct cpufreq_policy *policy) return ret; } +static int imx6q_cpufreq_exit(struct cpufreq_policy *policy) +{ + cpufreq_cooling_unregister(cdev); + + return 0; +} + static struct cpufreq_driver imx6q_cpufreq_driver = { .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = imx6q_set_target, 
.get = cpufreq_generic_get, .init = imx6q_cpufreq_init, + .exit = imx6q_cpufreq_exit, .name = "imx6q-cpufreq", + .ready = imx6q_cpufreq_ready, .attr = cpufreq_generic_attr, .suspend = cpufreq_generic_suspend, }; diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 334d98be03b9..cbfcca828cd7 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -3,6 +3,7 @@ // Copyright 2013 Freescale Semiconductor, Inc. #include +#include #include #include #include @@ -644,6 +645,27 @@ static const struct of_device_id of_imx_thermal_match[] = { }; MODULE_DEVICE_TABLE(of, of_imx_thermal_match); +/* + * Create cooling device in case no #cooling-cells property is available in + * CPU node + */ +static int imx_thermal_register_legacy_cooling(struct imx_thermal_data *data) +{ + struct device_node *np = of_get_cpu_node(data->policy->cpu, NULL); + int ret; + + if (!np || !of_find_property(np, "#cooling-cells", NULL)) { + data->cdev = cpufreq_cooling_register(data->policy); + if (IS_ERR(data->cdev)) { + ret = PTR_ERR(data->cdev); + cpufreq_cpu_put(data->policy); + return ret; + } + } + + return 0; +} + static int imx_thermal_probe(struct platform_device *pdev) { struct imx_thermal_data *data; @@ -724,12 +746,10 @@ static int imx_thermal_probe(struct platform_device *pdev) return -EPROBE_DEFER; } - data->cdev = cpufreq_cooling_register(data->policy); - if (IS_ERR(data->cdev)) { - ret = PTR_ERR(data->cdev); + ret = imx_thermal_register_legacy_cooling(data); + if (ret) { dev_err(&pdev->dev, "failed to register cpufreq cooling device: %d\n", ret); - cpufreq_cpu_put(data->policy); return ret; } From 2851b907091f2a04d19f50a5fc6a3aeed9e88dd3 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Wed, 4 Jul 2018 15:25:24 +0200 Subject: [PATCH 03/12] dt-bindings: marvell: Add documentation for the Armada 3700 AVS binding Extend the documentation of the Armada 37xx SoC with the Adaptive Voltage Scaling (AVS) registers. 
Signed-off-by: Gregory CLEMENT Acked-by: Viresh Kumar Reviewed-by: Rob Herring Signed-off-by: Rafael J. Wysocki --- .../bindings/arm/marvell/armada-37xx.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt index 35c3c3460d17..eddde4faef01 100644 --- a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt +++ b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt @@ -33,3 +33,18 @@ nb_pm: syscon@14000 { compatible = "marvell,armada-3700-nb-pm", "syscon"; reg = <0x14000 0x60>; } + +AVS +--- + +For AVS another component is needed: + +Required properties: +- compatible : should contain "marvell,armada-3700-avs", "syscon"; +- reg : the register start and length for the AVS + +Example: +avs: avs@11500 { + compatible = "marvell,armada-3700-avs", "syscon"; + reg = <0x11500 0x40>; +} From 1c3528232f4ba608cc2c31c7a8a55e0dbd6cb200 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 19 Jun 2018 14:44:01 +0200 Subject: [PATCH 04/12] cpufreq: armada-37xx: Add AVS support Armada 37xx supports Adaptive Voltage Scaling and thanks to this patch a voltage is associated with each load level. Signed-off-by: Gregory CLEMENT Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/armada-37xx-cpufreq.c | 163 +++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c index 739da90ff3f6..75491fc841a6 100644 --- a/drivers/cpufreq/armada-37xx-cpufreq.c +++ b/drivers/cpufreq/armada-37xx-cpufreq.c @@ -51,6 +51,16 @@ #define ARMADA_37XX_DVFS_LOAD_2 2 #define ARMADA_37XX_DVFS_LOAD_3 3 +/* AVS register set */ +#define ARMADA_37XX_AVS_CTL0 0x0 +#define ARMADA_37XX_AVS_ENABLE BIT(30) +#define ARMADA_37XX_AVS_HIGH_VDD_LIMIT 16 +#define ARMADA_37XX_AVS_LOW_VDD_LIMIT 22 +#define ARMADA_37XX_AVS_VDD_MASK 0x3F +#define ARMADA_37XX_AVS_CTL2 0x8 +#define ARMADA_37XX_AVS_LOW_VDD_EN BIT(6) +#define ARMADA_37XX_AVS_VSET(x) (0x1C + 4 * (x)) + /* * On Armada 37xx the Power management manages 4 level of CPU load, * each level can be associated with a CPU clock source, a CPU @@ -58,6 +68,17 @@ */ #define LOAD_LEVEL_NR 4 +#define MIN_VOLT_MV 1000 + +/* AVS value for the corresponding voltage (in mV) */ +static int avs_map[] = { + 747, 758, 770, 782, 793, 805, 817, 828, 840, 852, 863, 875, 887, 898, + 910, 922, 933, 945, 957, 968, 980, 992, 1003, 1015, 1027, 1038, 1050, + 1062, 1073, 1085, 1097, 1108, 1120, 1132, 1143, 1155, 1167, 1178, 1190, + 1202, 1213, 1225, 1237, 1248, 1260, 1272, 1283, 1295, 1307, 1318, 1330, + 1342 +}; + struct armada37xx_cpufreq_state { struct regmap *regmap; u32 nb_l0l1; @@ -71,6 +92,7 @@ static struct armada37xx_cpufreq_state *armada37xx_cpufreq_state; struct armada_37xx_dvfs { u32 cpu_freq_max; u8 divider[LOAD_LEVEL_NR]; + u32 avs[LOAD_LEVEL_NR]; }; static struct armada_37xx_dvfs armada_37xx_dvfs[] = { @@ -148,6 +170,128 @@ static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base, clk_set_parent(clk, parent); } +/* + * Find out the armada 37x supported AVS value whose voltage value is + * the round-up closest to the target voltage value. 
+ */ +static u32 armada_37xx_avs_val_match(int target_vm) +{ + u32 avs; + + /* Find out the round-up closest supported voltage value */ + for (avs = 0; avs < ARRAY_SIZE(avs_map); avs++) + if (avs_map[avs] >= target_vm) + break; + + /* + * If all supported voltages are smaller than target one, + * choose the largest supported voltage + */ + if (avs == ARRAY_SIZE(avs_map)) + avs = ARRAY_SIZE(avs_map) - 1; + + return avs; +} + +/* + * For Armada 37xx soc, L0(VSET0) VDD AVS value is set to SVC revision + * value or a default value when SVC is not supported. + * - L0 can be read out from the register of AVS_CTRL_0 and L0 voltage + * can be got from the mapping table of avs_map. + * - L1 voltage should be about 100mv smaller than L0 voltage + * - L2 & L3 voltage should be about 150mv smaller than L0 voltage. + * This function calculates L1 & L2 & L3 AVS values dynamically based + * on L0 voltage and fill all AVS values to the AVS value table. + */ +static void __init armada37xx_cpufreq_avs_configure(struct regmap *base, + struct armada_37xx_dvfs *dvfs) +{ + unsigned int target_vm; + int load_level = 0; + u32 l0_vdd_min; + + if (base == NULL) + return; + + /* Get L0 VDD min value */ + regmap_read(base, ARMADA_37XX_AVS_CTL0, &l0_vdd_min); + l0_vdd_min = (l0_vdd_min >> ARMADA_37XX_AVS_LOW_VDD_LIMIT) & + ARMADA_37XX_AVS_VDD_MASK; + if (l0_vdd_min >= ARRAY_SIZE(avs_map)) { + pr_err("L0 VDD MIN %d is not correct.\n", l0_vdd_min); + return; + } + dvfs->avs[0] = l0_vdd_min; + + if (avs_map[l0_vdd_min] <= MIN_VOLT_MV) { + /* + * If L0 voltage is smaller than 1000mv, then all VDD sets + * use L0 voltage; + */ + u32 avs_min = armada_37xx_avs_val_match(MIN_VOLT_MV); + + for (load_level = 1; load_level < LOAD_LEVEL_NR; load_level++) + dvfs->avs[load_level] = avs_min; + + return; + } + + /* + * L1 voltage is equal to L0 voltage - 100mv and it must be + * larger than 1000mv + */ + + target_vm = avs_map[l0_vdd_min] - 100; + target_vm = target_vm > MIN_VOLT_MV ? 
target_vm : MIN_VOLT_MV; + dvfs->avs[1] = armada_37xx_avs_val_match(target_vm); + + /* + * L2 & L3 voltage is equal to L0 voltage - 150mv and it must + * be larger than 1000mv + */ + target_vm = avs_map[l0_vdd_min] - 150; + target_vm = target_vm > MIN_VOLT_MV ? target_vm : MIN_VOLT_MV; + dvfs->avs[2] = dvfs->avs[3] = armada_37xx_avs_val_match(target_vm); +} + +static void __init armada37xx_cpufreq_avs_setup(struct regmap *base, + struct armada_37xx_dvfs *dvfs) +{ + unsigned int avs_val = 0, freq; + int load_level = 0; + + if (base == NULL) + return; + + /* Disable AVS before the configuration */ + regmap_update_bits(base, ARMADA_37XX_AVS_CTL0, + ARMADA_37XX_AVS_ENABLE, 0); + + + /* Enable low voltage mode */ + regmap_update_bits(base, ARMADA_37XX_AVS_CTL2, + ARMADA_37XX_AVS_LOW_VDD_EN, + ARMADA_37XX_AVS_LOW_VDD_EN); + + + for (load_level = 1; load_level < LOAD_LEVEL_NR; load_level++) { + freq = dvfs->cpu_freq_max / dvfs->divider[load_level]; + + avs_val = dvfs->avs[load_level]; + regmap_update_bits(base, ARMADA_37XX_AVS_VSET(load_level-1), + ARMADA_37XX_AVS_VDD_MASK << ARMADA_37XX_AVS_HIGH_VDD_LIMIT | + ARMADA_37XX_AVS_VDD_MASK << ARMADA_37XX_AVS_LOW_VDD_LIMIT, + avs_val << ARMADA_37XX_AVS_HIGH_VDD_LIMIT | + avs_val << ARMADA_37XX_AVS_LOW_VDD_LIMIT); + } + + /* Enable AVS after the configuration */ + regmap_update_bits(base, ARMADA_37XX_AVS_CTL0, + ARMADA_37XX_AVS_ENABLE, + ARMADA_37XX_AVS_ENABLE); + +} + static void armada37xx_cpufreq_disable_dvfs(struct regmap *base) { unsigned int reg = ARMADA_37XX_NB_DYN_MOD, @@ -216,7 +360,7 @@ static int __init armada37xx_cpufreq_driver_init(void) struct platform_device *pdev; unsigned long freq; unsigned int cur_frequency; - struct regmap *nb_pm_base; + struct regmap *nb_pm_base, *avs_base; struct device *cpu_dev; int load_lvl, ret; struct clk *clk; @@ -227,6 +371,14 @@ static int __init armada37xx_cpufreq_driver_init(void) if (IS_ERR(nb_pm_base)) return -ENODEV; + avs_base = + 
syscon_regmap_lookup_by_compatible("marvell,armada-3700-avs"); + + /* if AVS is not present don't use it but still try to setup dvfs */ + if (IS_ERR(avs_base)) { + pr_info("Syscon failed for Adapting Voltage Scaling: skip it\n"); + avs_base = NULL; + } /* Before doing any configuration on the DVFS first, disable it */ armada37xx_cpufreq_disable_dvfs(nb_pm_base); @@ -270,16 +422,21 @@ static int __init armada37xx_cpufreq_driver_init(void) armada37xx_cpufreq_state->regmap = nb_pm_base; + armada37xx_cpufreq_avs_configure(avs_base, dvfs); + armada37xx_cpufreq_avs_setup(avs_base, dvfs); + armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider); clk_put(clk); for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR; load_lvl++) { + unsigned long u_volt = avs_map[dvfs->avs[load_lvl]] * 1000; freq = cur_frequency / dvfs->divider[load_lvl]; - - ret = dev_pm_opp_add(cpu_dev, freq, 0); + ret = dev_pm_opp_add(cpu_dev, freq, u_volt); if (ret) goto remove_opp; + + } /* Now that everything is setup, enable the DVFS at hardware level */ From 33477d84c26bbfa626da2c032e567a90dd70a528 Mon Sep 17 00:00:00 2001 From: George Cherian Date: Wed, 11 Jul 2018 23:07:55 -0700 Subject: [PATCH 05/12] cpufreq / CPPC: Add cpuinfo_cur_freq support for CPPC Per Section 8.4.7.1.3 of ACPI 6.2, the platform provides performance feedback via set of performance counters. To determine the actual performance level delivered over time, OSPM may read a set of performance counters from the Reference Performance Counter Register and the Delivered Performance Counter Register. OSPM calculates the delivered performance over a given time period by taking a beginning and ending snapshot of both the reference and delivered performance counters, and calculating: delivered_perf = reference_perf X (delta of delivered_perf counter / delta of reference_perf counter). Implement the above and hook this up to the cpufreq->get method. 
Signed-off-by: George Cherian Acked-by: Viresh Kumar Acked-by: Prashanth Prakash Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index a9d3eec32795..30f302149730 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -296,10 +296,62 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) return ret; } +static inline u64 get_delta(u64 t1, u64 t0) +{ + if (t1 > t0 || t0 > ~(u32)0) + return t1 - t0; + + return (u32)t1 - (u32)t0; +} + +static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu, + struct cppc_perf_fb_ctrs fb_ctrs_t0, + struct cppc_perf_fb_ctrs fb_ctrs_t1) +{ + u64 delta_reference, delta_delivered; + u64 reference_perf, delivered_perf; + + reference_perf = fb_ctrs_t0.reference_perf; + + delta_reference = get_delta(fb_ctrs_t1.reference, + fb_ctrs_t0.reference); + delta_delivered = get_delta(fb_ctrs_t1.delivered, + fb_ctrs_t0.delivered); + + /* Check to avoid divide-by zero */ + if (delta_reference || delta_delivered) + delivered_perf = (reference_perf * delta_delivered) / + delta_reference; + else + delivered_perf = cpu->perf_ctrls.desired_perf; + + return cppc_cpufreq_perf_to_khz(cpu, delivered_perf); +} + +static unsigned int cppc_cpufreq_get_rate(unsigned int cpunum) +{ + struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0}; + struct cppc_cpudata *cpu = all_cpu_data[cpunum]; + int ret; + + ret = cppc_get_perf_ctrs(cpunum, &fb_ctrs_t0); + if (ret) + return ret; + + udelay(2); /* 2usec delay between sampling */ + + ret = cppc_get_perf_ctrs(cpunum, &fb_ctrs_t1); + if (ret) + return ret; + + return cppc_get_rate_from_fbctrs(cpu, fb_ctrs_t0, fb_ctrs_t1); +} + static struct cpufreq_driver cppc_cpufreq_driver = { .flags = CPUFREQ_CONST_LOOPS, .verify = cppc_verify_policy, .target = cppc_cpufreq_set_target, + .get = cppc_cpufreq_get_rate, .init 
= cppc_cpufreq_cpu_init, .stop_cpu = cppc_cpufreq_stop_cpu, .name = "cppc_cpufreq", From f54ab690ad68e4c5a5b4d4b5dbb28a35018546c5 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 17 Jul 2018 22:48:21 +0200 Subject: [PATCH 06/12] cpufreq: qcom-kryo: Silently error out on EPROBE_DEFER If of_nvmem_cell_get() fails due to probe deferral, we shouldn't print an error message. Just be silent in this case. Signed-off-by: Niklas Cassel Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/qcom-cpufreq-kryo.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c index 29389accf3e9..b8d1e6875f16 100644 --- a/drivers/cpufreq/qcom-cpufreq-kryo.c +++ b/drivers/cpufreq/qcom-cpufreq-kryo.c @@ -109,8 +109,9 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev) speedbin_nvmem = of_nvmem_cell_get(np, NULL); of_node_put(np); if (IS_ERR(speedbin_nvmem)) { - dev_err(cpu_dev, "Could not get nvmem cell: %ld\n", - PTR_ERR(speedbin_nvmem)); + if (PTR_ERR(speedbin_nvmem) != -EPROBE_DEFER) + dev_err(cpu_dev, "Could not get nvmem cell: %ld\n", + PTR_ERR(speedbin_nvmem)); return PTR_ERR(speedbin_nvmem); } From 4d81b0f9e631f751bf231213893e202a51f76687 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 18 Jul 2018 10:15:07 +0200 Subject: [PATCH 07/12] cpufreq: pcc-cpufreq: Disable dynamic scaling on many-CPU systems The firmware interface used by the pcc-cpufreq driver is fundamentally not scalable and using it for dynamic CPU performance scaling on systems with many CPUs leads to degraded performance. For this reason, disable dynamic CPU performance scaling on systems with pcc-cpufreq where the number of CPUs present at the driver init time is greater than 4. Also make the driver print corresponding complaints to the kernel log. Reported-by: Andreas Herrmann Tested-by: Andreas Herrmann Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/pcc-cpufreq.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c index 3f0ce2ae35ee..ef8d984178a5 100644 --- a/drivers/cpufreq/pcc-cpufreq.c +++ b/drivers/cpufreq/pcc-cpufreq.c @@ -589,6 +589,15 @@ static int __init pcc_cpufreq_init(void) return ret; } + if (num_present_cpus() > 4) { + pcc_cpufreq_driver.flags |= CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING; + pr_err("%s: Too many CPUs, dynamic performance scaling disabled\n", + __func__); + pr_err("%s: Try to enable another scaling driver through BIOS settings\n", + __func__); + pr_err("%s: and complain to the system vendor\n", __func__); + } + ret = cpufreq_register_driver(&pcc_cpufreq_driver); return ret; From eea033d07543a177fc2ab35a6d633b2aa9684b0f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 18 Jul 2018 14:51:59 -0700 Subject: [PATCH 08/12] cpufreq: intel_pstate: Show different max frequency with turbo 3 and HWP On HWP platforms with Turbo 3.0, the HWP capability max ratio shows the maximum ratio of that core, which can be different than other cores. If we show the correct maximum frequency in cpufreq sysfs via cpuinfo_max_freq and scaling_max_freq, then the user can know which cores can run faster for pinning some high priority tasks. Currently the max turbo frequency is shown as max frequency, which is the max of all cores, even if some cores can't reach that frequency even for single threaded workload. But it is possible that max ratio in HWP capabilities is set as 0xFF or some high invalid value (e.g. one KBL NUC). Since the actual performance can never exceed 1 core turbo frequency from MSR TURBO_RATIO_LIMIT, we use this as a bound check. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/intel_pstate.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index a5c368425e36..2584dd00e3c9 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2069,6 +2069,15 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; policy->cpuinfo.max_freq *= cpu->pstate.scaling; + if (hwp_active) { + unsigned int max_freq; + + max_freq = global.turbo_disabled ? + cpu->pstate.max_freq : cpu->pstate.turbo_freq; + if (max_freq < policy->cpuinfo.max_freq) + policy->cpuinfo.max_freq = max_freq; + } + intel_pstate_init_acpi_perf_limits(policy); policy->fast_switch_possible = true; From 601b218568a107370086dc5c7a1b283f8d463268 Mon Sep 17 00:00:00 2001 From: Ruchi Kandoi Date: Tue, 24 Jul 2018 10:35:44 -0700 Subject: [PATCH 09/12] cpufreq: trace frequency limits change systrace used for tracing for Android systems has carried a patch for many years in the Android tree that traces when the cpufreq limits change. With the help of this information, systrace can know when the policy limits change and can visually display the data. Let's add upstream support for the same. Signed-off-by: Ruchi Kandoi Signed-off-by: Joel Fernandes (Google) Acked-by: Viresh Kumar Acked-by: Steven Rostedt (VMware) Signed-off-by: Rafael J. Wysocki --- Documentation/trace/events-power.rst | 1 + drivers/cpufreq/cpufreq.c | 1 + include/trace/events/power.h | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/Documentation/trace/events-power.rst b/Documentation/trace/events-power.rst index a77daca75e30..2ef318962e29 100644 --- a/Documentation/trace/events-power.rst +++ b/Documentation/trace/events-power.rst @@ -27,6 +27,7 @@ cpufreq.
cpu_idle "state=%lu cpu_id=%lu" cpu_frequency "state=%lu cpu_id=%lu" + cpu_frequency_limits "min=%lu max=%lu cpu_id=%lu" A suspend event is used to indicate the system going in and out of the suspend mode: diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b0dfd3222013..52566f1f1050 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2236,6 +2236,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, policy->min = new_policy->min; policy->max = new_policy->max; + trace_cpu_frequency_limits(policy); policy->cached_target_freq = UINT_MAX; diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 908977d69783..f7aece721aed 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_POWER_H +#include #include #include #include @@ -148,6 +149,30 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); +TRACE_EVENT(cpu_frequency_limits, + + TP_PROTO(struct cpufreq_policy *policy), + + TP_ARGS(policy), + + TP_STRUCT__entry( + __field(u32, min_freq) + __field(u32, max_freq) + __field(u32, cpu_id) + ), + + TP_fast_assign( + __entry->min_freq = policy->min; + __entry->max_freq = policy->max; + __entry->cpu_id = policy->cpu; + ), + + TP_printk("min=%lu max=%lu cpu_id=%lu", + (unsigned long)__entry->min_freq, + (unsigned long)__entry->max_freq, + (unsigned long)__entry->cpu_id) +); + TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), From 6f4ceee9305dc3fe74099159b460f4b56b506f1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 24 Jul 2018 14:26:04 -0400 Subject: [PATCH 10/12] cpu/hotplug: Add a cpus_read_trylock() function There are use cases where it can be useful to have a cpus_read_trylock() function to work around circular lock dependency problem involving the cpu_hotplug_lock. 
Signed-off-by: Waiman Long Signed-off-by: Rafael J. Wysocki --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a97a63eef59f..e850bfea3e84 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -103,6 +103,7 @@ extern void cpus_write_lock(void); extern void cpus_write_unlock(void); extern void cpus_read_lock(void); extern void cpus_read_unlock(void); +extern int cpus_read_trylock(void); extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); @@ -115,6 +116,7 @@ static inline void cpus_write_lock(void) { } static inline void cpus_write_unlock(void) { } static inline void cpus_read_lock(void) { } static inline void cpus_read_unlock(void) { } +static inline int cpus_read_trylock(void) { return true; } static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..307486baa477 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -290,6 +290,12 @@ void cpus_read_lock(void) } EXPORT_SYMBOL_GPL(cpus_read_lock); +int cpus_read_trylock(void) +{ + return percpu_down_read_trylock(&cpu_hotplug_lock); +} +EXPORT_SYMBOL_GPL(cpus_read_trylock); + void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); From 9b3d9bb3e4deef41095e513c2ffbebab20f9a982 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 24 Jul 2018 14:26:05 -0400 Subject: [PATCH 11/12] cpufreq: Fix a circular lock dependency problem With lockdep turned on, the following circular lock dependency problem was reported: [ 57.470040] ====================================================== [ 57.502900] WARNING: possible circular locking dependency detected [ 57.535208] 4.18.0-0.rc3.1.el8+7.x86_64+debug #1 Tainted: G [ 57.577761] ------------------------------------------------------ [ 57.609714] tuned/1505 
is trying to acquire lock: [ 57.633808] 00000000559deec5 (cpu_hotplug_lock.rw_sem){++++}, at: store+0x27/0x120 [ 57.672880] [ 57.672880] but task is already holding lock: [ 57.702184] 000000002136ca64 (kn->count#118){++++}, at: kernfs_fop_write+0x1d0/0x410 [ 57.742176] [ 57.742176] which lock already depends on the new lock. [ 57.742176] [ 57.785220] [ 57.785220] the existing dependency chain (in reverse order) is: : [ 58.932512] other info that might help us debug this: [ 58.932512] [ 58.973344] Chain exists of: [ 58.973344] cpu_hotplug_lock.rw_sem --> subsys mutex#5 --> kn->count#118 [ 58.973344] [ 59.030795] Possible unsafe locking scenario: [ 59.030795] [ 59.061248] CPU0 CPU1 [ 59.085377] ---- ---- [ 59.108160] lock(kn->count#118); [ 59.124935] lock(subsys mutex#5); [ 59.156330] lock(kn->count#118); [ 59.186088] lock(cpu_hotplug_lock.rw_sem); [ 59.208541] [ 59.208541] *** DEADLOCK *** In the cpufreq_register_driver() function, the lock sequence is: cpus_read_lock --> kn->count For the cpufreq sysfs store method, the lock sequence is: kn->count --> cpus_read_lock These sequences are actually safe as they are taking a shared lock on cpu_hotplug_lock. However, the current lockdep code doesn't check for shared locking when detecting a circular lock dependency. Fixing that could be a substantial effort. Instead, we can work around this problem by using cpus_read_trylock() in the store method which is much simpler. The chance of not getting the read lock is very small. If that happens, the userspace application that writes the sysfs file will get an error. Signed-off-by: Waiman Long Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/cpufreq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 52566f1f1050..f53fb41efb7b 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -923,7 +923,12 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct freq_attr *fattr = to_attr(attr); ssize_t ret = -EINVAL; - cpus_read_lock(); + /* + * cpus_read_trylock() is used here to work around a circular lock + * dependency problem with respect to the cpufreq_register_driver(). + */ + if (!cpus_read_trylock()) + return -EBUSY; if (cpu_online(policy->cpu)) { down_write(&policy->rwsem); From d3264f752a1aedac98aa90e50853df149d1346f2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 1 Aug 2018 17:26:06 -0700 Subject: [PATCH 12/12] cpufreq: intel_pstate: Ignore turbo active ratio in HWP When HWP is active turbo active ratio is not used, so we should allow policy max frequency above turbo activation ratio to be set. When HWP is not active, then any policy max frequency above turbo activation ratio can result in up to the max one-core turbo frequency. This fix helps better thermal control in turbo region when other methods like "Running Average Power Limit" are not available to use. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/intel_pstate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 7bfb98380db9..b6a1aadaff9f 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2008,7 +2008,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) static void intel_pstate_adjust_policy_max(struct cpufreq_policy *policy, struct cpudata *cpu) { - if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && + if (!hwp_active && + cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate && policy->max < policy->cpuinfo.max_freq && policy->max > cpu->pstate.max_freq) { pr_debug("policy->max > max non turbo frequency\n");