Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - refcount conversions

   - Solve the rq->leaf_cfs_rq_list can of worms for real.

   - improve power-aware scheduling

   - add sysctl knob for Energy Aware Scheduling

   - documentation updates

   - misc other changes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  kthread: Do not use TIMER_IRQSAFE
  kthread: Convert worker lock to raw spinlock
  sched/fair: Use non-atomic cpumask_{set,clear}_cpu()
  sched/fair: Remove unused 'sd' parameter from select_idle_smt()
  sched/wait: Use freezable_schedule() when possible
  sched/fair: Prune, fix and simplify the nohz_balancer_kick() comment block
  sched/fair: Explain LLC nohz kick condition
  sched/fair: Simplify nohz_balancer_kick()
  sched/topology: Fix percpu data types in struct sd_data & struct s_data
  sched/fair: Simplify post_init_entity_util_avg() by calling it with a task_struct pointer argument
  sched/fair: Fix O(nr_cgroups) in the load balancing path
  sched/fair: Optimize update_blocked_averages()
  sched/fair: Fix insertion in rq->leaf_cfs_rq_list
  sched/fair: Add tmp_alone_branch assertion
  sched/core: Use READ_ONCE()/WRITE_ONCE() in move_queued_task()/task_rq_lock()
  sched/debug: Initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK
  sched/pelt: Skip updating util_est when utilization is higher than CPU's capacity
  sched/fair: Update scale invariance of PELT
  sched/fair: Move the rq_of() helper function
  sched/core: Convert task_struct.stack_refcount to refcount_t
  ...
commit 45802da05e

Documentation/power/energy-model.txt | 144 (new file)

@@ -0,0 +1,144 @@
====================
Energy Model of CPUs
====================

1. Overview
-----------

The Energy Model (EM) framework serves as an interface between drivers knowing
the power consumed by CPUs at various performance levels, and the kernel
subsystems willing to use that information to make energy-aware decisions.

The source of the information about the power consumed by CPUs can vary greatly
from one platform to another. These power costs can be estimated using
devicetree data in some cases. In others, the firmware will know better.
Alternatively, userspace might be best positioned. And so on. To avoid having
each and every client subsystem re-implement support for each and every
possible source of information on its own, the EM framework intervenes as an
abstraction layer which standardizes the format of power cost tables in the
kernel, hence avoiding redundant work.

The figure below depicts an example of drivers (Arm-specific here, but the
approach is applicable to any architecture) providing power costs to the EM
framework, and interested clients reading the data from it.

       +---------------+  +-----------------+  +---------------+
       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
       +---------------+  +-----------------+  +---------------+
               |                   | em_pd_energy()    |
               |                   | em_cpu_get()      |
               +---------+         |         +---------+
                         |         |         |
                         v         v         v
                        +---------------------+
                        |    Energy Model     |
                        |      Framework      |
                        +---------------------+
                           ^       ^       ^
                           |       |       | em_register_perf_domain()
                +----------+       |       +---------+
                |                  |                 |
        +---------------+  +---------------+  +--------------+
        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
        +---------------+  +---------------+  +--------------+
                ^                  ^                 ^
                |                  |                 |
        +--------------+   +---------------+  +--------------+
        | Device Tree  |   |   Firmware    |  |      ?       |
        +--------------+   +---------------+  +--------------+

The EM framework manages power cost tables per 'performance domain' in the
system. A performance domain is a group of CPUs whose performance is scaled
together. Performance domains generally have a 1-to-1 mapping with CPUFreq
policies. All CPUs in a performance domain are required to have the same
micro-architecture. CPUs in different performance domains can have different
micro-architectures.


2. Core APIs
------------

2.1 Config options

CONFIG_ENERGY_MODEL must be enabled to use the EM framework.


2.2 Registration of performance domains

Drivers are expected to register performance domains into the EM framework by
calling the following API:

  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
                              struct em_data_callback *cb);

Drivers must specify the CPUs of the performance domains using the cpumask
argument, and provide a callback function returning <frequency, power> tuples
for each capacity state. The callback function provided by the driver is free
to fetch data from any relevant location (DT, firmware, ...), and by any means
deemed necessary. See Section 3. for an example of a driver implementing this
callback, and kernel/power/energy_model.c for further documentation on this
API.


2.3 Accessing performance domains

Subsystems interested in the energy model of a CPU can retrieve it using the
em_cpu_get() API. The energy model tables are allocated once upon creation of
the performance domains, and kept in memory untouched.

The energy consumed by a performance domain can be estimated using the
em_pd_energy() API. The estimation is performed assuming that the schedutil
CPUfreq governor is in use.

More details about the above APIs can be found in include/linux/energy_model.h.
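
As a minimal, illustrative sketch of how a client subsystem could query the
model (this assumes the em_cpu_get()/em_pd_energy() prototypes of
include/linux/energy_model.h around the time of this merge; the helper name
and the way the utilization arguments are obtained are purely hypothetical):

  #include <linux/energy_model.h>

  static unsigned long estimate_cpu_energy(int cpu, unsigned long max_util,
                                           unsigned long sum_util)
  {
          struct em_perf_domain *pd;

          /* Look up the performance domain containing @cpu (NULL if none). */
          pd = em_cpu_get(cpu);
          if (!pd)
                  return 0;

          /*
           * Estimate the energy of the whole domain for the given
           * utilization landscape, assuming schedutil-like OPP selection.
           */
          return em_pd_energy(pd, max_util, sum_util);
  }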

3. Example driver
-----------------

This section provides a simple example of a CPUFreq driver registering a
performance domain in the Energy Model framework using the (fake) 'foo'
protocol. The driver implements an est_power() function to be provided to the
EM framework.

-> drivers/cpufreq/foo_cpufreq.c

  static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
  {
          long freq, power;

          /* Use the 'foo' protocol to ceil the frequency */
          freq = foo_get_freq_ceil(cpu, *KHz);
          if (freq < 0)
                  return freq;

          /* Estimate the power cost for the CPU at the relevant freq. */
          power = foo_estimate_power(cpu, freq);
          if (power < 0)
                  return power;

          /* Return the values to the EM framework */
          *mW = power;
          *KHz = freq;

          return 0;
  }

  static int foo_cpufreq_init(struct cpufreq_policy *policy)
  {
          struct em_data_callback em_cb = EM_DATA_CB(est_power);
          int nr_opp, ret;

          /* Do the actual CPUFreq init work ... */
          ret = do_foo_cpufreq_init(policy);
          if (ret)
                  return ret;

          /* Find the number of OPPs for this policy */
          nr_opp = foo_get_nr_opp(policy);

          /* And register the new performance domain */
          em_register_perf_domain(policy->cpus, nr_opp, &em_cb);

          return 0;
  }
Documentation/scheduler/sched-energy.txt | 425 (new file)

@@ -0,0 +1,425 @@
=======================
Energy Aware Scheduling
=======================

1. Introduction
---------------

Energy Aware Scheduling (or EAS) gives the scheduler the ability to predict
the impact of its decisions on the energy consumed by CPUs. EAS relies on an
Energy Model (EM) of the CPUs to select an energy efficient CPU for each task,
with a minimal impact on throughput. This document aims at providing an
introduction to how EAS works, what the main design decisions behind it are,
and what is needed to get it to run.

Before going any further, please note that at the time of writing:

   /!\ EAS does not support platforms with symmetric CPU topologies /!\

EAS operates only on heterogeneous CPU topologies (such as Arm big.LITTLE)
because this is where the potential for saving energy through scheduling is
the highest.

The actual EM used by EAS is _not_ maintained by the scheduler, but by a
dedicated framework. For details about this framework and what it provides,
please refer to its documentation (see Documentation/power/energy-model.txt).


2. Background and Terminology
-----------------------------

To make it clear from the start:
 - energy = [joule] (resource like a battery on powered devices)
 - power = energy/time = [joule/second] = [watt]

The goal of EAS is to minimize energy, while still getting the job done. That
is, we want to maximize:

        performance [inst/s]
        --------------------
            power [W]

which is equivalent to minimizing:

          energy [J]
        -----------
        instruction

while still getting 'good' performance. It is essentially an alternative
optimization objective to the current performance-only objective for the
scheduler. This alternative considers two objectives: energy-efficiency and
performance.

The idea behind introducing an EM is to allow the scheduler to evaluate the
implications of its decisions rather than blindly applying energy-saving
techniques that may have positive effects only on some platforms. At the same
time, the EM must be as simple as possible to minimize the scheduler latency
impact.

In short, EAS changes the way CFS tasks are assigned to CPUs. When it is time
for the scheduler to decide where a task should run (during wake-up), the EM
is used to break the tie between several good CPU candidates and pick the one
that is predicted to yield the best energy consumption without harming the
system's throughput. The predictions made by EAS rely on specific elements of
knowledge about the platform's topology, which include the 'capacity' of CPUs,
and their respective energy costs.


3. Topology information
-----------------------

EAS (as well as the rest of the scheduler) uses the notion of 'capacity' to
differentiate CPUs with different computing throughput. The 'capacity' of a CPU
represents the amount of work it can absorb when running at its highest
frequency compared to the most capable CPU of the system. Capacity values are
normalized in a 1024 range, and are comparable with the utilization signals of
tasks and CPUs computed by the Per-Entity Load Tracking (PELT) mechanism. Thanks
to capacity and utilization values, EAS is able to estimate how big/busy a
task/CPU is, and to take this into consideration when evaluating performance vs
energy trade-offs. The capacity of CPUs is provided via arch-specific code
through the arch_scale_cpu_capacity() callback.

The rest of platform knowledge used by EAS is directly read from the Energy
Model (EM) framework. The EM of a platform is composed of a power cost table
per 'performance domain' in the system (see Documentation/power/energy-model.txt
for further details about performance domains).

The scheduler manages references to the EM objects in the topology code when the
scheduling domains are built, or re-built. For each root domain (rd), the
scheduler maintains a singly linked list of all performance domains intersecting
the current rd->span. Each node in the list contains a pointer to a struct
em_perf_domain as provided by the EM framework.

The lists are attached to the root domains in order to cope with exclusive
cpuset configurations. Since the boundaries of exclusive cpusets do not
necessarily match those of performance domains, the lists of different root
domains can contain duplicate elements.

Example 1.
    Let us consider a platform with 12 CPUs, split in 3 performance domains
    (pd0, pd4 and pd8), organized as follows:

              CPUs:   0 1 2 3 4 5 6 7 8 9 10 11
              PDs:    |--pd0--|--pd4--|---pd8---|
              RDs:    |----rd1----|-----rd2-----|

    Now, consider that userspace decided to split the system with two
    exclusive cpusets, hence creating two independent root domains, each
    containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the
    above figure. Since pd4 intersects with both rd1 and rd2, it will be
    present in the linked list '->pd' attached to each of them:
       * rd1->pd: pd0 -> pd4
       * rd2->pd: pd4 -> pd8

    Please note that the scheduler will create two duplicate list nodes for
    pd4 (one for each list). However, both just hold a pointer to the same
    shared data structure of the EM framework.

Since the access to these lists can happen concurrently with hotplug and other
things, they are protected by RCU, like the rest of topology structures
manipulated by the scheduler.

EAS also maintains a static key (sched_energy_present) which is enabled when at
least one root domain meets all conditions for EAS to start. Those conditions
are summarized in Section 6.
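
For illustration, a minimal sketch of how such a per-root-domain list can be
walked under RCU is shown below. It assumes the struct perf_domain layout used
by the scheduler around this release (an em_pd pointer plus a next pointer);
the helper count_perf_domains() is hypothetical and only meant to show the
traversal pattern:

  #include <linux/rcupdate.h>

  /* Assumed layout, as in kernel/sched/sched.h around this release. */
  struct perf_domain {
          struct em_perf_domain *em_pd;
          struct perf_domain *next;
          struct rcu_head rcu;
  };

  /* Hypothetical helper: count the performance domains of a root domain. */
  static int count_perf_domains(struct root_domain *rd)
  {
          struct perf_domain *pd;
          int nr = 0;

          rcu_read_lock();
          for (pd = rcu_dereference(rd->pd); pd; pd = pd->next)
                  nr++;
          rcu_read_unlock();

          return nr;
  }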


4. Energy-Aware task placement
------------------------------

EAS overrides the CFS task wake-up balancing code. It uses the EM of the
platform and the PELT signals to choose an energy-efficient target CPU during
wake-up balance. When EAS is enabled, select_task_rq_fair() calls
find_energy_efficient_cpu() to make the placement decision. This function looks
for the CPU with the highest spare capacity (CPU capacity - CPU utilization) in
each performance domain, since it is the one which will allow us to keep the
frequency the lowest. Then, the function checks if placing the task there could
save energy compared to leaving it on prev_cpu, i.e. the CPU where the task ran
during its previous activation.

find_energy_efficient_cpu() uses compute_energy() to estimate the energy that
will be consumed by the system if the waking task is migrated. compute_energy()
looks at the current utilization landscape of the CPUs and adjusts it to
'simulate' the task migration. The EM framework provides the em_pd_energy() API
which computes the expected energy consumption of each performance domain for
the given utilization landscape.
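
The per-CPU figures used in the cases of Example 2 below follow a simple
proportional model: each CPU contributes roughly util / capacity_at_OPP *
power_at_OPP. A minimal sketch of that arithmetic (illustrative only, not the
actual compute_energy()/em_pd_energy() implementation; the function name and
arguments are made up for this document) could be:

  /*
   * Illustrative only: estimate the energy of one performance domain by
   * scaling the OPP's power cost by each CPU's utilization, as done in the
   * worked cases of Example 2 (e.g. 200 / 341 * 150 = 88 for CPU0 in Case 1).
   */
  static unsigned long pd_energy_estimate(const unsigned long *util, int nr_cpus,
                                          unsigned long opp_cap,
                                          unsigned long opp_power)
  {
          unsigned long energy = 0;
          int i;

          for (i = 0; i < nr_cpus; i++)
                  energy += util[i] * opp_power / opp_cap;

          return energy;
  }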

An example of energy-optimized task placement decision is detailed below.

Example 2.
    Let us consider a (fake) platform with 2 independent performance domains
    composed of two CPUs each. CPU0 and CPU1 are little CPUs; CPU2 and CPU3
    are big.

    The scheduler must decide where to place a task P whose util_avg = 200
    and prev_cpu = 0.

    The current utilization landscape of the CPUs is depicted on the graph
    below. CPUs 0-3 have a util_avg of 400, 100, 600 and 500 respectively.
    Each performance domain has three Operating Performance Points (OPPs).
    The CPU capacity and power cost associated with each OPP are listed in
    the Energy Model table. The util_avg of P is shown on the figures
    below as 'PP'.

    CPU util.
    1024                 - - - - - - -               Energy Model
                                                     +-----------+-------------+
                                                     |  Little   |     Big     |
     768                 =============               +-----+-----+------+------+
                                                     | Cap | Pwr | Cap  | Pwr  |
                                                     +-----+-----+------+------+
     512       ===========          - ##- - - - -    | 170 | 50  | 512  | 400  |
                                      ##     ##      | 341 | 150 | 768  | 800  |
     341  -PP - - - -                 ##     ##      | 512 | 300 | 1024 | 1700 |
           PP                         ##     ##      +-----+-----+------+------+
     170  -## - - - -                 ##     ##
           ##     ##                  ##     ##
          ------------               -------------
           CPU0  CPU1                 CPU2  CPU3

    Current OPP: =====      Other OPP: - - -      util_avg (100 each): ##


    find_energy_efficient_cpu() will first look for the CPUs with the
    maximum spare capacity in the two performance domains. In this example,
    CPU1 and CPU3. Then it will estimate the energy of the system if P was
    placed on either of them, and check if that would save some energy
    compared to leaving P on CPU0. EAS assumes that OPPs follow utilization
    (which is coherent with the behaviour of the schedutil CPUFreq
    governor, see Section 6. for more details on this topic).

    Case 1. P is migrated to CPU1
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    1024                 - - - - - - -

                                                     Energy calculation:
     768                 =============                * CPU0: 200 / 341 * 150 = 88
                                                       * CPU1: 300 / 341 * 150 = 131
                                                       * CPU2: 600 / 768 * 800 = 625
     512  - - - - - -               - ##- - - - -      * CPU3: 500 / 768 * 800 = 520
                                      ##     ##           => total_energy = 1364
     341  ===========                 ##     ##
           PP                         ##     ##
     170  -## - - PP-                 ##     ##
           ##     ##                  ##     ##
          ------------               -------------
           CPU0  CPU1                 CPU2  CPU3


    Case 2. P is migrated to CPU3
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    1024                 - - - - - - -

                                                     Energy calculation:
     768                 =============                * CPU0: 200 / 341 * 150 = 88
                                                       * CPU1: 100 / 341 * 150 = 43
                                             PP        * CPU2: 600 / 768 * 800 = 625
     512  - - - - - -               - ##- - -PP -      * CPU3: 700 / 768 * 800 = 729
                                      ##     ##           => total_energy = 1485
     341  ===========                 ##     ##
                                      ##     ##
     170  -## - - - -                 ##     ##
           ##     ##                  ##     ##
          ------------               -------------
           CPU0  CPU1                 CPU2  CPU3


    Case 3. P stays on prev_cpu / CPU 0
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    1024                 - - - - - - -

                                                     Energy calculation:
     768                 =============                * CPU0: 400 / 512 * 300 = 234
                                                       * CPU1: 100 / 512 * 300 = 58
                                                       * CPU2: 600 / 768 * 800 = 625
     512  ===========               - ##- - - - -      * CPU3: 500 / 768 * 800 = 520
                                      ##     ##           => total_energy = 1437
     341  -PP - - - -                 ##     ##
           PP                         ##     ##
     170  -## - - - -                 ##     ##
           ##     ##                  ##     ##
          ------------               -------------
           CPU0  CPU1                 CPU2  CPU3


    From these calculations, Case 1 has the lowest total energy. So CPU1
    is the best candidate from an energy-efficiency standpoint.

Big CPUs are generally more power hungry than the little ones and are thus used
mainly when a task doesn't fit the littles. However, little CPUs aren't always
necessarily more energy-efficient than big CPUs. For some systems, the high OPPs
of the little CPUs can be less energy-efficient than the lowest OPPs of the
bigs, for example. So, if the little CPUs happen to have enough utilization at
a specific point in time, a small task waking up at that moment could be better
off executing on the big side in order to save energy, even though it would fit
on the little side.

And even in the case where all OPPs of the big CPUs are less energy-efficient
than those of the little, using the big CPUs for a small task might still, under
specific conditions, save energy. Indeed, placing a task on a little CPU can
result in raising the OPP of the entire performance domain, and that will
increase the cost of the tasks already running there. If the waking task is
placed on a big CPU, its own execution cost might be higher than if it was
running on a little, but it won't impact the other tasks of the little CPUs
which will keep running at a lower OPP. So, when considering the total energy
consumed by CPUs, the extra cost of running that one task on a big core can be
smaller than the cost of raising the OPP on the little CPUs for all the other
tasks.

The examples above would be nearly impossible to get right in a generic way, and
for all platforms, without knowing the cost of running at different OPPs on all
CPUs of the system. Thanks to its EM-based design, EAS should cope with them
correctly without too many troubles. However, in order to ensure a minimal
impact on throughput for high-utilization scenarios, EAS also implements another
mechanism called 'over-utilization'.


5. Over-utilization
-------------------

From a general standpoint, the use-cases where EAS can help the most are those
involving a light/medium CPU utilization. Whenever long CPU-bound tasks are
being run, they will require all of the available CPU capacity, and there isn't
much that can be done by the scheduler to save energy without severely harming
throughput. In order to avoid hurting performance with EAS, CPUs are flagged as
'over-utilized' as soon as they are used at more than 80% of their compute
capacity. As long as no CPUs are over-utilized in a root domain, load balancing
is disabled and EAS overrides the wake-up balancing code. EAS is likely to load
the most energy efficient CPUs of the system more than the others if that can be
done without harming throughput. So, the load-balancer is disabled to prevent
it from breaking the energy-efficient task placement found by EAS. It is safe to
do so when the system isn't overutilized since being below the 80% tipping point
implies that:

    a. there is some idle time on all CPUs, so the utilization signals used by
       EAS are likely to accurately represent the 'size' of the various tasks
       in the system;
    b. all tasks should already be provided with enough CPU capacity,
       regardless of their nice values;
    c. since there is spare capacity all tasks must be blocking/sleeping
       regularly and balancing at wake-up is sufficient.

As soon as one CPU goes above the 80% tipping point, at least one of the three
assumptions above becomes incorrect. In this scenario, the 'overutilized' flag
is raised for the entire root domain, EAS is disabled, and the load-balancer is
re-enabled. By doing so, the scheduler falls back onto load-based algorithms for
wake-up and load balance under CPU-bound conditions. This provides a better
respect of the nice values of tasks.
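
A rough sketch of how the 80% tipping-point check described above can be
expressed is given here. It mirrors the margin-based comparison used by the
scheduler, but the helper below is illustrative and not the exact kernel
implementation:

  /*
   * A CPU is considered over-utilized when its utilization exceeds ~80%
   * of its capacity. With integer arithmetic, util > 0.8 * capacity is
   * equivalent to util * 1280 > capacity * 1024 (since 1280/1024 = 1.25).
   */
  static bool cpu_overutilized_example(unsigned long util, unsigned long capacity)
  {
          return util * 1280 > capacity * 1024;
  }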

Since the notion of overutilization largely relies on detecting whether or not
there is some idle time in the system, the CPU capacity 'stolen' by higher
(than CFS) scheduling classes (as well as IRQ) must be taken into account. As
such, the detection of overutilization accounts for the capacity used not only
by CFS tasks, but also by the other scheduling classes and IRQ.


6. Dependencies and requirements for EAS
----------------------------------------

Energy Aware Scheduling depends on the CPUs of the system having specific
hardware properties and on other features of the kernel being enabled. This
section lists these dependencies and provides hints as to how they can be met.


6.1 - Asymmetric CPU topology

As mentioned in the introduction, EAS is only supported on platforms with
asymmetric CPU topologies for now. This requirement is checked at run-time by
looking for the presence of the SD_ASYM_CPUCAPACITY flag when the scheduling
domains are built.

The flag is set/cleared automatically by the scheduler topology code whenever
there are CPUs with different capacities in a root domain. The capacities of
CPUs are provided by arch-specific code through the arch_scale_cpu_capacity()
callback. As an example, arm and arm64 share an implementation of this callback
which uses a combination of CPUFreq data and device-tree bindings to compute the
capacity of CPUs (see drivers/base/arch_topology.c for more details).

So, in order to use EAS on your platform, your architecture must implement the
arch_scale_cpu_capacity() callback, and some of the CPUs must have a lower
capacity than others.
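
A minimal sketch of what such a capacity callback can boil down to is shown
below. The per-CPU variable, its values and the function name are purely
illustrative; the real arm/arm64 implementation lives in
drivers/base/arch_topology.c, and the exact callback prototype has varied
across kernel versions:

  #include <linux/percpu.h>
  #include <linux/sched/topology.h>

  /* Illustrative per-CPU capacity, normalized to SCHED_CAPACITY_SCALE (1024). */
  static DEFINE_PER_CPU(unsigned long, example_cpu_scale) = SCHED_CAPACITY_SCALE;

  /* Little CPUs would be assigned a value below 1024 at boot time. */
  static unsigned long example_scale_cpu_capacity(int cpu)
  {
          return per_cpu(example_cpu_scale, cpu);
  }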

Please note that EAS is not fundamentally incompatible with SMP, but no
significant savings on SMP platforms have been observed yet. This restriction
could be amended in the future if proven otherwise.


6.2 - Energy Model presence

EAS uses the EM of a platform to estimate the impact of scheduling decisions on
energy. So, your platform must provide power cost tables to the EM framework in
order to make EAS start. To do so, please refer to the documentation of the
independent EM framework in Documentation/power/energy-model.txt.

Please also note that the scheduling domains need to be re-built after the
EM has been registered in order to start EAS.


6.3 - Energy Model complexity

The task wake-up path is very latency-sensitive. When the EM of a platform is
too complex (too many CPUs, too many performance domains, too many performance
states, ...), the cost of using it in the wake-up path can become prohibitive.
The energy-aware wake-up algorithm has a complexity of:

        C = Nd * (Nc + Ns)

with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).

A complexity check is performed at the root domain level, when scheduling
domains are built. EAS will not start on a root domain if its C happens to be
higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
time of writing).
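
For instance, an 8-CPU big.LITTLE system with two performance domains and 4
OPPs each gives C = 2 * (8 + 8) = 32, far below the threshold. A minimal
sketch of such a check (illustrative only; the real logic lives in the
scheduler topology code and may differ in detail) could be:

  #define EXAMPLE_EM_MAX_COMPLEXITY     2048

  /*
   * Return true if the EAS wake-up complexity C = Nd * (Nc + Ns) stays
   * within the acceptable threshold for one root domain.
   */
  static bool em_complexity_ok(int nr_pd, int nr_cpus, int nr_opps)
  {
          return nr_pd * (nr_cpus + nr_opps) <= EXAMPLE_EM_MAX_COMPLEXITY;
  }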

If you really want to use EAS but the complexity of your platform's Energy
Model is too high to be used with a single root domain, you're left with only
two possible options:

    1. split your system into separate, smaller, root domains using exclusive
       cpusets and enable EAS locally on each of them. This option has the
       benefit of working out of the box but the drawback of preventing load
       balance between root domains, which can result in an unbalanced system
       overall;
    2. submit patches to reduce the complexity of the EAS wake-up algorithm,
       hence enabling it to cope with larger EMs in reasonable time.


6.4 - Schedutil governor

EAS tries to predict at which OPP the CPUs will be running in the near future
in order to estimate their energy consumption. To do so, it is assumed that OPPs
of CPUs follow their utilization.

Although it is very difficult to provide hard guarantees regarding the accuracy
of this assumption in practice (because the hardware might not do what it is
told to do, for example), schedutil, as opposed to other CPUFreq governors, at
least _requests_ frequencies calculated using the utilization signals.
Consequently, the only sane governor to use together with EAS is schedutil,
because it is the only one providing some degree of consistency between
frequency requests and energy predictions.

Using EAS with any other governor than schedutil is not supported.


6.5 Scale-invariant utilization signals

In order to make accurate prediction across CPUs and for all performance
states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can
be obtained using the architecture-defined arch_scale{cpu,freq}_capacity()
callbacks.

Using EAS on a platform that doesn't implement these two callbacks is not
supported.


6.6 Multithreading (SMT)

EAS in its current form is SMT unaware and is not able to leverage
multithreaded hardware to save energy. EAS considers threads as independent
CPUs, which can actually be counter-productive for both performance and energy.

EAS on SMT is not supported.
@@ -79,6 +79,7 @@ show up in /proc/sys/kernel:
- reboot-cmd                  [ SPARC only ]
- rtsig-max
- rtsig-nr
- sched_energy_aware
- seccomp/                    ==> Documentation/userspace-api/seccomp_filter.rst
- sem
- sem_next_id                 [ sysv ipc ]

@@ -890,6 +891,17 @@ rtsig-nr shows the number of RT signals currently queued.

==============================================================

sched_energy_aware:

Enables/disables Energy Aware Scheduling (EAS). EAS starts
automatically on platforms where it can run (that is,
platforms with asymmetric CPU topologies and having an Energy
Model available). If your platform happens to meet the
requirements for EAS but you do not want to use it, change
this value to 0.

==============================================================

sched_schedstats:

Enables/disables scheduler statistics. Enabling this feature

@@ -12280,14 +12280,6 @@ S:	Maintained
F:	drivers/net/ppp/pptp.c
W:	http://sourceforge.net/projects/accel-pptp

PREEMPTIBLE KERNEL
M:	Robert Love <rml@tech9.net>
L:	kpreempt-tech@lists.sourceforge.net
W:	https://www.kernel.org/pub/linux/kernel/people/rml/preempt-kernel
S:	Supported
F:	Documentation/preempt-locking.txt
F:	include/linux/preempt.h

PRINTK
M:	Petr Mladek <pmladek@suse.com>
M:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>

@@ -13525,6 +13517,7 @@ F:	kernel/sched/
F:	include/linux/sched.h
F:	include/uapi/linux/sched.h
F:	include/linux/wait.h
F:	include/linux/preempt.h

SCR24X CHIP CARD INTERFACE DRIVER
M:	Lubomir Rintel <lkundrak@v3.sk>

@@ -1189,7 +1189,7 @@ no_thread_group:
        flush_itimer_signals();
#endif

        if (atomic_read(&oldsighand->count) != 1) {
        if (refcount_read(&oldsighand->count) != 1) {
                struct sighand_struct *newsighand;
                /*
                 * This ->sighand is shared with the CLONE_SIGHAND
@@ -1199,7 +1199,7 @@ no_thread_group:
                if (!newsighand)
                        return -ENOMEM;

                atomic_set(&newsighand->count, 1);
                refcount_set(&newsighand->count, 1);
                memcpy(newsighand->action, oldsighand->action,
                       sizeof(newsighand->action));

@@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
        else
                bytes += kobjsize(current->files);

        if (current->sighand && atomic_read(&current->sighand->count) > 1)
        if (current->sighand && refcount_read(&current->sighand->count) > 1)
                sbytes += kobjsize(current->sighand);
        else
                bytes += kobjsize(current->sighand);

@@ -13,6 +13,7 @@
#include <linux/securebits.h>
#include <linux/seqlock.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/sched/autogroup.h>
#include <net/net_namespace.h>
#include <linux/sched/rt.h>

@@ -86,7 +86,7 @@ enum {

struct kthread_worker {
        unsigned int            flags;
        spinlock_t              lock;
        raw_spinlock_t          lock;
        struct list_head        work_list;
        struct list_head        delayed_work_list;
        struct task_struct      *task;
@@ -107,7 +107,7 @@ struct kthread_delayed_work {
};

#define KTHREAD_WORKER_INIT(worker)     {                               \
        .lock = __SPIN_LOCK_UNLOCKED((worker).lock),                    \
        .lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock),                \
        .work_list = LIST_HEAD_INIT((worker).work_list),                \
        .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\
        }
@@ -165,9 +165,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
#define kthread_init_delayed_work(dwork, fn)                            \
        do {                                                            \
                kthread_init_work(&(dwork)->work, (fn));                \
                __init_timer(&(dwork)->timer,                           \
                             kthread_delayed_work_timer_fn,             \
                             TIMER_IRQSAFE);                            \
                timer_setup(&(dwork)->timer,                            \
                            kthread_delayed_work_timer_fn, 0);          \
        } while (0)

int kthread_worker_fn(void *worker_ptr);

@@ -21,6 +21,7 @@
#include <linux/seccomp.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
@@ -356,12 +357,6 @@ struct util_est {
 * For cfs_rq, it is the aggregated load_avg of all runnable and
 * blocked sched_entities.
 *
 * load_avg may also take frequency scaling into account:
 *
 *   load_avg = runnable% * scale_load_down(load) * freq%
 *
 * where freq% is the CPU frequency normalized to the highest frequency.
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
@@ -370,17 +365,14 @@ struct util_est {
 * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
 * and blocked sched_entities.
 *
 * util_avg may also factor frequency scaling and CPU capacity scaling:
 * load_avg and util_avg don't direcly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that
 * is used for computing those signals (see update_rq_clock_pelt())
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
 *
 * where freq% is the same as above, and capacity% is the CPU capacity
 * normalized to the greatest capacity (due to uarch differences, etc).
 *
 * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
 * themselves are in the range of [0, 1]. To do fixed point arithmetics,
 * we therefore scale them to as large a range as necessary. This is for
 * example reflected by util_avg's SCHED_CAPACITY_SCALE.
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
@@ -607,7 +599,7 @@ struct task_struct {
        randomized_struct_fields_start

        void                            *stack;
        atomic_t                        usage;
        refcount_t                      usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                    flags;
        unsigned int                    ptrace;
@@ -1187,7 +1179,7 @@ struct task_struct {
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
        atomic_t                        stack_refcount;
        refcount_t                      stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
        int patch_state;
@@ -1403,7 +1395,6 @@ extern struct pid *cad_pid;
#define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
#define PF_MUTEX_TESTER		0x20000000	/* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK		0x80000000	/* This thread called freeze_processes() and should not be frozen */

@@ -1753,9 +1744,9 @@ static __always_inline bool need_resched(void)
static inline unsigned int task_cpu(const struct task_struct *p)
{
#ifdef CONFIG_THREAD_INFO_IN_TASK
        return p->cpu;
        return READ_ONCE(p->cpu);
#else
        return task_thread_info(p)->cpu;
        return READ_ONCE(task_thread_info(p)->cpu);
#endif
}

@@ -8,13 +8,14 @@
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

struct sighand_struct {
        atomic_t                count;
        refcount_t              count;
        struct k_sigaction      action[_NSIG];
        spinlock_t              siglock;
        wait_queue_head_t       signalfd_wqh;
@@ -82,7 +83,7 @@ struct multiprocess_signals {
 * the locking of signal_struct.
 */
struct signal_struct {
        atomic_t                sigcnt;
        refcount_t              sigcnt;
        atomic_t                live;
        int                     nr_threads;
        struct list_head        thread_head;

@@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos);

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
extern unsigned int sysctl_sched_energy_aware;
extern int sched_energy_aware_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp,
                                 loff_t *ppos);
#endif

#endif /* _LINUX_SCHED_SYSCTL_H */

@@ -88,13 +88,13 @@ extern void sched_exec(void);
#define sched_exec()   {}
#endif

#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0)

extern void __put_task_struct(struct task_struct *t);

static inline void put_task_struct(struct task_struct *t)
{
        if (atomic_dec_and_test(&t->usage))
        if (refcount_dec_and_test(&t->usage))
                __put_task_struct(t);
}

@@ -61,7 +61,7 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
#ifdef CONFIG_THREAD_INFO_IN_TASK
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        return atomic_inc_not_zero(&tsk->stack_refcount) ?
        return refcount_inc_not_zero(&tsk->stack_refcount) ?
                task_stack_page(tsk) : NULL;
}

@@ -176,10 +176,10 @@ typedef int (*sched_domain_flags_f)(void);
#define SDTL_OVERLAP	0x01

struct sd_data {
        struct sched_domain **__percpu sd;
        struct sched_domain_shared **__percpu sds;
        struct sched_group **__percpu sg;
        struct sched_group_capacity **__percpu sgc;
        struct sched_domain *__percpu *sd;
        struct sched_domain_shared *__percpu *sds;
        struct sched_group *__percpu *sg;
        struct sched_group_capacity *__percpu *sgc;
};

struct sched_domain_topology_level {

@@ -308,7 +308,7 @@ do {                                                                    \

#define __wait_event_freezable(wq_head, condition)                      \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,     \
                      schedule(); try_to_freeze())
                      freezable_schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
@@ -367,7 +367,7 @@ do {                                                                    \
#define __wait_event_freezable_timeout(wq_head, condition, timeout)     \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),         \
                      TASK_INTERRUPTIBLE, 0, timeout,                   \
                      __ret = schedule_timeout(__ret); try_to_freeze())
                      __ret = freezable_schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
@@ -588,7 +588,7 @@ do {                                                                    \

#define __wait_event_freezable_exclusive(wq, condition)                 \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,          \
                      schedule(); try_to_freeze())
                      freezable_schedule())

#define wait_event_freezable_exclusive(wq, condition)                   \
({                                                                      \

@@ -44,7 +44,7 @@ static struct signal_struct init_signals = {
};

static struct sighand_struct init_sighand = {
        .count          = ATOMIC_INIT(1),
        .count          = REFCOUNT_INIT(1),
        .action         = { { { .sa_handler = SIG_DFL, } }, },
        .siglock        = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
        .signalfd_wqh   = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
@@ -61,11 +61,11 @@ struct task_struct init_task
= {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        .thread_info    = INIT_THREAD_INFO(init_task),
        .stack_refcount = ATOMIC_INIT(1),
        .stack_refcount = REFCOUNT_INIT(1),
#endif
        .state          = 0,
        .stack          = init_stack,
        .usage          = ATOMIC_INIT(2),
        .usage          = REFCOUNT_INIT(2),
        .flags          = PF_KTHREAD,
        .prio           = MAX_PRIO - 20,
        .static_prio    = MAX_PRIO - 20,

@@ -429,7 +429,7 @@ static void release_task_stack(struct task_struct *tsk)
#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
        if (atomic_dec_and_test(&tsk->stack_refcount))
        if (refcount_dec_and_test(&tsk->stack_refcount))
                release_task_stack(tsk);
}
#endif
@@ -447,7 +447,7 @@ void free_task(struct task_struct *tsk)
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
@@ -710,14 +710,14 @@ static inline void free_signal_struct(struct signal_struct *sig)

static inline void put_signal_struct(struct signal_struct *sig)
{
        if (atomic_dec_and_test(&sig->sigcnt))
        if (refcount_dec_and_test(&sig->sigcnt))
                free_signal_struct(sig);
}

void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!tsk->exit_state);
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        cgroup_free(tsk);
@@ -867,7 +867,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        atomic_set(&tsk->stack_refcount, 1);
        refcount_set(&tsk->stack_refcount, 1);
#endif

        if (err)
@@ -896,7 +896,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         * One for us, one for whoever does the "release_task()" (usually
         * parent)
         */
        atomic_set(&tsk->usage, 2);
        refcount_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
@@ -1463,7 +1463,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
        struct sighand_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sighand->count);
                refcount_inc(&current->sighand->count);
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
@@ -1471,7 +1471,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
        if (!sig)
                return -ENOMEM;

        atomic_set(&sig->count, 1);
        refcount_set(&sig->count, 1);
        spin_lock_irq(&current->sighand->siglock);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        spin_unlock_irq(&current->sighand->siglock);
@@ -1480,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)

void __cleanup_sighand(struct sighand_struct *sighand)
{
        if (atomic_dec_and_test(&sighand->count)) {
        if (refcount_dec_and_test(&sighand->count)) {
                signalfd_cleanup(sighand);
                /*
                 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
@@ -1527,7 +1527,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)

        sig->nr_threads = 1;
        atomic_set(&sig->live, 1);
        atomic_set(&sig->sigcnt, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -2082,7 +2082,7 @@ static __latent_entropy struct task_struct *copy_process(
        } else {
                current->signal->nr_threads++;
                atomic_inc(&current->signal->live);
                atomic_inc(&current->signal->sigcnt);
                refcount_inc(&current->signal->sigcnt);
                task_join_group_stop(p);
                list_add_tail_rcu(&p->thread_group,
                                  &p->group_leader->thread_group);
@@ -2439,7 +2439,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
                return -EINVAL;
        }
        if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
                if (atomic_read(&current->sighand->count) > 1)
                if (refcount_read(&current->sighand->count) > 1)
                        return -EINVAL;
        }
        if (unshare_flags & CLONE_VM) {

@@ -605,7 +605,7 @@ void __kthread_init_worker(struct kthread_worker *worker,
                                struct lock_class_key *key)
{
        memset(worker, 0, sizeof(struct kthread_worker));
        spin_lock_init(&worker->lock);
        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);
        INIT_LIST_HEAD(&worker->work_list);
        INIT_LIST_HEAD(&worker->delayed_work_list);
@@ -647,21 +647,21 @@ repeat:

        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                spin_lock_irq(&worker->lock);
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                spin_unlock_irq(&worker->lock);
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        work = NULL;
        spin_lock_irq(&worker->lock);
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        spin_unlock_irq(&worker->lock);
        raw_spin_unlock_irq(&worker->lock);

        if (work) {
                __set_current_state(TASK_RUNNING);
@@ -818,12 +818,12 @@ bool kthread_queue_work(struct kthread_worker *worker,
        bool ret = false;
        unsigned long flags;

        spin_lock_irqsave(&worker->lock, flags);
        raw_spin_lock_irqsave(&worker->lock, flags);
        if (!queuing_blocked(worker, work)) {
                kthread_insert_work(worker, work, &worker->work_list);
                ret = true;
        }
        spin_unlock_irqrestore(&worker->lock, flags);
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);
@@ -841,6 +841,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
@@ -849,7 +850,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
        if (WARN_ON_ONCE(!worker))
                return;

        spin_lock(&worker->lock);
        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

@@ -858,7 +859,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
        list_del_init(&work->node);
        kthread_insert_work(worker, work, &worker->work_list);

        spin_unlock(&worker->lock);
        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

@@ -914,14 +915,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker,
        unsigned long flags;
        bool ret = false;

        spin_lock_irqsave(&worker->lock, flags);
        raw_spin_lock_irqsave(&worker->lock, flags);

        if (!queuing_blocked(worker, work)) {
                __kthread_queue_delayed_work(worker, dwork, delay);
                ret = true;
        }

        spin_unlock_irqrestore(&worker->lock, flags);
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@ -957,7 +958,7 @@ void kthread_flush_work(struct kthread_work *work)
        if (!worker)
                return;

        spin_lock_irq(&worker->lock);
        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

@@ -969,7 +970,7 @@ void kthread_flush_work(struct kthread_work *work)
        else
                noop = true;

        spin_unlock_irq(&worker->lock);
        raw_spin_unlock_irq(&worker->lock);

        if (!noop)
                wait_for_completion(&fwork.done);
@@ -1002,9 +1003,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
         * any queuing is blocked by setting the canceling counter.
         */
        work->canceling++;
        spin_unlock_irqrestore(&worker->lock, *flags);
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        del_timer_sync(&dwork->timer);
        spin_lock_irqsave(&worker->lock, *flags);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

@@ -1051,7 +1052,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
        unsigned long flags;
        int ret = false;

        spin_lock_irqsave(&worker->lock, flags);
        raw_spin_lock_irqsave(&worker->lock, flags);

        /* Do not bother with canceling when never queued. */
        if (!work->worker)
@@ -1068,7 +1069,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
out:
        spin_unlock_irqrestore(&worker->lock, flags);
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@ -1082,7 +1083,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
        if (!worker)
                goto out;

        spin_lock_irqsave(&worker->lock, flags);
        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

@@ -1096,13 +1097,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
         * In the meantime, block any queuing by setting the canceling counter.
         */
        work->canceling++;
        spin_unlock_irqrestore(&worker->lock, flags);
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        kthread_flush_work(work);
        spin_lock_irqsave(&worker->lock, flags);
        raw_spin_lock_irqsave(&worker->lock, flags);
        work->canceling--;

out_fast:
        spin_unlock_irqrestore(&worker->lock, flags);
        raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
        return ret;
}

@@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
         *					[L] ->on_rq
         *	RELEASE (rq->lock)
         *
         * If we observe the old CPU in task_rq_lock, the acquire of
         * If we observe the old CPU in task_rq_lock(), the acquire of
         * the old rq->lock will fully serialize against the stores.
         *
         * If we observe the new CPU in task_rq_lock, the acquire will
         * pair with the WMB to ensure we must then also see migrating.
         * If we observe the new CPU in task_rq_lock(), the address
         * dependency headed by '[L] rq = task_rq()' and the acquire
         * will pair with the WMB to ensure we then also see migrating.
         */
        if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                rq_pin_lock(rq, rf);
@@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
        if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                update_irq_load_avg(rq, irq_delta + steal);
#endif
        update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
@@ -956,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
        lockdep_assert_held(&rq->lock);

        p->on_rq = TASK_ON_RQ_MIGRATING;
        WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
        dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
        rq_unlock(rq, rf);
@@ -2459,7 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
        rq = __task_rq_lock(p, &rf);
        update_rq_clock(rq);
        post_init_entity_util_avg(&p->se);
        post_init_entity_util_avg(p);

        activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;

@@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        deadline_queue_push_tasks(rq);

        if (rq->curr->sched_class != &dl_sched_class)
                update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
                update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);

        return p;
}
@@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
        update_curr_dl(rq);

        update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
        if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
                enqueue_pushable_dl_task(rq, p);
}
@@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
        update_curr_dl(rq);

        update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
        /*
         * Even when we have runtime, update_curr_dl() might have resulted in us
         * not being the leftmost task anymore. In that case NEED_RESCHED will

@@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)
{
        static struct ctl_table *cpu_entries;
        static struct ctl_table **cpu_idx;
        static bool init_done = false;
        char buf[32];
        int i;

@@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)
        if (!cpumask_available(sd_sysctl_cpus)) {
                if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
                        return;
        }

        if (!init_done) {
                init_done = true;
                /* init to possible to not have holes in @cpu_entries */
                cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
        }

@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
        return cfs_rq->rq;
}

static inline struct task_struct *task_of(struct sched_entity *se)
{
        SCHED_WARN_ON(!entity_is_task(se));
@@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return grp->my_q;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
        if (!cfs_rq->on_list) {
                struct rq *rq = rq_of(cfs_rq);
                int cpu = cpu_of(rq);
                /*
                 * Ensure we either appear before our parent (if already
                 * enqueued) or force our parent to appear after us when it is
                 * enqueued. The fact that we always enqueue bottom-up
                 * reduces this to two cases and a special case for the root
                 * cfs_rq. Furthermore, it also means that we will always reset
                 * tmp_alone_branch either when the branch is connected
                 * to a tree or when we reach the beg of the tree
                 */
                if (cfs_rq->tg->parent &&
                    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
                        /*
                         * If parent is already on the list, we add the child
                         * just before. Thanks to circular linked property of
                         * the list, this means to put the child at the tail
                         * of the list that starts by parent.
                         */
                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
                                &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
                        /*
                         * The branch is now connected to its tree so we can
                         * reset tmp_alone_branch to the beginning of the
                         * list.
                         */
                        rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                } else if (!cfs_rq->tg->parent) {
                        /*
                         * cfs rq without parent should be put
                         * at the tail of the list.
                         */
                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
                                &rq->leaf_cfs_rq_list);
                        /*
                         * We have reach the beg of a tree so we can reset
                         * tmp_alone_branch to the beginning of the list.
                         */
                        rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                } else {
                        /*
                         * The parent has not already been added so we want to
                         * make sure that it will be put after us.
                         * tmp_alone_branch points to the beg of the branch
                         * where we will add parent.
                         */
                        list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
                                rq->tmp_alone_branch);
                        /*
                         * update tmp_alone_branch to points to the new beg
                         * of the branch
                         */
                        rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
                }
        struct rq *rq = rq_of(cfs_rq);
        int cpu = cpu_of(rq);

                cfs_rq->on_list = 1;
        if (cfs_rq->on_list)
                return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

        cfs_rq->on_list = 1;

        /*
         * Ensure we either appear before our parent (if already
         * enqueued) or force our parent to appear after us when it is
         * enqueued. The fact that we always enqueue bottom-up
         * reduces this to two cases and a special case for the root
         * cfs_rq. Furthermore, it also means that we will always reset
         * tmp_alone_branch either when the branch is connected
         * to a tree or when we reach the top of the tree
         */
        if (cfs_rq->tg->parent &&
            cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
                /*
                 * If parent is already on the list, we add the child
                 * just before. Thanks to circular linked property of
                 * the list, this means to put the child at the tail
                 * of the list that starts by parent.
                 */
                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
                        &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
                /*
                 * The branch is now connected to its tree so we can
                 * reset tmp_alone_branch to the beginning of the
                 * list.
                 */
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                return true;
        }

        if (!cfs_rq->tg->parent) {
                /*
                 * cfs rq without parent should be put
                 * at the tail of the list.
                 */
                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
                        &rq->leaf_cfs_rq_list);
                /*
                 * We have reach the top of a tree so we can reset
                 * tmp_alone_branch to the beginning of the list.
                 */
                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
                return true;
        }

        /*
         * The parent has not already been added so we want to
         * make sure that it will be put after us.
         * tmp_alone_branch points to the begin of the branch
         * where we will add parent.
         */
        list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
        /*
         * update tmp_alone_branch to points to the new begin
         * of the branch
         */
        rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (cfs_rq->on_list) {
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
|
||||
/*
|
||||
* With cfs_rq being unthrottled/throttled during an enqueue,
|
||||
* it can happen the tmp_alone_branch points the a leaf that
|
||||
* we finally want to del. In this case, tmp_alone_branch moves
|
||||
* to the prev element but it will point to rq->leaf_cfs_rq_list
|
||||
* at the end of the enqueue.
|
||||
*/
|
||||
if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
|
||||
rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
|
||||
|
||||
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
|
||||
cfs_rq->on_list = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Iterate through all leaf cfs_rq's on a runqueue: */
|
||||
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
|
||||
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
|
||||
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
|
||||
{
|
||||
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
|
||||
}
|
||||
|
||||
/* Iterate thr' all leaf cfs_rq's on a runqueue */
|
||||
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
|
||||
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
|
||||
leaf_cfs_rq_list)
|
||||
|
||||
/* Do the two (enqueued) entities belong to the same group ? */
|
||||
static inline struct cfs_rq *
|
||||
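The leaf_cfs_rq_list maintenance in the hunk above leans on one property of circular doubly-linked lists: linking a node immediately before an existing entry is the same as appending it to the tail of the sub-list that starts at that entry, which is how a child cfs_rq is guaranteed to be walked before its parent. The sketch below is a minimal userspace illustration of that property with a hand-rolled list rather than the kernel's <linux/list.h>; the struct and variable names are invented for the example.

/*
 * Userspace sketch (not kernel code): list_add_tail(new, pos) links @new
 * right before @pos, so a child linked "at the tail of the list that starts
 * by parent" is always visited before that parent on a forward walk.
 */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void init_list_head(struct list_head *h) { h->prev = h->next = h; }

/* Insert @new between @prev and @next. */
static void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next)
{
        next->prev = new;
        new->next = next;
        new->prev = prev;
        prev->next = new;
}

/* Same contract as the kernel helper: link @new just before @pos. */
static void list_add_tail(struct list_head *new, struct list_head *pos)
{
        __list_add(new, pos->prev, pos);
}

struct node { const char *name; struct list_head link; };

int main(void)
{
        struct list_head head;
        struct node root = { "root cfs_rq" }, parent = { "parent tg" }, child = { "child tg" };
        struct list_head *p;

        init_list_head(&head);

        /* A cfs_rq without a parent goes at the tail of the whole list. */
        list_add_tail(&root.link, &head);
        /* Pretend the parent group was enqueued earlier, also at the tail. */
        list_add_tail(&parent.link, &head);
        /* The child is linked just before its parent: bottom-up walk order. */
        list_add_tail(&child.link, &parent.link);

        for (p = head.next; p != &head; p = p->next) {
                struct node *n = (struct node *)((char *)p - offsetof(struct node, link));
                printf("%s\n", n->name);
        }
        /* Prints: root cfs_rq, child tg, parent tg -- the child precedes its parent. */
        return 0;
}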
@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
|
||||
return container_of(se, struct task_struct, se);
|
||||
}
|
||||
|
||||
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return container_of(cfs_rq, struct rq, cfs);
|
||||
}
|
||||
|
||||
|
||||
#define for_each_sched_entity(se) \
|
||||
for (; se; se = NULL)
|
||||
|
||||
@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
}
|
||||
|
||||
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
|
||||
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
|
||||
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
|
||||
{
|
||||
}
|
||||
|
||||
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
|
||||
for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
|
||||
|
||||
static inline struct sched_entity *parent_entity(struct sched_entity *se)
|
||||
{
|
||||
@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
return calc_delta_fair(sched_slice(cfs_rq, se), se);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#include "pelt.h"
|
||||
#include "sched-pelt.h"
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
|
||||
static unsigned long task_h_load(struct task_struct *p);
|
||||
@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
|
||||
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
|
||||
* if util_avg > util_avg_cap.
|
||||
*/
|
||||
void post_init_entity_util_avg(struct sched_entity *se)
|
||||
void post_init_entity_util_avg(struct task_struct *p)
|
||||
{
|
||||
struct sched_entity *se = &p->se;
|
||||
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
struct sched_avg *sa = &se->avg;
|
||||
long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
|
||||
@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
|
||||
}
|
||||
}
|
||||
|
||||
if (entity_is_task(se)) {
|
||||
struct task_struct *p = task_of(se);
|
||||
if (p->sched_class != &fair_sched_class) {
|
||||
/*
|
||||
* For !fair tasks do:
|
||||
*
|
||||
update_cfs_rq_load_avg(now, cfs_rq);
|
||||
attach_entity_load_avg(cfs_rq, se, 0);
|
||||
switched_from_fair(rq, p);
|
||||
*
|
||||
* such that the next switched_to_fair() has the
|
||||
* expected state.
|
||||
*/
|
||||
se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
|
||||
return;
|
||||
}
|
||||
if (p->sched_class != &fair_sched_class) {
|
||||
/*
|
||||
* For !fair tasks do:
|
||||
*
|
||||
update_cfs_rq_load_avg(now, cfs_rq);
|
||||
attach_entity_load_avg(cfs_rq, se, 0);
|
||||
switched_from_fair(rq, p);
|
||||
*
|
||||
* such that the next switched_to_fair() has the
|
||||
* expected state.
|
||||
*/
|
||||
se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
|
||||
return;
|
||||
}
|
||||
|
||||
attach_entity_cfs_rq(se);
|
||||
@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
|
||||
void init_entity_runnable_average(struct sched_entity *se)
|
||||
{
|
||||
}
|
||||
void post_init_entity_util_avg(struct sched_entity *se)
|
||||
void post_init_entity_util_avg(struct task_struct *p)
|
||||
{
|
||||
}
|
||||
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
|
||||
@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
|
||||
unsigned int sysctl_numa_balancing_scan_delay = 1000;
|
||||
|
||||
struct numa_group {
|
||||
atomic_t refcount;
|
||||
refcount_t refcount;
|
||||
|
||||
spinlock_t lock; /* nr_tasks, tasks */
|
||||
int nr_tasks;
|
||||
@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)
|
||||
unsigned long shared = group_faults_shared(ng);
|
||||
unsigned long private = group_faults_priv(ng);
|
||||
|
||||
period *= atomic_read(&ng->refcount);
|
||||
period *= refcount_read(&ng->refcount);
|
||||
period *= shared + 1;
|
||||
period /= private + shared + 1;
|
||||
}
|
||||
@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)
|
||||
unsigned long private = group_faults_priv(ng);
|
||||
unsigned long period = smax;
|
||||
|
||||
period *= atomic_read(&ng->refcount);
|
||||
period *= refcount_read(&ng->refcount);
|
||||
period *= shared + 1;
|
||||
period /= private + shared + 1;
|
||||
|
||||
@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)
|
||||
|
||||
static inline int get_numa_group(struct numa_group *grp)
|
||||
{
|
||||
return atomic_inc_not_zero(&grp->refcount);
|
||||
return refcount_inc_not_zero(&grp->refcount);
|
||||
}
|
||||
|
||||
static inline void put_numa_group(struct numa_group *grp)
|
||||
{
|
||||
if (atomic_dec_and_test(&grp->refcount))
|
||||
if (refcount_dec_and_test(&grp->refcount))
|
||||
kfree_rcu(grp, rcu);
|
||||
}
|
||||
|
||||
@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
||||
if (!grp)
|
||||
return;
|
||||
|
||||
atomic_set(&grp->refcount, 1);
|
||||
refcount_set(&grp->refcount, 1);
|
||||
grp->active_nodes = 1;
|
||||
grp->max_faults_cpu = 0;
|
||||
spin_lock_init(&grp->lock);
|
||||
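The numa_group hunks above convert atomic_t to refcount_t, whose API names the two common reference-count operations: take a reference only if the object is still live (inc_not_zero) and free on the last put (dec_and_test). The snippet below approximates those two operations in userspace with C11 atomics purely to show the usage pattern; it is not the kernel's refcount.h, which additionally saturates instead of wrapping on overflow, and the struct is a stand-in.

/* Userspace sketch of the refcount_inc_not_zero()/refcount_dec_and_test()
 * pattern applied to struct numa_group above. C11 atomics stand in for the
 * kernel API; overflow saturation is deliberately omitted. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct numa_group_demo {
        atomic_uint refcount;
        int nr_tasks;
};

/* Take a reference only if the count has not already dropped to zero. */
static bool get_group(struct numa_group_demo *grp)
{
        unsigned int old = atomic_load(&grp->refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&grp->refcount, &old, old + 1))
                        return true;    /* reference taken */
        }
        return false;                   /* object is already dying */
}

/* Drop a reference; whoever drops the last one frees the object. */
static void put_group(struct numa_group_demo *grp)
{
        if (atomic_fetch_sub(&grp->refcount, 1) == 1) {
                printf("last reference dropped, freeing group\n");
                free(grp);
        }
}

int main(void)
{
        struct numa_group_demo *grp = calloc(1, sizeof(*grp));

        atomic_store(&grp->refcount, 1);        /* creator holds the first ref */

        if (get_group(grp))                     /* a second user joins the group */
                printf("joined, refcount=%u\n", atomic_load(&grp->refcount));

        put_group(grp);                         /* second user leaves */
        put_group(grp);                         /* creator's put frees it */
        return 0;
}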
@ -3122,7 +3135,7 @@ void set_task_rq_fair(struct sched_entity *se,
|
||||
p_last_update_time = prev->avg.last_update_time;
|
||||
n_last_update_time = next->avg.last_update_time;
|
||||
#endif
|
||||
__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
|
||||
__update_load_avg_blocked_se(p_last_update_time, se);
|
||||
se->avg.last_update_time = n_last_update_time;
|
||||
}
|
||||
|
||||
@ -3257,11 +3270,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
|
||||
|
||||
/*
|
||||
* runnable_sum can't be lower than running_sum
|
||||
* As running sum is scale with CPU capacity wehreas the runnable sum
|
||||
* is not we rescale running_sum 1st
|
||||
* Rescale running sum to be in the same range as runnable sum
|
||||
* running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
|
||||
* runnable_sum is in [0 : LOAD_AVG_MAX]
|
||||
*/
|
||||
running_sum = se->avg.util_sum /
|
||||
arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
|
||||
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
|
||||
runnable_sum = max(runnable_sum, running_sum);
|
||||
|
||||
load_sum = (s64)se_weight(se) * runnable_sum;
|
||||
@ -3364,7 +3377,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
|
||||
|
||||
/**
|
||||
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
|
||||
* @now: current time, as per cfs_rq_clock_task()
|
||||
* @now: current time, as per cfs_rq_clock_pelt()
|
||||
* @cfs_rq: cfs_rq to update
|
||||
*
|
||||
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
|
||||
@ -3409,7 +3422,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
||||
decayed = 1;
|
||||
}
|
||||
|
||||
decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
|
||||
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
|
||||
|
||||
#ifndef CONFIG_64BIT
|
||||
smp_wmb();
|
||||
@ -3499,9 +3512,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
|
||||
/* Update task and its cfs_rq load average */
|
||||
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
u64 now = cfs_rq_clock_task(cfs_rq);
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
int cpu = cpu_of(rq);
|
||||
u64 now = cfs_rq_clock_pelt(cfs_rq);
|
||||
int decayed;
|
||||
|
||||
/*
|
||||
@ -3509,7 +3520,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
|
||||
* track group sched_entity load average for task_h_load calc in migration
|
||||
*/
|
||||
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
|
||||
__update_load_avg_se(now, cpu, cfs_rq, se);
|
||||
__update_load_avg_se(now, cfs_rq, se);
|
||||
|
||||
decayed = update_cfs_rq_load_avg(now, cfs_rq);
|
||||
decayed |= propagate_entity_load_avg(se);
|
||||
@ -3561,7 +3572,7 @@ void sync_entity_load_avg(struct sched_entity *se)
|
||||
u64 last_update_time;
|
||||
|
||||
last_update_time = cfs_rq_last_update_time(cfs_rq);
|
||||
__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
|
||||
__update_load_avg_blocked_se(last_update_time, se);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3577,10 +3588,6 @@ void remove_entity_load_avg(struct sched_entity *se)
|
||||
* tasks cannot exit without having gone through wake_up_new_task() ->
|
||||
* post_init_entity_util_avg() which will have added things to the
|
||||
* cfs_rq, so we can remove unconditionally.
|
||||
*
|
||||
* Similarly for groups, they will have passed through
|
||||
* post_init_entity_util_avg() before unregister_sched_fair_group()
|
||||
* calls this.
|
||||
*/
|
||||
|
||||
sync_entity_load_avg(se);
|
||||
@ -3654,6 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
|
||||
{
|
||||
long last_ewma_diff;
|
||||
struct util_est ue;
|
||||
int cpu;
|
||||
|
||||
if (!sched_feat(UTIL_EST))
|
||||
return;
|
||||
@ -3687,6 +3695,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
|
||||
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* To avoid overestimation of actual task utilization, skip updates if
|
||||
* we cannot grant there is idle time in this CPU.
|
||||
*/
|
||||
cpu = cpu_of(rq_of(cfs_rq));
|
||||
if (task_util(p) > capacity_orig_of(cpu))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Update Task's estimated utilization
|
||||
*
|
||||
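The util_est hunk above adds an early return so a task's estimated utilization is not refreshed when its measured utilization already exceeds the CPU's original capacity, i.e. when no idle time can be observed and the sample is necessarily clipped. Below is a compact userspace sketch of that gate together with an EWMA-style update; the capacity values and helper names are illustrative assumptions rather than the scheduler's exact code, and the 1/4 weight and ~1% margin mirror the kernel constants only approximately.

/* Userspace sketch of the dequeue-time util_est update: skip the EWMA
 * update when the task's utilization exceeds the CPU's original capacity
 * (no idle time, sample unreliable), otherwise fold the new sample in. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024
#define UTIL_EST_WEIGHT_SHIFT   2       /* new sample weighted 1/4 */

struct util_est_demo { unsigned long ewma, enqueued; };

static unsigned long capacity_orig_of_demo(int cpu)
{
        /* assume a big.LITTLE-style system: CPU0 little, CPU1 big */
        return cpu == 0 ? 512 : 1024;
}

static void util_est_update(struct util_est_demo *ue, unsigned long task_util, int cpu)
{
        long diff = (long)task_util - (long)ue->ewma;

        /* within ~1% of the current estimate: not worth updating */
        if (diff < 0)
                diff = -diff;
        if (diff < SCHED_CAPACITY_SCALE / 100)
                return;

        /* utilization higher than the CPU can deliver: skip the update */
        if (task_util > capacity_orig_of_demo(cpu))
                return;

        /* ewma += (sample - ewma) / 4 */
        ue->ewma += ((long)task_util - (long)ue->ewma) >> UTIL_EST_WEIGHT_SHIFT;
        ue->enqueued = task_util;
}

int main(void)
{
        struct util_est_demo ue = { .ewma = 300 };

        util_est_update(&ue, 600, 0);   /* 600 > 512: skipped on the little CPU */
        printf("after clipped sample on CPU0: ewma=%lu\n", ue.ewma);

        util_est_update(&ue, 600, 1);   /* accepted on the big CPU */
        printf("after sample on CPU1:        ewma=%lu\n", ue.ewma);
        return 0;
}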
@ -4429,6 +4445,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
|
||||
/* adjust cfs_rq_clock_task() */
|
||||
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
|
||||
cfs_rq->throttled_clock_task;
|
||||
|
||||
/* Add cfs_rq with already running entity in the list */
|
||||
if (cfs_rq->nr_running >= 1)
|
||||
list_add_leaf_cfs_rq(cfs_rq);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -4440,8 +4460,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
|
||||
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
|
||||
|
||||
/* group is entering throttled state, stop time */
|
||||
if (!cfs_rq->throttle_count)
|
||||
if (!cfs_rq->throttle_count) {
|
||||
cfs_rq->throttled_clock_task = rq_clock_task(rq);
|
||||
list_del_leaf_cfs_rq(cfs_rq);
|
||||
}
|
||||
cfs_rq->throttle_count++;
|
||||
|
||||
return 0;
|
||||
@ -4544,6 +4566,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
break;
|
||||
}
|
||||
|
||||
assert_list_leaf_cfs_rq(rq);
|
||||
|
||||
if (!se)
|
||||
add_nr_running(rq, task_delta);
|
||||
|
||||
@ -4565,7 +4589,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
struct rq_flags rf;
|
||||
|
||||
rq_lock(rq, &rf);
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
if (!cfs_rq_throttled(cfs_rq))
|
||||
goto next;
|
||||
|
||||
@ -4582,7 +4606,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
|
||||
unthrottle_cfs_rq(cfs_rq);
|
||||
|
||||
next:
|
||||
rq_unlock(rq, &rf);
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
|
||||
if (!remaining)
|
||||
break;
|
||||
@ -4598,7 +4622,7 @@ next:
|
||||
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
|
||||
* used to track this state.
|
||||
*/
|
||||
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
|
||||
{
|
||||
u64 runtime, runtime_expires;
|
||||
int throttled;
|
||||
@ -4640,11 +4664,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
|
||||
runtime = cfs_b->runtime;
|
||||
cfs_b->distribute_running = 1;
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
/* we can't nest cfs_b->lock while distributing bandwidth */
|
||||
runtime = distribute_cfs_runtime(cfs_b, runtime,
|
||||
runtime_expires);
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
|
||||
cfs_b->distribute_running = 0;
|
||||
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
|
||||
@ -4753,17 +4777,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
{
|
||||
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
|
||||
unsigned long flags;
|
||||
u64 expires;
|
||||
|
||||
/* confirm we're still not at a refresh boundary */
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
if (cfs_b->distribute_running) {
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
return;
|
||||
}
|
||||
|
||||
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -4774,18 +4799,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
if (runtime)
|
||||
cfs_b->distribute_running = 1;
|
||||
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
|
||||
if (!runtime)
|
||||
return;
|
||||
|
||||
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
if (expires == cfs_b->runtime_expires)
|
||||
lsub_positive(&cfs_b->runtime, runtime);
|
||||
cfs_b->distribute_running = 0;
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -4863,20 +4888,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
|
||||
{
|
||||
struct cfs_bandwidth *cfs_b =
|
||||
container_of(timer, struct cfs_bandwidth, period_timer);
|
||||
unsigned long flags;
|
||||
int overrun;
|
||||
int idle = 0;
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
for (;;) {
|
||||
overrun = hrtimer_forward_now(timer, cfs_b->period);
|
||||
if (!overrun)
|
||||
break;
|
||||
|
||||
idle = do_sched_cfs_period_timer(cfs_b, overrun);
|
||||
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
|
||||
}
|
||||
if (idle)
|
||||
cfs_b->period_active = 0;
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
|
||||
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
|
||||
}
|
||||
@ -4986,6 +5012,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
|
||||
}
|
||||
|
||||
#else /* CONFIG_CFS_BANDWIDTH */
|
||||
|
||||
static inline bool cfs_bandwidth_used(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return rq_clock_task(rq_of(cfs_rq));
|
||||
@ -5177,6 +5209,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
}
|
||||
|
||||
if (cfs_bandwidth_used()) {
|
||||
/*
|
||||
* When bandwidth control is enabled; the cfs_rq_throttled()
|
||||
* breaks in the above iteration can result in incomplete
|
||||
* leaf list maintenance, resulting in triggering the assertion
|
||||
* below.
|
||||
*/
|
||||
for_each_sched_entity(se) {
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
|
||||
if (list_add_leaf_cfs_rq(cfs_rq))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert_list_leaf_cfs_rq(rq);
|
||||
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
@ -5556,11 +5605,6 @@ static unsigned long capacity_of(int cpu)
|
||||
return cpu_rq(cpu)->cpu_capacity;
|
||||
}
|
||||
|
||||
static unsigned long capacity_orig_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
|
||||
static unsigned long cpu_avg_load_per_task(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
@ -6053,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
|
||||
bool idle = true;
|
||||
|
||||
for_each_cpu(cpu, cpu_smt_mask(core)) {
|
||||
cpumask_clear_cpu(cpu, cpus);
|
||||
__cpumask_clear_cpu(cpu, cpus);
|
||||
if (!available_idle_cpu(cpu))
|
||||
idle = false;
|
||||
}
|
||||
@ -6073,7 +6117,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
|
||||
/*
|
||||
* Scan the local SMT mask for idle CPUs.
|
||||
*/
|
||||
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
static int select_idle_smt(struct task_struct *p, int target)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
@ -6097,7 +6141,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
|
||||
return -1;
|
||||
}
|
||||
|
||||
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
static inline int select_idle_smt(struct task_struct *p, int target)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
@ -6202,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
||||
if ((unsigned)i < nr_cpumask_bits)
|
||||
return i;
|
||||
|
||||
i = select_idle_smt(p, sd, target);
|
||||
i = select_idle_smt(p, target);
|
||||
if ((unsigned)i < nr_cpumask_bits)
|
||||
return i;
|
||||
|
||||
@ -6608,7 +6652,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
||||
if (sd_flag & SD_BALANCE_WAKE) {
|
||||
record_wakee(p);
|
||||
|
||||
if (static_branch_unlikely(&sched_energy_present)) {
|
||||
if (sched_energy_enabled()) {
|
||||
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
|
||||
if (new_cpu >= 0)
|
||||
return new_cpu;
|
||||
@ -7027,6 +7071,12 @@ idle:
|
||||
if (new_tasks > 0)
|
||||
goto again;
|
||||
|
||||
/*
|
||||
* rq is about to be idle, check if we need to update the
|
||||
* lost_idle_time of clock_pelt
|
||||
*/
|
||||
update_idle_rq_clock_pelt(rq);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -7647,10 +7697,27 @@ static inline bool others_have_blocked(struct rq *rq)
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
|
||||
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (cfs_rq->load.weight)
|
||||
return false;
|
||||
|
||||
if (cfs_rq->avg.load_sum)
|
||||
return false;
|
||||
|
||||
if (cfs_rq->avg.util_sum)
|
||||
return false;
|
||||
|
||||
if (cfs_rq->avg.runnable_load_sum)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void update_blocked_averages(int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct cfs_rq *cfs_rq;
|
||||
struct cfs_rq *cfs_rq, *pos;
|
||||
const struct sched_class *curr_class;
|
||||
struct rq_flags rf;
|
||||
bool done = true;
|
||||
@ -7662,14 +7729,10 @@ static void update_blocked_averages(int cpu)
|
||||
* Iterates the task_group tree in a bottom up fashion, see
|
||||
* list_add_leaf_cfs_rq() for details.
|
||||
*/
|
||||
for_each_leaf_cfs_rq(rq, cfs_rq) {
|
||||
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
|
||||
struct sched_entity *se;
|
||||
|
||||
/* throttled entities do not contribute to load */
|
||||
if (throttled_hierarchy(cfs_rq))
|
||||
continue;
|
||||
|
||||
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
|
||||
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
|
||||
update_tg_load_avg(cfs_rq, 0);
|
||||
|
||||
/* Propagate pending load changes to the parent, if any: */
|
||||
@ -7677,14 +7740,21 @@ static void update_blocked_averages(int cpu)
|
||||
if (se && !skip_blocked_update(se))
|
||||
update_load_avg(cfs_rq_of(se), se, 0);
|
||||
|
||||
/*
|
||||
* There can be a lot of idle CPU cgroups. Don't let fully
|
||||
* decayed cfs_rqs linger on the list.
|
||||
*/
|
||||
if (cfs_rq_is_decayed(cfs_rq))
|
||||
list_del_leaf_cfs_rq(cfs_rq);
|
||||
|
||||
/* Don't need periodic decay once load/util_avg are null */
|
||||
if (cfs_rq_has_blocked(cfs_rq))
|
||||
done = false;
|
||||
}
|
||||
|
||||
curr_class = rq->curr->sched_class;
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
|
||||
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
|
||||
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
|
||||
update_irq_load_avg(rq, 0);
|
||||
/* Don't need periodic decay once load/util_avg are null */
|
||||
if (others_have_blocked(rq))
|
||||
@ -7754,11 +7824,11 @@ static inline void update_blocked_averages(int cpu)
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
update_rq_clock(rq);
|
||||
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
|
||||
update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
|
||||
|
||||
curr_class = rq->curr->sched_class;
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
|
||||
update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
|
||||
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
|
||||
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
|
||||
update_irq_load_avg(rq, 0);
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
rq->last_blocked_load_update_tick = jiffies;
|
||||
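update_blocked_averages() above switches to for_each_leaf_cfs_rq_safe() precisely because it may now delete fully-decayed cfs_rq entries while walking the list: a "_safe" iterator remembers the next element before the body runs, so unlinking the current one does not derail the walk. The fragment below models that pattern on a plain singly-linked list in userspace; the list layout and the "decayed" rule are invented for the demo.

/* Userspace model of the safe-iteration idea: cache @nxt before visiting
 * @pos, so the body may unlink and free "decayed" entries mid-walk. */
#include <stdio.h>
#include <stdlib.h>

struct cfs_rq_demo {
        const char *name;
        unsigned long load_sum;         /* zero == fully decayed */
        struct cfs_rq_demo *next;
};

static struct cfs_rq_demo *push(struct cfs_rq_demo *head, const char *name, unsigned long load)
{
        struct cfs_rq_demo *n = malloc(sizeof(*n));

        n->name = name;
        n->load_sum = load;
        n->next = head;
        return n;
}

int main(void)
{
        struct cfs_rq_demo *head = NULL, *pos, *nxt, **link;

        head = push(head, "cgroup-a", 0);       /* idle, fully decayed */
        head = push(head, "cgroup-b", 512);
        head = push(head, "cgroup-c", 0);       /* idle, fully decayed */

        /* "safe" walk: remember @nxt before possibly freeing @pos */
        for (link = &head, pos = head; pos; pos = nxt) {
                nxt = pos->next;
                if (pos->load_sum == 0) {
                        printf("dropping decayed %s\n", pos->name);
                        *link = nxt;            /* unlink */
                        free(pos);
                        continue;               /* @link still points at the hole */
                }
                link = &pos->next;
        }

        for (pos = head; pos; pos = pos->next)
                printf("kept %s (load_sum=%lu)\n", pos->name, pos->load_sum);
        while (head) { pos = head; head = head->next; free(pos); }
        return 0;
}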
@ -8452,9 +8522,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
|
||||
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
|
||||
return 0;
|
||||
|
||||
env->imbalance = DIV_ROUND_CLOSEST(
|
||||
sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
|
||||
SCHED_CAPACITY_SCALE);
|
||||
env->imbalance = sds->busiest_stat.group_load;
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -8636,7 +8704,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
||||
*/
|
||||
update_sd_lb_stats(env, &sds);
|
||||
|
||||
if (static_branch_unlikely(&sched_energy_present)) {
|
||||
if (sched_energy_enabled()) {
|
||||
struct root_domain *rd = env->dst_rq->rd;
|
||||
|
||||
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
|
||||
@ -8827,21 +8895,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
||||
*/
|
||||
#define MAX_PINNED_INTERVAL 512
|
||||
|
||||
static int need_active_balance(struct lb_env *env)
|
||||
static inline bool
|
||||
asym_active_balance(struct lb_env *env)
|
||||
{
|
||||
/*
|
||||
* ASYM_PACKING needs to force migrate tasks from busy but
|
||||
* lower priority CPUs in order to pack all tasks in the
|
||||
* highest priority CPUs.
|
||||
*/
|
||||
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
|
||||
sched_asym_prefer(env->dst_cpu, env->src_cpu);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
voluntary_active_balance(struct lb_env *env)
|
||||
{
|
||||
struct sched_domain *sd = env->sd;
|
||||
|
||||
if (env->idle == CPU_NEWLY_IDLE) {
|
||||
|
||||
/*
|
||||
* ASYM_PACKING needs to force migrate tasks from busy but
|
||||
* lower priority CPUs in order to pack all tasks in the
|
||||
* highest priority CPUs.
|
||||
*/
|
||||
if ((sd->flags & SD_ASYM_PACKING) &&
|
||||
sched_asym_prefer(env->dst_cpu, env->src_cpu))
|
||||
return 1;
|
||||
}
|
||||
if (asym_active_balance(env))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
|
||||
@ -8859,6 +8931,16 @@ static int need_active_balance(struct lb_env *env)
|
||||
if (env->src_grp_type == group_misfit_task)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int need_active_balance(struct lb_env *env)
|
||||
{
|
||||
struct sched_domain *sd = env->sd;
|
||||
|
||||
if (voluntary_active_balance(env))
|
||||
return 1;
|
||||
|
||||
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
||||
}
|
||||
|
||||
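The hunk above splits need_active_balance() so that the "voluntary" reasons for an active balance get their own helper, which the load-balancing loop further down reuses when deciding whether to reset the balance interval. The fragment below is a bare-bones userspace restatement of that refactoring shape only; the env fields and thresholds are placeholders, not the scheduler's.

/* Shape of the need_active_balance() refactor: name the "voluntary"
 * conditions so a second call site can reuse them. Fields are placeholders. */
#include <stdbool.h>
#include <stdio.h>

struct lb_env_demo {
        bool asym_packing;      /* dst CPU has higher priority than src */
        bool newly_idle_pull;   /* dst just went idle, src has one big task */
        bool misfit_task;       /* task too big for the src CPU */
        int nr_balance_failed;
        int cache_nice_tries;
};

static bool voluntary_active_balance(const struct lb_env_demo *env)
{
        return env->asym_packing || env->newly_idle_pull || env->misfit_task;
}

static bool need_active_balance(const struct lb_env_demo *env)
{
        if (voluntary_active_balance(env))
                return true;
        /* last resort: repeated failures force a migration anyway */
        return env->nr_balance_failed > env->cache_nice_tries + 2;
}

int main(void)
{
        struct lb_env_demo env = { .misfit_task = true, .cache_nice_tries = 1 };

        printf("need_active_balance: %d\n", need_active_balance(&env));
        /* Second consumer, mirroring the balance-interval reset later in the
         * diff: only a voluntary active balance resets the interval. */
        printf("reset balance interval: %d\n", voluntary_active_balance(&env));
        return 0;
}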
@ -9023,7 +9105,7 @@ more_balance:
|
||||
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
|
||||
|
||||
/* Prevent to re-select dst_cpu via env's CPUs */
|
||||
cpumask_clear_cpu(env.dst_cpu, env.cpus);
|
||||
__cpumask_clear_cpu(env.dst_cpu, env.cpus);
|
||||
|
||||
env.dst_rq = cpu_rq(env.new_dst_cpu);
|
||||
env.dst_cpu = env.new_dst_cpu;
|
||||
@ -9050,7 +9132,7 @@ more_balance:
|
||||
|
||||
/* All tasks on this runqueue were pinned by CPU affinity */
|
||||
if (unlikely(env.flags & LBF_ALL_PINNED)) {
|
||||
cpumask_clear_cpu(cpu_of(busiest), cpus);
|
||||
__cpumask_clear_cpu(cpu_of(busiest), cpus);
|
||||
/*
|
||||
* Attempting to continue load balancing at the current
|
||||
* sched_domain level only makes sense if there are
|
||||
@ -9120,7 +9202,7 @@ more_balance:
|
||||
} else
|
||||
sd->nr_balance_failed = 0;
|
||||
|
||||
if (likely(!active_balance)) {
|
||||
if (likely(!active_balance) || voluntary_active_balance(&env)) {
|
||||
/* We were unbalanced, so reset the balancing interval */
|
||||
sd->balance_interval = sd->min_interval;
|
||||
} else {
|
||||
@ -9469,15 +9551,8 @@ static void kick_ilb(unsigned int flags)
|
||||
}
|
||||
|
||||
/*
|
||||
* Current heuristic for kicking the idle load balancer in the presence
|
||||
* of an idle cpu in the system.
|
||||
* - This rq has more than one task.
|
||||
* - This rq has at least one CFS task and the capacity of the CPU is
|
||||
* significantly reduced because of RT tasks or IRQs.
|
||||
* - At parent of LLC scheduler domain level, this cpu's scheduler group has
|
||||
* multiple busy cpu.
|
||||
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
|
||||
* domain span are idle.
|
||||
* Current decision point for kicking the idle load balancer in the presence
|
||||
* of idle CPUs in the system.
|
||||
*/
|
||||
static void nohz_balancer_kick(struct rq *rq)
|
||||
{
|
||||
@ -9519,8 +9594,13 @@ static void nohz_balancer_kick(struct rq *rq)
|
||||
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
||||
if (sds) {
|
||||
/*
|
||||
* XXX: write a coherent comment on why we do this.
|
||||
* See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
|
||||
* If there is an imbalance between LLC domains (IOW we could
|
||||
* increase the overall cache use), we need some less-loaded LLC
|
||||
* domain to pull some load. Likewise, we may need to spread
|
||||
* load within the current LLC domain (e.g. packed SMT cores but
|
||||
* other CPUs are idle). We can't really know from here how busy
|
||||
* the others are - so just get a nohz balance going if it looks
|
||||
* like this LLC domain has tasks we could move.
|
||||
*/
|
||||
nr_busy = atomic_read(&sds->nr_busy_cpus);
|
||||
if (nr_busy > 1) {
|
||||
@ -9533,7 +9613,7 @@ static void nohz_balancer_kick(struct rq *rq)
|
||||
sd = rcu_dereference(rq->sd);
|
||||
if (sd) {
|
||||
if ((rq->cfs.h_nr_running >= 1) &&
|
||||
check_cpu_capacity(rq, sd)) {
|
||||
check_cpu_capacity(rq, sd)) {
|
||||
flags = NOHZ_KICK_MASK;
|
||||
goto unlock;
|
||||
}
|
||||
@ -9541,11 +9621,7 @@ static void nohz_balancer_kick(struct rq *rq)
|
||||
|
||||
sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
|
||||
if (sd) {
|
||||
for_each_cpu(i, sched_domain_span(sd)) {
|
||||
if (i == cpu ||
|
||||
!cpumask_test_cpu(i, nohz.idle_cpus_mask))
|
||||
continue;
|
||||
|
||||
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
|
||||
if (sched_asym_prefer(i, cpu)) {
|
||||
flags = NOHZ_KICK_MASK;
|
||||
goto unlock;
|
||||
@ -10546,10 +10622,10 @@ const struct sched_class fair_sched_class = {
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
void print_cfs_stats(struct seq_file *m, int cpu)
|
||||
{
|
||||
struct cfs_rq *cfs_rq;
|
||||
struct cfs_rq *cfs_rq, *pos;
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
|
||||
for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
|
||||
print_cfs_rq(m, cpu, cfs_rq);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
|
||||
cpumask_andnot(housekeeping_mask,
|
||||
cpu_possible_mask, non_housekeeping_mask);
|
||||
if (cpumask_empty(housekeeping_mask))
|
||||
cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
|
||||
__cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
|
||||
} else {
|
||||
cpumask_var_t tmp;
|
||||
|
||||
|
@ -26,7 +26,6 @@
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include "sched.h"
|
||||
#include "sched-pelt.h"
|
||||
#include "pelt.h"
|
||||
|
||||
/*
|
||||
@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
|
||||
* n=1
|
||||
*/
|
||||
static __always_inline u32
|
||||
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
||||
accumulate_sum(u64 delta, struct sched_avg *sa,
|
||||
unsigned long load, unsigned long runnable, int running)
|
||||
{
|
||||
unsigned long scale_freq, scale_cpu;
|
||||
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
|
||||
u64 periods;
|
||||
|
||||
scale_freq = arch_scale_freq_capacity(cpu);
|
||||
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
|
||||
|
||||
delta += sa->period_contrib;
|
||||
periods = delta / 1024; /* A period is 1024us (~1ms) */
|
||||
|
||||
@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
||||
}
|
||||
sa->period_contrib = delta;
|
||||
|
||||
contrib = cap_scale(contrib, scale_freq);
|
||||
if (load)
|
||||
sa->load_sum += load * contrib;
|
||||
if (runnable)
|
||||
sa->runnable_load_sum += runnable * contrib;
|
||||
if (running)
|
||||
sa->util_sum += contrib * scale_cpu;
|
||||
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
|
||||
|
||||
return periods;
|
||||
}
|
||||
@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
||||
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
||||
*/
|
||||
static __always_inline int
|
||||
___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
|
||||
___update_load_sum(u64 now, struct sched_avg *sa,
|
||||
unsigned long load, unsigned long runnable, int running)
|
||||
{
|
||||
u64 delta;
|
||||
@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
|
||||
* Step 1: accumulate *_sum since last_update_time. If we haven't
|
||||
* crossed period boundaries, finish.
|
||||
*/
|
||||
if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
|
||||
if (!accumulate_sum(delta, sa, load, runnable, running))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
|
||||
* runnable_load_avg = \Sum se->avg.runable_load_avg
|
||||
*/
|
||||
|
||||
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
||||
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
|
||||
{
|
||||
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
|
||||
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
return 1;
|
||||
}
|
||||
@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
|
||||
if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
|
||||
cfs_rq->curr == se)) {
|
||||
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
|
||||
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (___update_load_sum(now, cpu, &cfs_rq->avg,
|
||||
if (___update_load_sum(now, &cfs_rq->avg,
|
||||
scale_load_down(cfs_rq->load.weight),
|
||||
scale_load_down(cfs_rq->runnable_weight),
|
||||
cfs_rq->curr != NULL)) {
|
||||
@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
|
||||
|
||||
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
{
|
||||
if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
|
||||
if (___update_load_sum(now, &rq->avg_rt,
|
||||
running,
|
||||
running,
|
||||
running)) {
|
||||
@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
|
||||
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
{
|
||||
if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
|
||||
if (___update_load_sum(now, &rq->avg_dl,
|
||||
running,
|
||||
running,
|
||||
running)) {
|
||||
@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
||||
int update_irq_load_avg(struct rq *rq, u64 running)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* We can't use clock_pelt because irq time is not accounted in
|
||||
* clock_task. Instead we directly scale the running time to
|
||||
* reflect the real amount of computation
|
||||
*/
|
||||
running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
|
||||
running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
|
||||
|
||||
/*
|
||||
* We know the time that has been used by interrupt since last update
|
||||
* but we don't when. Let be pessimistic and assume that interrupt has
|
||||
* happened just before the update. This is not so far from reality
|
||||
* because interrupt will most probably wake up task and trig an update
|
||||
* of rq clock during which the metric si updated.
|
||||
* of rq clock during which the metric is updated.
|
||||
* We start to decay with normal context time and then we add the
|
||||
* interrupt context time.
|
||||
* We can safely remove running from rq->clock because
|
||||
* rq->clock += delta with delta >= running
|
||||
*/
|
||||
ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
|
||||
ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
|
||||
0,
|
||||
0,
|
||||
0);
|
||||
ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
|
||||
ret += ___update_load_sum(rq->clock, &rq->avg_irq,
|
||||
1,
|
||||
1,
|
||||
1);
|
||||
|
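Most of the pelt.c changes above drop the cpu argument because frequency and capacity invariance is no longer applied inside accumulate_sum(): the running delta is pre-scaled once, when clock_pelt advances. The arithmetic underneath is still the same geometric series, decayed by y every 1024us period with y^32 = 1/2. The numeric sketch below only illustrates that series; the iteration counts are arbitrary, and the floating-point limit lands slightly above the kernel's integer LOAD_AVG_MAX constant.

/* Standalone numeric sketch of the PELT geometric series: each 1024us
 * period decays the sum by y (y^32 = 1/2) and adds 1024 if the entity ran.
 * Compile with -lm. */
#include <math.h>
#include <stdio.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0); /* per-period decay */
        double sum = 0.0, before;
        int period;

        /* Always running: sum approaches 1024 / (1 - y). */
        for (period = 0; period < 1000; period++)
                sum = sum * y + 1024.0;
        printf("saturated running sum ~= %.0f (kernel LOAD_AVG_MAX = 47742)\n", sum);

        /* Now idle: after 32 periods the remembered contribution has halved. */
        before = sum;
        for (period = 0; period < 32; period++)
                sum = sum * y;
        printf("after 32 idle periods: %.0f (about half of %.0f)\n", sum, before);
        return 0;
}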
@ -1,8 +1,9 @@
|
||||
#ifdef CONFIG_SMP
|
||||
#include "sched-pelt.h"
|
||||
|
||||
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
|
||||
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
|
||||
int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
|
||||
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
|
||||
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
|
||||
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
|
||||
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
|
||||
@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
|
||||
WRITE_ONCE(avg->util_est.enqueued, enqueued);
|
||||
}
|
||||
|
||||
/*
|
||||
* The clock_pelt scales the time to reflect the effective amount of
|
||||
* computation done during the running delta time but then sync back to
|
||||
* clock_task when rq is idle.
|
||||
*
|
||||
*
|
||||
* absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
|
||||
* @ max capacity ------******---------------******---------------
|
||||
* @ half capacity ------************---------************---------
|
||||
* clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
|
||||
*
|
||||
*/
|
||||
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
|
||||
{
|
||||
if (unlikely(is_idle_task(rq->curr))) {
|
||||
/* The rq is idle, we can sync to clock_task */
|
||||
rq->clock_pelt = rq_clock_task(rq);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* When a rq runs at a lower compute capacity, it will need
|
||||
* more time to do the same amount of work than at max
|
||||
* capacity. In order to be invariant, we scale the delta to
|
||||
* reflect how much work has been really done.
|
||||
* Running longer results in stealing idle time that will
|
||||
* disturb the load signal compared to max capacity. This
|
||||
* stolen idle time will be automatically reflected when the
|
||||
* rq will be idle and the clock will be synced with
|
||||
* rq_clock_task.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Scale the elapsed time to reflect the real amount of
|
||||
* computation
|
||||
*/
|
||||
delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
|
||||
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
|
||||
|
||||
rq->clock_pelt += delta;
|
||||
}
|
||||
|
||||
/*
|
||||
* When rq becomes idle, we have to check if it has lost idle time
|
||||
* because it was fully busy. A rq is fully used when the /Sum util_sum
|
||||
* is greater or equal to:
|
||||
* (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
|
||||
* For optimization and computing rounding purpose, we don't take into account
|
||||
* the position in the current window (period_contrib) and we use the higher
|
||||
* bound of util_sum to decide.
|
||||
*/
|
||||
static inline void update_idle_rq_clock_pelt(struct rq *rq)
|
||||
{
|
||||
u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
|
||||
u32 util_sum = rq->cfs.avg.util_sum;
|
||||
util_sum += rq->avg_rt.util_sum;
|
||||
util_sum += rq->avg_dl.util_sum;
|
||||
|
||||
/*
|
||||
* Reflecting stolen time makes sense only if the idle
|
||||
* phase would be present at max capacity. As soon as the
|
||||
* utilization of a rq has reached the maximum value, it is
|
||||
* considered as an always runnig rq without idle time to
|
||||
* steal. This potential idle time is considered as lost in
|
||||
* this case. We keep track of this lost idle time compare to
|
||||
* rq's clock_task.
|
||||
*/
|
||||
if (util_sum >= divider)
|
||||
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
|
||||
}
|
||||
|
||||
static inline u64 rq_clock_pelt(struct rq *rq)
|
||||
{
|
||||
lockdep_assert_held(&rq->lock);
|
||||
assert_clock_updated(rq);
|
||||
|
||||
return rq->clock_pelt - rq->lost_idle_time;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
|
||||
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
if (unlikely(cfs_rq->throttle_count))
|
||||
return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
|
||||
|
||||
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
|
||||
}
|
||||
#else
|
||||
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return rq_clock_pelt(rq_of(cfs_rq));
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
static inline int
|
||||
@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline u64 rq_clock_pelt(struct rq *rq)
|
||||
{
|
||||
return rq_clock_task(rq);
|
||||
}
|
||||
|
||||
static inline void
|
||||
update_rq_clock_pelt(struct rq *rq, s64 delta) { }
|
||||
|
||||
static inline void
|
||||
update_idle_rq_clock_pelt(struct rq *rq) { }
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
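The ASCII timeline in the new pelt.h comment above can be reproduced with a few lines of arithmetic: while the rq runs, clock_pelt advances by the wall-clock delta scaled by the current capacity (and, in the kernel, frequency), and as soon as the rq goes idle it snaps back to clock_task. The simulation below assumes SCHED_CAPACITY_SHIFT = 10 as in the kernel and a made-up 50%-capacity CPU; the struct and helper names are invented.

/* Tiny simulation of the clock_pelt behaviour described above: scale the
 * running delta by capacity, then resync with clock_task at idle. */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

struct rq_demo {
        unsigned long long clock_task;  /* wall-clock-ish time, in us */
        unsigned long long clock_pelt;  /* capacity-invariant time */
        unsigned long capacity;         /* 1024 = full capacity */
};

static void tick(struct rq_demo *rq, unsigned long long delta, bool idle)
{
        rq->clock_task += delta;

        if (idle) {
                /* rq went idle: sync back so no "stolen" time lingers */
                rq->clock_pelt = rq->clock_task;
                return;
        }
        /* scale elapsed time by how much work was really done */
        rq->clock_pelt += (delta * rq->capacity) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
        struct rq_demo rq = { .capacity = SCHED_CAPACITY_SCALE / 2 };   /* 50% CPU */

        tick(&rq, 6, false);    /* 6us busy at half capacity */
        printf("busy: clock_task=%llu clock_pelt=%llu\n", rq.clock_task, rq.clock_pelt);

        tick(&rq, 4, true);     /* rq goes idle, clocks resync */
        printf("idle: clock_task=%llu clock_pelt=%llu\n", rq.clock_task, rq.clock_pelt);
        return 0;
}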
@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
||||
* rt task
|
||||
*/
|
||||
if (rq->curr->sched_class != &rt_sched_class)
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
|
||||
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
|
||||
|
||||
return p;
|
||||
}
|
||||
@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
update_curr_rt(rq);
|
||||
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
|
||||
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
|
||||
|
||||
/*
|
||||
* The previous task needs to be made eligible for pushing
|
||||
@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
|
||||
struct sched_rt_entity *rt_se = &p->rt;
|
||||
|
||||
update_curr_rt(rq);
|
||||
update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
|
||||
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
|
||||
|
||||
watchdog(rq, p);
|
||||
|
||||
|
@ -861,7 +861,10 @@ struct rq {
|
||||
|
||||
unsigned int clock_update_flags;
|
||||
u64 clock;
|
||||
u64 clock_task;
|
||||
/* Ensure that all clocks are in the same cache line */
|
||||
u64 clock_task ____cacheline_aligned;
|
||||
u64 clock_pelt;
|
||||
unsigned long lost_idle_time;
|
||||
|
||||
atomic_t nr_iowait;
|
||||
|
||||
@ -951,6 +954,22 @@ struct rq {
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
|
||||
/* CPU runqueue to which this cfs_rq is attached */
|
||||
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return cfs_rq->rq;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return container_of(cfs_rq, struct rq, cfs);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int cpu_of(struct rq *rq)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
|
||||
*/
|
||||
smp_wmb();
|
||||
#ifdef CONFIG_THREAD_INFO_IN_TASK
|
||||
p->cpu = cpu;
|
||||
WRITE_ONCE(p->cpu, cpu);
|
||||
#else
|
||||
task_thread_info(p)->cpu = cpu;
|
||||
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
|
||||
#endif
|
||||
p->wake_cpu = cpu;
|
||||
#endif
|
||||
@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p)
|
||||
|
||||
static inline int task_on_rq_migrating(struct task_struct *p)
|
||||
{
|
||||
return p->on_rq == TASK_ON_RQ_MIGRATING;
|
||||
return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
|
||||
unsigned long to_ratio(u64 period, u64 runtime);
|
||||
|
||||
extern void init_entity_runnable_average(struct sched_entity *se);
|
||||
extern void post_init_entity_util_avg(struct sched_entity *se);
|
||||
extern void post_init_entity_util_avg(struct task_struct *p);
|
||||
|
||||
#ifdef CONFIG_NO_HZ_FULL
|
||||
extern bool sched_can_stop_tick(struct rq *rq);
|
||||
@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
|
||||
# define arch_scale_freq_invariant() false
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static inline unsigned long capacity_orig_of(int cpu)
|
||||
{
|
||||
return cpu_rq(cpu)->cpu_capacity_orig;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
|
||||
/**
|
||||
* enum schedutil_type - CPU utilization type
|
||||
@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
|
||||
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
|
||||
#else
|
||||
#define perf_domain_span(pd) NULL
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
extern struct static_key_false sched_energy_present;
|
||||
#endif
|
||||
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
|
||||
|
||||
DECLARE_STATIC_KEY_FALSE(sched_energy_present);
|
||||
|
||||
static inline bool sched_energy_enabled(void)
|
||||
{
|
||||
return static_branch_unlikely(&sched_energy_present);
|
||||
}
|
||||
|
||||
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
|
||||
|
||||
#define perf_domain_span(pd) NULL
|
||||
static inline bool sched_energy_enabled(void) { return false; }
|
||||
|
||||
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
|
||||
|
@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
||||
return 1;
|
||||
}
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
|
||||
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
|
||||
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
|
||||
unsigned int sysctl_sched_energy_aware = 1;
|
||||
DEFINE_MUTEX(sched_energy_mutex);
|
||||
bool sched_energy_update;
|
||||
|
||||
#ifdef CONFIG_PROC_SYSCTL
|
||||
int sched_energy_aware_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret, state;
|
||||
|
||||
if (write && !capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (!ret && write) {
|
||||
state = static_branch_unlikely(&sched_energy_present);
|
||||
if (state != sysctl_sched_energy_aware) {
|
||||
mutex_lock(&sched_energy_mutex);
|
||||
sched_energy_update = 1;
|
||||
rebuild_sched_domains();
|
||||
sched_energy_update = 0;
|
||||
mutex_unlock(&sched_energy_mutex);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void free_pd(struct perf_domain *pd)
|
||||
{
|
||||
struct perf_domain *tmp;
|
||||
@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
|
||||
struct cpufreq_policy *policy;
|
||||
struct cpufreq_governor *gov;
|
||||
|
||||
if (!sysctl_sched_energy_aware)
|
||||
goto free;
|
||||
|
||||
/* EAS is enabled for asymmetric CPU capacity topologies. */
|
||||
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
|
||||
if (sched_debug()) {
|
||||
@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
||||
}
|
||||
|
||||
struct s_data {
|
||||
struct sched_domain ** __percpu sd;
|
||||
struct sched_domain * __percpu *sd;
|
||||
struct root_domain *rd;
|
||||
};
|
||||
|
||||
|
@@ -472,6 +472,17 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &one,
         },
 #endif
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+        {
+                .procname       = "sched_energy_aware",
+                .data           = &sysctl_sched_energy_aware,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = sched_energy_aware_handler,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+#endif
 #ifdef CONFIG_PROVE_LOCKING
         {
                 .procname       = "prove_locking",