Scheduler updates in this cycle are:
- Improve the scalability of the CFS bandwidth unthrottling logic with large number of CPUs. - Fix & rework various cpuidle routines, simplify interaction with the generic scheduler code. Add __cpuidle methods as noinstr to objtool's noinstr detection and fix boatloads of cpuidle bugs & quirks. - Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query previously issued registrations. - Limit scheduler slice duration to the sysctl_sched_latency period, to improve scheduling granularity with a large number of SCHED_IDLE tasks. - Debuggability enhancement on sys_exit(): warn about disabled IRQs, but also enable them to prevent a cascade of followup problems and repeat warnings. - Fix the rescheduling logic in prio_changed_dl(). - Micro-optimize cpufreq and sched-util methods. - Micro-optimize ttwu_runnable() - Micro-optimize the idle-scanning in update_numa_stats(), select_idle_capacity() and steal_cookie_task(). - Update the RSEQ code & self-tests - Constify various scheduler methods - Remove unused methods - Refine __init tags - Documentation updates - ... 
Misc other cleanups, fixes Signed-off-by: Ingo Molnar <mingo@kernel.org> -----BEGIN PGP SIGNATURE----- iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmPzbJwRHG1pbmdvQGtl cm5lbC5vcmcACgkQEnMQ0APhK1iIvA//ZcEaB8Z6ChLRQjM+bsaudKJu3pdLQbPK iYbP8Da+LsAfxbEfYuGV3m+jIp0LlBOtsI/EezxQrXV+V7FvNyAX9Y00eEu/zlj8 7Jn3LMy/DBYTwH7LwVdcU0MyIVI8ZPc6WNnkx0LOtGZn8n+qfHPSDzcP3CW+a5AV UvllPYpYyEmsX0Eby7CF4Ue8mSmbViw/xR3rNr8ZSve0c25XzKabw8O9kE3jiHxP d/zERJoAYeDyYUEuZqhfn5dTlB4an4IjNEkAfRE5SQ09RA8Gkxsa5Ar8gob9e9M1 eQsdd4/bdhnrkM8L5qDZczqmgCTZ2bukQrxkBXhRDhLgoFxwAn77b+2ZjmIW3Lae AyGqRcDSg1q2oxaYm5ZiuO/t26aDOZu9vPHyHRDGt95EGbZlrp+GgeePyfCigJYz UmPdZAAcHdSymnnnlcvdG37WVvaVkpgWZzd8LbtBi23QR+Zc4WQ2IlgnUS5WKNNf VOBcAcP6E1IslDotZDQCc2dPFFQoQQEssVooyUc5oMytm7BsvxXLOeHG+Ncu/8uc H+U8Qn8jnqTxJbC5hkWQIJlhVKCq2FJrHxxySYTKROfUNcDgCmxboFeAcXTCIU1K T0S+sdoTS/CvtLklRkG0j6B8N4N98mOd9cFwUV3tX+/gMLMep3hCQs5L76JagvC5 skkQXoONNaM= =l1nN -----END PGP SIGNATURE----- Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull scheduler updates from Ingo Molnar: - Improve the scalability of the CFS bandwidth unthrottling logic with large number of CPUs. - Fix & rework various cpuidle routines, simplify interaction with the generic scheduler code. Add __cpuidle methods as noinstr to objtool's noinstr detection and fix boatloads of cpuidle bugs & quirks. - Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query previously issued registrations. - Limit scheduler slice duration to the sysctl_sched_latency period, to improve scheduling granularity with a large number of SCHED_IDLE tasks. - Debuggability enhancement on sys_exit(): warn about disabled IRQs, but also enable them to prevent a cascade of followup problems and repeat warnings. - Fix the rescheduling logic in prio_changed_dl(). - Micro-optimize cpufreq and sched-util methods. - Micro-optimize ttwu_runnable() - Micro-optimize the idle-scanning in update_numa_stats(), select_idle_capacity() and steal_cookie_task(). 
- Update the RSEQ code & self-tests - Constify various scheduler methods - Remove unused methods - Refine __init tags - Documentation updates - Misc other cleanups, fixes * tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits) sched/rt: pick_next_rt_entity(): check list_entry sched/deadline: Add more reschedule cases to prio_changed_dl() sched/fair: sanitize vruntime of entity being placed sched/fair: Remove capacity inversion detection sched/fair: unlink misfit task from cpu overutilized objtool: mem*() are not uaccess safe cpuidle: Fix poll_idle() noinstr annotation sched/clock: Make local_clock() noinstr sched/clock/x86: Mark sched_clock() noinstr x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read() x86/atomics: Always inline arch_atomic64*() cpuidle: tracing, preempt: Squash _rcuidle tracing cpuidle: tracing: Warn about !rcu_is_watching() cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG cpuidle: drivers: firmware: psci: Dont instrument suspend code KVM: selftests: Fix build of rseq test exit: Detect and fix irq disabled state in oops cpuidle, arm64: Fix the ARM64 cpuidle logic cpuidle: mvebu: Fix duplicate flags assignment sched/fair: Limit sched slice duration ...
This commit is contained in:
commit
1f2d9ffc7a
@ -619,6 +619,8 @@ process migrations.
|
||||
and is an example of this type.
|
||||
|
||||
|
||||
.. _cgroupv2-limits-distributor:
|
||||
|
||||
Limits
|
||||
------
|
||||
|
||||
@ -635,6 +637,7 @@ process migrations.
|
||||
"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
|
||||
on an IO device and is an example of this type.
|
||||
|
||||
.. _cgroupv2-protections-distributor:
|
||||
|
||||
Protections
|
||||
-----------
|
||||
|
@ -15,6 +15,7 @@ Linux Scheduler
|
||||
sched-capacity
|
||||
sched-energy
|
||||
schedutil
|
||||
sched-util-clamp
|
||||
sched-nice-design
|
||||
sched-rt-group
|
||||
sched-stats
|
||||
|
741
Documentation/scheduler/sched-util-clamp.rst
Normal file
741
Documentation/scheduler/sched-util-clamp.rst
Normal file
@ -0,0 +1,741 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
Utilization Clamping
|
||||
====================
|
||||
|
||||
1. Introduction
|
||||
===============
|
||||
|
||||
Utilization clamping, also known as util clamp or uclamp, is a scheduler
|
||||
feature that allows user space to help in managing the performance requirement
|
||||
of tasks. It was introduced in v5.3 release. The CGroup support was merged in
|
||||
v5.4.
|
||||
|
||||
Uclamp is a hinting mechanism that allows the scheduler to understand the
|
||||
performance requirements and restrictions of the tasks, thus it helps the
|
||||
scheduler to make a better decision. And when schedutil cpufreq governor is
|
||||
used, util clamp will influence the CPU frequency selection as well.
|
||||
|
||||
Since the scheduler and schedutil are both driven by PELT (util_avg) signals,
|
||||
util clamp acts on that to achieve its goal by clamping the signal to a certain
|
||||
point; hence the name. That is, by clamping utilization we are making the
|
||||
system run at a certain performance point.
|
||||
|
||||
The right way to view util clamp is as a mechanism to make requests or hints on
|
||||
performance constraints. It consists of two tunables:
|
||||
|
||||
* UCLAMP_MIN, which sets the lower bound.
|
||||
* UCLAMP_MAX, which sets the upper bound.
|
||||
|
||||
These two bounds will ensure a task will operate within this performance range
|
||||
of the system. UCLAMP_MIN implies boosting a task, while UCLAMP_MAX implies
|
||||
capping a task.
|
||||
|
||||
One can tell the system (scheduler) that some tasks require a minimum
|
||||
performance point to operate at to deliver the desired user experience. Or one
|
||||
can tell the system that some tasks should be restricted from consuming too
|
||||
much resources and should not go above a specific performance point. Viewing
|
||||
the uclamp values as performance points rather than utilization is a better
|
||||
abstraction from user space point of view.
|
||||
|
||||
As an example, a game can use util clamp to form a feedback loop with its
|
||||
perceived Frames Per Second (FPS). It can dynamically increase the minimum
|
||||
performance point required by its display pipeline to ensure no frame is
|
||||
dropped. It can also dynamically 'prime' up these tasks if it knows in the
|
||||
coming few hundred milliseconds a computationally intensive scene is about to
|
||||
happen.
|
||||
|
||||
On mobile hardware where the capability of the devices varies a lot, this
|
||||
dynamic feedback loop offers a great flexibility to ensure best user experience
|
||||
given the capabilities of any system.
|
||||
|
||||
Of course a static configuration is possible too. The exact usage will depend
|
||||
on the system, application and the desired outcome.
|
||||
|
||||
Another example is in Android where tasks are classified as background,
|
||||
foreground, top-app, etc. Util clamp can be used to constrain how much
|
||||
resources background tasks are consuming by capping the performance point they
|
||||
can run at. This constraint helps reserve resources for important tasks, like
|
||||
the ones belonging to the currently active app (top-app group). Besides, this
|
||||
helps in limiting how much power they consume. This can be more obvious in
|
||||
heterogeneous systems (e.g. Arm big.LITTLE); the constraint will help bias the
|
||||
background tasks to stay on the little cores which will ensure that:
|
||||
|
||||
1. The big cores are free to run top-app tasks immediately. top-app
|
||||
tasks are the tasks the user is currently interacting with, hence
|
||||
the most important tasks in the system.
|
||||
2. They don't run on a power hungry core and drain battery even if they
|
||||
are CPU intensive tasks.
|
||||
|
||||
.. note::
|
||||
**little cores**:
|
||||
CPUs with capacity < 1024
|
||||
|
||||
**big cores**:
|
||||
CPUs with capacity = 1024
|
||||
|
||||
By making these uclamp performance requests, or rather hints, user space can
|
||||
ensure system resources are used optimally to deliver the best possible user
|
||||
experience.
|
||||
|
||||
Another use case is to help with **overcoming the ramp up latency inherent in
|
||||
how scheduler utilization signal is calculated**.
|
||||
|
||||
On the other hand, a busy task for instance that requires to run at maximum
|
||||
performance point will suffer a delay of ~200ms (PELT HALFLIFE = 32ms) for the
|
||||
scheduler to realize that. This is known to affect workloads like gaming on
|
||||
mobile devices where frames will drop due to slow response time to select the
|
||||
higher frequency required for the tasks to finish their work in time. Setting
|
||||
UCLAMP_MIN=1024 will ensure such tasks will always see the highest performance
|
||||
level when they start running.
|
||||
|
||||
The overall visible effect goes beyond better perceived user
|
||||
experience/performance and stretches to help achieve a better overall
|
||||
performance/watt if used effectively.
|
||||
|
||||
User space can form a feedback loop with the thermal subsystem too to ensure
|
||||
the device doesn't heat up to the point where it will throttle.
|
||||
|
||||
Both SCHED_NORMAL/OTHER and SCHED_FIFO/RR honour uclamp requests/hints.
|
||||
|
||||
In the SCHED_FIFO/RR case, uclamp gives the option to run RT tasks at any
|
||||
performance point rather than being tied to MAX frequency all the time. Which
|
||||
can be useful on general purpose systems that run on battery powered devices.
|
||||
|
||||
Note that by design RT tasks don't have per-task PELT signal and must always
|
||||
run at a constant frequency to combat nondeterministic DVFS rampup delays.
|
||||
|
||||
Note that using schedutil always implies a single delay to modify the frequency
|
||||
when an RT task wakes up. This cost is unchanged by using uclamp. Uclamp only
|
||||
helps picking what frequency to request instead of schedutil always requesting
|
||||
MAX for all RT tasks.
|
||||
|
||||
See :ref:`section 3.4 <uclamp-default-values>` for default values and
|
||||
:ref:`3.4.1 <sched-util-clamp-min-rt-default>` on how to change RT tasks
|
||||
default value.
|
||||
|
||||
2. Design
|
||||
=========
|
||||
|
||||
Util clamp is a property of every task in the system. It sets the boundaries of
|
||||
its utilization signal; acting as a bias mechanism that influences certain
|
||||
decisions within the scheduler.
|
||||
|
||||
The actual utilization signal of a task is never clamped in reality. If you
|
||||
inspect PELT signals at any point of time you should continue to see them as
|
||||
they are intact. Clamping happens only when needed, e.g: when a task wakes up
|
||||
and the scheduler needs to select a suitable CPU for it to run on.
|
||||
|
||||
Since the goal of util clamp is to allow requesting a minimum and maximum
|
||||
performance point for a task to run on, it must be able to influence the
|
||||
frequency selection as well as task placement to be most effective. Both of
|
||||
which have implications on the utilization value at CPU runqueue (rq for short)
|
||||
level, which brings us to the main design challenge.
|
||||
|
||||
When a task wakes up on an rq, the utilization signal of the rq will be
|
||||
affected by the uclamp settings of all the tasks enqueued on it. For example if
|
||||
a task requests to run at UTIL_MIN = 512, then the util signal of the rq needs
|
||||
to respect this request as well as all other requests from all of the
|
||||
enqueued tasks.
|
||||
|
||||
To be able to aggregate the util clamp value of all the tasks attached to the
|
||||
rq, uclamp must do some housekeeping at every enqueue/dequeue, which is the
|
||||
scheduler hot path. Hence care must be taken since any slow down will have
|
||||
significant impact on a lot of use cases and could hinder its usability in
|
||||
practice.
|
||||
|
||||
The way this is handled is by dividing the utilization range into buckets
|
||||
(struct uclamp_bucket) which allows us to reduce the search space from every
|
||||
task on the rq to only a subset of tasks on the top-most bucket.
|
||||
|
||||
When a task is enqueued, the counter in the matching bucket is incremented,
|
||||
and on dequeue it is decremented. This makes keeping track of the effective
|
||||
uclamp value at rq level a lot easier.
|
||||
|
||||
As tasks are enqueued and dequeued, we keep track of the current effective
|
||||
uclamp value of the rq. See :ref:`section 2.1 <uclamp-buckets>` for details on
|
||||
how this works.
|
||||
|
||||
Later at any path that wants to identify the effective uclamp value of the rq,
|
||||
it will simply need to read this effective uclamp value of the rq at that exact
|
||||
moment of time it needs to take a decision.
|
||||
|
||||
For task placement case, only Energy Aware and Capacity Aware Scheduling
|
||||
(EAS/CAS) make use of uclamp for now, which implies that it is applied on
|
||||
heterogeneous systems only.
|
||||
When a task wakes up, the scheduler will look at the current effective uclamp
|
||||
value of every rq and compare it with the potential new value if the task were
|
||||
to be enqueued there. Favoring the rq that will end up with the most energy
|
||||
efficient combination.
|
||||
|
||||
Similarly in schedutil, when it needs to make a frequency update it will look
|
||||
at the current effective uclamp value of the rq which is influenced by the set
|
||||
of tasks currently enqueued there and select the appropriate frequency that
|
||||
will satisfy constraints from requests.
|
||||
|
||||
Other paths like setting overutilization state (which effectively disables EAS)
|
||||
make use of uclamp as well. Such cases are considered necessary housekeeping to
|
||||
allow the 2 main use cases above and will not be covered in detail here as they
|
||||
could change with implementation details.
|
||||
|
||||
.. _uclamp-buckets:
|
||||
|
||||
2.1. Buckets
|
||||
------------
|
||||
|
||||
::
|
||||
|
||||
[struct rq]
|
||||
|
||||
(bottom) (top)
|
||||
|
||||
0 1024
|
||||
| |
|
||||
+-----------+-----------+-----------+---- ----+-----------+
|
||||
| Bucket 0 | Bucket 1 | Bucket 2 | ... | Bucket N |
|
||||
+-----------+-----------+-----------+---- ----+-----------+
|
||||
: : :
|
||||
+- p0 +- p3 +- p4
|
||||
: :
|
||||
+- p1 +- p5
|
||||
:
|
||||
+- p2
|
||||
|
||||
|
||||
.. note::
|
||||
The diagram above is an illustration rather than a true depiction of the
|
||||
internal data structure.
|
||||
|
||||
To reduce the search space when trying to decide the effective uclamp value of
|
||||
an rq as tasks are enqueued/dequeued, the whole utilization range is divided
|
||||
into N buckets where N is configured at compile time by setting
|
||||
CONFIG_UCLAMP_BUCKETS_COUNT. By default it is set to 5.
|
||||
|
||||
The rq has a bucket for each uclamp_id tunable: [UCLAMP_MIN, UCLAMP_MAX].
|
||||
|
||||
The range of each bucket is 1024/N. For example, for the default value of
|
||||
5 there will be 5 buckets, each of which will cover the following range:
|
||||
|
||||
::
|
||||
|
||||
DELTA = round_closest(1024/5) = round_closest(204.8) = 205
|
||||
|
||||
Bucket 0: [0:204]
|
||||
Bucket 1: [205:409]
|
||||
Bucket 2: [410:614]
|
||||
Bucket 3: [615:819]
|
||||
Bucket 4: [820:1024]
|
||||
|
||||
When a task p with following tunable parameters
|
||||
|
||||
::
|
||||
|
||||
p->uclamp[UCLAMP_MIN] = 300
|
||||
p->uclamp[UCLAMP_MAX] = 1024
|
||||
|
||||
is enqueued into the rq, bucket 1 will be incremented for UCLAMP_MIN and bucket
|
||||
4 will be incremented for UCLAMP_MAX to reflect the fact the rq has a task in
|
||||
this range.
|
||||
|
||||
The rq then keeps track of its current effective uclamp value for each
|
||||
uclamp_id.
|
||||
|
||||
When a task p is enqueued, the rq value changes to:
|
||||
|
||||
::
|
||||
|
||||
// update bucket logic goes here
|
||||
rq->uclamp[UCLAMP_MIN] = max(rq->uclamp[UCLAMP_MIN], p->uclamp[UCLAMP_MIN])
|
||||
// repeat for UCLAMP_MAX
|
||||
|
||||
Similarly, when p is dequeued the rq value changes to:
|
||||
|
||||
::
|
||||
|
||||
// update bucket logic goes here
|
||||
rq->uclamp[UCLAMP_MIN] = search_top_bucket_for_highest_value()
|
||||
// repeat for UCLAMP_MAX
|
||||
|
||||
When all buckets are empty, the rq uclamp values are reset to system defaults.
|
||||
See :ref:`section 3.4 <uclamp-default-values>` for details on default values.
|
||||
|
||||
|
||||
2.2. Max aggregation
|
||||
--------------------
|
||||
|
||||
Util clamp is tuned to honour the request for the task that requires the
|
||||
highest performance point.
|
||||
|
||||
When multiple tasks are attached to the same rq, then util clamp must make sure
|
||||
the task that needs the highest performance point gets it even if there's
|
||||
another task that doesn't need it or is disallowed from reaching this point.
|
||||
|
||||
For example, if there are multiple tasks attached to an rq with the following
|
||||
values:
|
||||
|
||||
::
|
||||
|
||||
p0->uclamp[UCLAMP_MIN] = 300
|
||||
p0->uclamp[UCLAMP_MAX] = 900
|
||||
|
||||
p1->uclamp[UCLAMP_MIN] = 500
|
||||
p1->uclamp[UCLAMP_MAX] = 500
|
||||
|
||||
then assuming both p0 and p1 are enqueued to the same rq, both UCLAMP_MIN
|
||||
and UCLAMP_MAX become:
|
||||
|
||||
::
|
||||
|
||||
rq->uclamp[UCLAMP_MIN] = max(300, 500) = 500
|
||||
rq->uclamp[UCLAMP_MAX] = max(900, 500) = 900
|
||||
|
||||
As we shall see in :ref:`section 5.1 <uclamp-capping-fail>`, this max
|
||||
aggregation is the cause of one of the limitations when using util clamp, in
|
||||
particular for UCLAMP_MAX hint when user space would like to save power.
|
||||
|
||||
2.3. Hierarchical aggregation
|
||||
-----------------------------
|
||||
|
||||
As stated earlier, util clamp is a property of every task in the system. But
|
||||
the actual applied (effective) value can be influenced by more than just the
|
||||
request made by the task or another actor on its behalf (middleware library).
|
||||
|
||||
The effective util clamp value of any task is restricted as follows:
|
||||
|
||||
1. By the uclamp settings defined by the cgroup CPU controller it is attached
|
||||
to, if any.
|
||||
2. The restricted value in (1) is then further restricted by the system wide
|
||||
uclamp settings.
|
||||
|
||||
:ref:`Section 3 <uclamp-interfaces>` discusses the interfaces and will expand
|
||||
further on that.
|
||||
|
||||
For now suffice to say that if a task makes a request, its actual effective
|
||||
value will have to adhere to some restrictions imposed by cgroup and system
|
||||
wide settings.
|
||||
|
||||
The system will still accept the request even if effectively it will be beyond the
|
||||
constraints, but as soon as the task moves to a different cgroup or a sysadmin
|
||||
modifies the system settings, the request will be satisfied only if it is
|
||||
within new constraints.
|
||||
|
||||
In other words, this aggregation will not cause an error when a task changes
|
||||
its uclamp values, but rather the system may not be able to satisfy requests
|
||||
based on those factors.
|
||||
|
||||
2.4. Range
|
||||
----------
|
||||
|
||||
Uclamp performance request has the range of 0 to 1024 inclusive.
|
||||
|
||||
For cgroup interface percentage is used (that is 0 to 100 inclusive).
|
||||
Just like other cgroup interfaces, you can use 'max' instead of 100.
|
||||
|
||||
.. _uclamp-interfaces:
|
||||
|
||||
3. Interfaces
|
||||
=============
|
||||
|
||||
3.1. Per task interface
|
||||
-----------------------
|
||||
|
||||
sched_setattr() syscall was extended to accept two new fields:
|
||||
|
||||
* sched_util_min: requests the minimum performance point the system should run
|
||||
at when this task is running. Or lower performance bound.
|
||||
* sched_util_max: requests the maximum performance point the system should run
|
||||
at when this task is running. Or upper performance bound.
|
||||
|
||||
For example, the following scenario has 40% to 80% utilization constraints:
|
||||
|
||||
::
|
||||
|
||||
attr->sched_util_min = 40% * 1024;
|
||||
attr->sched_util_max = 80% * 1024;
|
||||
|
||||
When task @p is running, **the scheduler should try its best to ensure it
|
||||
starts at 40% performance level**. If the task runs for a long enough time so
|
||||
that its actual utilization goes above 80%, the utilization, or performance
|
||||
level, will be capped.
|
||||
|
||||
The special value -1 is used to reset the uclamp settings to the system
|
||||
default.
|
||||
|
||||
Note that resetting the uclamp value to system default using -1 is not the same
|
||||
as manually setting uclamp value to system default. This distinction is
|
||||
important because as we shall see in system interfaces, the default value for
|
||||
RT could be changed. SCHED_NORMAL/OTHER might gain similar knobs too in the
|
||||
future.
|
||||
|
||||
3.2. cgroup interface
|
||||
---------------------
|
||||
|
||||
There are two uclamp related values in the CPU cgroup controller:
|
||||
|
||||
* cpu.uclamp.min
|
||||
* cpu.uclamp.max
|
||||
|
||||
When a task is attached to a CPU controller, its uclamp values will be impacted
|
||||
as follows:
|
||||
|
||||
* cpu.uclamp.min is a protection as described in :ref:`section 3-3 of cgroup
|
||||
v2 documentation <cgroupv2-protections-distributor>`.
|
||||
|
||||
If a task uclamp_min value is lower than cpu.uclamp.min, then the task will
|
||||
inherit the cgroup cpu.uclamp.min value.
|
||||
|
||||
In a cgroup hierarchy, effective cpu.uclamp.min is the max of (child,
|
||||
parent).
|
||||
|
||||
* cpu.uclamp.max is a limit as described in :ref:`section 3-2 of cgroup v2
|
||||
documentation <cgroupv2-limits-distributor>`.
|
||||
|
||||
If a task uclamp_max value is higher than cpu.uclamp.max, then the task will
|
||||
inherit the cgroup cpu.uclamp.max value.
|
||||
|
||||
In a cgroup hierarchy, effective cpu.uclamp.max is the min of (child,
|
||||
parent).
|
||||
|
||||
For example, given following parameters:
|
||||
|
||||
::
|
||||
|
||||
p0->uclamp[UCLAMP_MIN] = // system default;
|
||||
p0->uclamp[UCLAMP_MAX] = // system default;
|
||||
|
||||
p1->uclamp[UCLAMP_MIN] = 40% * 1024;
|
||||
p1->uclamp[UCLAMP_MAX] = 50% * 1024;
|
||||
|
||||
cgroup0->cpu.uclamp.min = 20% * 1024;
|
||||
cgroup0->cpu.uclamp.max = 60% * 1024;
|
||||
|
||||
cgroup1->cpu.uclamp.min = 60% * 1024;
|
||||
cgroup1->cpu.uclamp.max = 100% * 1024;
|
||||
|
||||
when p0 and p1 are attached to cgroup0, the values become:
|
||||
|
||||
::
|
||||
|
||||
p0->uclamp[UCLAMP_MIN] = cgroup0->cpu.uclamp.min = 20% * 1024;
|
||||
p0->uclamp[UCLAMP_MAX] = cgroup0->cpu.uclamp.max = 60% * 1024;
|
||||
|
||||
p1->uclamp[UCLAMP_MIN] = 40% * 1024; // intact
|
||||
p1->uclamp[UCLAMP_MAX] = 50% * 1024; // intact
|
||||
|
||||
when p0 and p1 are attached to cgroup1, these instead become:
|
||||
|
||||
::
|
||||
|
||||
p0->uclamp[UCLAMP_MIN] = cgroup1->cpu.uclamp.min = 60% * 1024;
|
||||
p0->uclamp[UCLAMP_MAX] = cgroup1->cpu.uclamp.max = 100% * 1024;
|
||||
|
||||
p1->uclamp[UCLAMP_MIN] = cgroup1->cpu.uclamp.min = 60% * 1024;
|
||||
p1->uclamp[UCLAMP_MAX] = 50% * 1024; // intact
|
||||
|
||||
Note that cgroup interfaces allow cpu.uclamp.max value to be lower than
|
||||
cpu.uclamp.min. Other interfaces don't allow that.
|
||||
|
||||
3.3. System interface
|
||||
---------------------
|
||||
|
||||
3.3.1 sched_util_clamp_min
|
||||
--------------------------
|
||||
|
||||
System wide limit of allowed UCLAMP_MIN range. By default it is set to 1024,
|
||||
which means that permitted effective UCLAMP_MIN range for tasks is [0:1024].
|
||||
By changing it to 512 for example the range reduces to [0:512]. This is useful
|
||||
to restrict how much boosting tasks are allowed to acquire.
|
||||
|
||||
Requests from tasks to go above this knob value will still succeed, but
|
||||
they won't be satisfied until it is more than p->uclamp[UCLAMP_MIN].
|
||||
|
||||
The value must be smaller than or equal to sched_util_clamp_max.
|
||||
|
||||
3.3.2 sched_util_clamp_max
|
||||
--------------------------
|
||||
|
||||
System wide limit of allowed UCLAMP_MAX range. By default it is set to 1024,
|
||||
which means that permitted effective UCLAMP_MAX range for tasks is [0:1024].
|
||||
|
||||
By changing it to 512 for example the effective allowed range reduces to
|
||||
[0:512]. This means that no task can run above 512, which implies that all
|
||||
rqs are restricted too. IOW, the whole system is capped to half its performance
|
||||
capacity.
|
||||
|
||||
This is useful to restrict the overall maximum performance point of the system.
|
||||
For example, it can be handy to limit performance when running low on battery
|
||||
or when the system wants to limit access to more energy hungry performance
|
||||
levels when it's in idle state or screen is off.
|
||||
|
||||
Requests from tasks to go above this knob value will still succeed, but they
|
||||
won't be satisfied until it is more than p->uclamp[UCLAMP_MAX].
|
||||
|
||||
The value must be greater than or equal to sched_util_clamp_min.
|
||||
|
||||
.. _uclamp-default-values:
|
||||
|
||||
3.4. Default values
|
||||
-------------------
|
||||
|
||||
By default all SCHED_NORMAL/SCHED_OTHER tasks are initialized to:
|
||||
|
||||
::
|
||||
|
||||
p_fair->uclamp[UCLAMP_MIN] = 0
|
||||
p_fair->uclamp[UCLAMP_MAX] = 1024
|
||||
|
||||
That is, by default they are neither boosted nor capped. Unlike for RT tasks,
|
||||
these defaults cannot currently be changed at boot or runtime. No argument was
|
||||
made yet as to why we should provide this, but it can be added in the future.
|
||||
|
||||
For SCHED_FIFO/SCHED_RR tasks:
|
||||
|
||||
::
|
||||
|
||||
p_rt->uclamp[UCLAMP_MIN] = 1024
|
||||
p_rt->uclamp[UCLAMP_MAX] = 1024
|
||||
|
||||
That is by default they're boosted to run at the maximum performance point of
|
||||
the system which retains the historical behavior of the RT tasks.
|
||||
|
||||
RT tasks default uclamp_min value can be modified at boot or runtime via
|
||||
sysctl. See below section.
|
||||
|
||||
.. _sched-util-clamp-min-rt-default:
|
||||
|
||||
3.4.1 sched_util_clamp_min_rt_default
|
||||
-------------------------------------
|
||||
|
||||
Running RT tasks at maximum performance point is expensive on battery powered
|
||||
devices and not necessary. To allow system developers to offer good performance
|
||||
guarantees for these tasks without pushing it all the way to maximum
|
||||
performance point, this sysctl knob allows tuning the best boost value to
|
||||
address the system requirement without burning power running at maximum
|
||||
performance point all the time.
|
||||
|
||||
Application developers are encouraged to use the per task util clamp interface
|
||||
to ensure they are performance and power aware. Ideally this knob should be set
|
||||
to 0 by system designers and leave the task of managing performance
|
||||
requirements to the apps.
|
||||
|
||||
4. How to use util clamp
|
||||
========================
|
||||
|
||||
Util clamp promotes the concept of user space assisted power and performance
|
||||
management. On its own, the scheduler lacks the info required to make the best
|
||||
decision. However, with util clamp user space can hint to the scheduler to make
|
||||
better decision about task placement and frequency selection.
|
||||
|
||||
Best results are achieved by not making any assumptions about the system the
|
||||
application is running on and to use it in conjunction with a feedback loop to
|
||||
dynamically monitor and adjust. Ultimately this will allow for a better user
|
||||
experience at a better perf/watt.
|
||||
|
||||
For some systems and use cases, static setup will help to achieve good results.
|
||||
Portability will be a problem in this case. How much work one can do at 100,
|
||||
200 or 1024 is different for each system. Unless there's a specific target
|
||||
system, static setup should be avoided.
|
||||
|
||||
There are enough possibilities to create a whole framework based on util clamp
|
||||
or self contained app that makes use of it directly.
|
||||
|
||||
4.1. Boost important and DVFS-latency-sensitive tasks
|
||||
-----------------------------------------------------
|
||||
|
||||
A GUI task might not be busy enough to warrant driving the frequency high when it
|
||||
wakes up. However, it requires to finish its work within a specific time window
|
||||
to deliver the desired user experience. The right frequency it requires at
|
||||
wakeup will be system dependent. On some underpowered systems it will be high,
|
||||
on other overpowered ones it will be low or 0.
|
||||
|
||||
This task can increase its UCLAMP_MIN value every time it misses the deadline
|
||||
to ensure on next wake up it runs at a higher performance point. It should try
|
||||
to approach the lowest UCLAMP_MIN value that allows it to meet its deadline on any
|
||||
particular system to achieve the best possible perf/watt for that system.
|
||||
|
||||
On heterogeneous systems, it might be important for this task to run on
|
||||
a faster CPU.
|
||||
|
||||
**Generally it is advised to perceive the input as performance level or point
|
||||
which will imply both task placement and frequency selection**.
|
||||
|
||||
4.2. Cap background tasks
|
||||
-------------------------
|
||||
|
||||
As explained for the Android case in the introduction, any app can lower
|
||||
UCLAMP_MAX for some background tasks that don't care about performance but
|
||||
could end up being busy and consume unnecessary system resources on the system.
|
||||
|
||||
4.3. Powersave mode
|
||||
-------------------
|
||||
|
||||
sched_util_clamp_max system wide interface can be used to limit all tasks from
|
||||
operating at the higher performance points which are usually energy
|
||||
inefficient.
|
||||
|
||||
This is not unique to uclamp as one can achieve the same by reducing max
|
||||
frequency of the cpufreq governor. It can be considered a more convenient
|
||||
alternative interface.
|
||||
|
||||
4.4. Per-app performance restriction
|
||||
------------------------------------
|
||||
|
||||
Middleware/Utility can provide the user an option to set UCLAMP_MIN/MAX for an
|
||||
app every time it is executed to guarantee a minimum performance point and/or
|
||||
limit it from draining system power at the cost of reduced performance for
|
||||
these apps.
|
||||
|
||||
If you want to prevent your laptop from heating up while on the go from
|
||||
compiling the kernel and are happy to sacrifice performance to save power, but
|
||||
still would like to keep your browser performance intact, uclamp makes it
|
||||
possible.
|
||||
|
||||
5. Limitations
|
||||
==============
|
||||
|
||||
.. _uclamp-capping-fail:
|
||||
|
||||
5.1. Capping frequency with uclamp_max fails under certain conditions
|
||||
---------------------------------------------------------------------
|
||||
|
||||
If task p0 is capped to run at 512:
|
||||
|
||||
::
|
||||
|
||||
p0->uclamp[UCLAMP_MAX] = 512
|
||||
|
||||
and it shares the rq with p1 which is free to run at any performance point:
|
||||
|
||||
::
|
||||
|
||||
p1->uclamp[UCLAMP_MAX] = 1024
|
||||
|
||||
then due to max aggregation the rq will be allowed to reach max performance
|
||||
point:
|
||||
|
||||
::
|
||||
|
||||
rq->uclamp[UCLAMP_MAX] = max(512, 1024) = 1024
|
||||
|
||||
Assuming both p0 and p1 have UCLAMP_MIN = 0, then the frequency selection for
|
||||
the rq will depend on the actual utilization value of the tasks.
|
||||
|
||||
If p1 is a small task but p0 is a CPU intensive task, then due to the fact that
|
||||
both are running at the same rq, p1 will cause the frequency capping to be lifted
|
||||
from the rq although p1, which is allowed to run at any performance point,
|
||||
doesn't actually need to run at that frequency.
|
||||
|
||||
5.2. UCLAMP_MAX can break PELT (util_avg) signal
|
||||
------------------------------------------------
|
||||
|
||||
PELT assumes that frequency will always increase as the signals grow to ensure
|
||||
there's always some idle time on the CPU. But with UCLAMP_MAX, this frequency
|
||||
increase will be prevented which can lead to no idle time in some
|
||||
circumstances. When there's no idle time, a task will be stuck in a busy loop,
|
||||
which would result in util_avg being 1024.
|
||||
|
||||
Combined with the issue described below, this can lead to unwanted frequency spikes
|
||||
when severely capped tasks share the rq with a small non capped task.
|
||||
|
||||
As an example, if task p0, which has:
|
||||
|
||||
::
|
||||
|
||||
p0->util_avg = 300
|
||||
p0->uclamp[UCLAMP_MAX] = 0
|
||||
|
||||
wakes up on an idle CPU, then it will run at min frequency (Fmin) this
|
||||
CPU is capable of. The max CPU frequency (Fmax) matters here as well,
|
||||
since it designates the shortest computational time to finish the task's
|
||||
work on this CPU.
|
||||
|
||||
::
|
||||
|
||||
rq->uclamp[UCLAMP_MAX] = 0
|
||||
|
||||
If the ratio of Fmax/Fmin is 3, then the maximum value will be:
|
||||
|
||||
::
|
||||
|
||||
300 * (Fmax/Fmin) = 900
|
||||
|
||||
which indicates the CPU will still see idle time since 900 is < 1024. The
|
||||
_actual_ util_avg will not be 900 though, but somewhere between 300 and 900. As
|
||||
long as there's idle time, p->util_avg updates will be off by some margin,
|
||||
but not proportional to Fmax/Fmin.
|
||||
|
||||
::
|
||||
|
||||
p0->util_avg = 300 + small_error
|
||||
|
||||
Now if the ratio of Fmax/Fmin is 4, the maximum value becomes:
|
||||
|
||||
::
|
||||
|
||||
300 * (Fmax/Fmin) = 1200
|
||||
|
||||
which is higher than 1024 and indicates that the CPU has no idle time. When
|
||||
this happens, then the _actual_ util_avg will become:
|
||||
|
||||
::
|
||||
|
||||
p0->util_avg = 1024
|
||||
|
||||
If task p1 wakes up on this CPU, which has:
|
||||
|
||||
::
|
||||
|
||||
p1->util_avg = 200
|
||||
p1->uclamp[UCLAMP_MAX] = 1024
|
||||
|
||||
then the effective UCLAMP_MAX for the CPU will be 1024 according to max
|
||||
aggregation rule. But since the capped p0 task was running and throttled
|
||||
severely, then the rq->util_avg will be:
|
||||
|
||||
::
|
||||
|
||||
p0->util_avg = 1024
|
||||
p1->util_avg = 200
|
||||
|
||||
rq->util_avg = 1024
|
||||
rq->uclamp[UCLAMP_MAX] = 1024
|
||||
|
||||
Hence this leads to a frequency spike since if p0 wasn't throttled we should get:
|
||||
|
||||
::
|
||||
|
||||
p0->util_avg = 300
|
||||
p1->util_avg = 200
|
||||
|
||||
rq->util_avg = 500
|
||||
|
||||
and run somewhere near mid performance point of that CPU, not the Fmax we get.
|
||||
|
||||
5.3. Schedutil response time issues
|
||||
-----------------------------------
|
||||
|
||||
schedutil has three limitations:
|
||||
|
||||
1. Hardware takes non-zero time to respond to any frequency change
|
||||
request. On some platforms this can be in the order of a few ms.
|
||||
2. Non fast-switch systems require a worker deadline thread to wake up
|
||||
and perform the frequency change, which adds measurable overhead.
|
||||
3. schedutil rate_limit_us drops any requests during this rate_limit_us
|
||||
window.
|
||||
|
||||
If a relatively small task is doing a critical job and requires a certain
|
||||
performance point when it wakes up and starts running, then all these
|
||||
limitations will prevent it from getting what it wants in the time scale it
|
||||
expects.
|
||||
|
||||
This limitation is not only impactful when using uclamp, but will be more
|
||||
prevalent as we no longer gradually ramp up or down. We could easily be
|
||||
jumping between frequencies depending on the order tasks wake up, and their
|
||||
respective uclamp values.
|
||||
|
||||
We regard that as a limitation of the capabilities of the underlying system
|
||||
itself.
|
||||
|
||||
There is room to improve the behavior of schedutil rate_limit_us, but not much
|
||||
to be done for 1 or 2. They are considered hard limitations of the system.
|
@ -57,7 +57,6 @@ EXPORT_SYMBOL(pm_power_off);
|
||||
void arch_cpu_idle(void)
|
||||
{
|
||||
wtint(0);
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
void arch_cpu_idle_dead(void)
|
||||
|
@ -27,7 +27,6 @@ SECTIONS
|
||||
HEAD_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
*(.fixup)
|
||||
*(.gnu.warning)
|
||||
|
@ -114,6 +114,8 @@ void arch_cpu_idle(void)
|
||||
"sleep %0 \n"
|
||||
:
|
||||
:"I"(arg)); /* can't be "r" has to be embedded const */
|
||||
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
#else /* ARC700 */
|
||||
@ -122,6 +124,7 @@ void arch_cpu_idle(void)
|
||||
{
|
||||
/* sleep, but enable both set E1/E2 (levels of interrupts) before committing */
|
||||
__asm__ __volatile__("sleep 0x3 \n");
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -85,7 +85,6 @@ SECTIONS
|
||||
_stext = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -96,7 +96,6 @@
|
||||
SOFTIRQENTRY_TEXT \
|
||||
TEXT_TEXT \
|
||||
SCHED_TEXT \
|
||||
CPUIDLE_TEXT \
|
||||
LOCK_TEXT \
|
||||
KPROBES_TEXT \
|
||||
ARM_STUBS_TEXT \
|
||||
|
@ -26,8 +26,8 @@ static struct cpuidle_ops cpuidle_ops[NR_CPUS] __ro_after_init;
|
||||
*
|
||||
* Returns the index passed as parameter
|
||||
*/
|
||||
int arm_cpuidle_simple_enter(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
__cpuidle int arm_cpuidle_simple_enter(struct cpuidle_device *dev, struct
|
||||
cpuidle_driver *drv, int index)
|
||||
{
|
||||
cpu_do_idle();
|
||||
|
||||
|
@ -78,7 +78,6 @@ void arch_cpu_idle(void)
|
||||
arm_pm_idle();
|
||||
else
|
||||
cpu_do_idle();
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
void arch_cpu_idle_prepare(void)
|
||||
|
@ -638,7 +638,7 @@ static void do_handle_IPI(int ipinr)
|
||||
unsigned int cpu = smp_processor_id();
|
||||
|
||||
if ((unsigned)ipinr < NR_IPI)
|
||||
trace_ipi_entry_rcuidle(ipi_types[ipinr]);
|
||||
trace_ipi_entry(ipi_types[ipinr]);
|
||||
|
||||
switch (ipinr) {
|
||||
case IPI_WAKEUP:
|
||||
@ -685,7 +685,7 @@ static void do_handle_IPI(int ipinr)
|
||||
}
|
||||
|
||||
if ((unsigned)ipinr < NR_IPI)
|
||||
trace_ipi_exit_rcuidle(ipi_types[ipinr]);
|
||||
trace_ipi_exit(ipi_types[ipinr]);
|
||||
}
|
||||
|
||||
/* Legacy version, should go away once all irqchips have been converted */
|
||||
@ -708,7 +708,7 @@ static irqreturn_t ipi_handler(int irq, void *data)
|
||||
|
||||
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
|
||||
{
|
||||
trace_ipi_raise_rcuidle(target, ipi_types[ipinr]);
|
||||
trace_ipi_raise(target, ipi_types[ipinr]);
|
||||
__ipi_send_mask(ipi_desc[ipinr], target);
|
||||
}
|
||||
|
||||
|
@ -44,8 +44,8 @@ static void davinci_save_ddr_power(int enter, bool pdown)
|
||||
}
|
||||
|
||||
/* Actual code that puts the SoC in different idle states */
|
||||
static int davinci_enter_idle(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
static __cpuidle int davinci_enter_idle(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
davinci_save_ddr_power(1, ddr2_pdown);
|
||||
cpu_do_idle();
|
||||
|
@ -42,8 +42,9 @@ static void gemini_idle(void)
|
||||
*/
|
||||
|
||||
/* FIXME: Enabling interrupts here is racy! */
|
||||
local_irq_enable();
|
||||
raw_local_irq_enable();
|
||||
cpu_do_idle();
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
static void __init gemini_init_machine(void)
|
||||
|
@ -8,8 +8,8 @@
|
||||
#include <asm/system_misc.h>
|
||||
#include "cpuidle.h"
|
||||
|
||||
static int imx5_cpuidle_enter(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
static __cpuidle int imx5_cpuidle_enter(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
arm_pm_idle();
|
||||
return index;
|
||||
|
@ -17,17 +17,17 @@
|
||||
static int num_idle_cpus = 0;
|
||||
static DEFINE_RAW_SPINLOCK(cpuidle_lock);
|
||||
|
||||
static int imx6q_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
static __cpuidle int imx6q_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
raw_spin_lock(&cpuidle_lock);
|
||||
if (++num_idle_cpus == num_online_cpus())
|
||||
imx6_set_lpm(WAIT_UNCLOCKED);
|
||||
raw_spin_unlock(&cpuidle_lock);
|
||||
|
||||
ct_idle_enter();
|
||||
ct_cpuidle_enter();
|
||||
cpu_do_idle();
|
||||
ct_idle_exit();
|
||||
ct_cpuidle_exit();
|
||||
|
||||
raw_spin_lock(&cpuidle_lock);
|
||||
if (num_idle_cpus-- == num_online_cpus())
|
||||
|
@ -11,8 +11,8 @@
|
||||
#include "common.h"
|
||||
#include "cpuidle.h"
|
||||
|
||||
static int imx6sl_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
static __cpuidle int imx6sl_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
imx6_set_lpm(WAIT_UNCLOCKED);
|
||||
/*
|
||||
|
@ -30,8 +30,8 @@ static int imx6sx_idle_finish(unsigned long val)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int imx6sx_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
static __cpuidle int imx6sx_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
imx6_set_lpm(WAIT_UNCLOCKED);
|
||||
|
||||
@ -47,7 +47,9 @@ static int imx6sx_enter_wait(struct cpuidle_device *dev,
|
||||
cpu_pm_enter();
|
||||
cpu_cluster_pm_enter();
|
||||
|
||||
ct_cpuidle_enter();
|
||||
cpu_suspend(0, imx6sx_idle_finish);
|
||||
ct_cpuidle_exit();
|
||||
|
||||
cpu_cluster_pm_exit();
|
||||
cpu_pm_exit();
|
||||
@ -87,7 +89,8 @@ static struct cpuidle_driver imx6sx_cpuidle_driver = {
|
||||
*/
|
||||
.exit_latency = 300,
|
||||
.target_residency = 500,
|
||||
.flags = CPUIDLE_FLAG_TIMER_STOP,
|
||||
.flags = CPUIDLE_FLAG_TIMER_STOP |
|
||||
CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = imx6sx_enter_wait,
|
||||
.name = "LOW-POWER-IDLE",
|
||||
.desc = "ARM power off",
|
||||
|
@ -12,8 +12,8 @@
|
||||
#include "common.h"
|
||||
#include "cpuidle.h"
|
||||
|
||||
static int imx7ulp_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
static __cpuidle int imx7ulp_enter_wait(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
if (index == 1)
|
||||
imx7ulp_set_lpm(ULP_PM_WAIT);
|
||||
|
@ -256,11 +256,13 @@ extern u32 omap4_get_cpu1_ns_pa_addr(void);
|
||||
|
||||
#if defined(CONFIG_SMP) && defined(CONFIG_PM)
|
||||
extern int omap4_mpuss_init(void);
|
||||
extern int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state);
|
||||
extern int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state,
|
||||
bool rcuidle);
|
||||
extern int omap4_hotplug_cpu(unsigned int cpu, unsigned int power_state);
|
||||
#else
|
||||
static inline int omap4_enter_lowpower(unsigned int cpu,
|
||||
unsigned int power_state)
|
||||
unsigned int power_state,
|
||||
bool rcuidle)
|
||||
{
|
||||
cpu_do_idle();
|
||||
return 0;
|
||||
|
@ -133,7 +133,7 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
|
||||
}
|
||||
|
||||
/* Execute ARM wfi */
|
||||
omap_sram_idle();
|
||||
omap_sram_idle(true);
|
||||
|
||||
/*
|
||||
* Call idle CPU PM enter notifier chain to restore
|
||||
@ -265,6 +265,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.owner = THIS_MODULE,
|
||||
.states = {
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 2 + 2,
|
||||
.target_residency = 5,
|
||||
@ -272,6 +273,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.desc = "MPU ON + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 10 + 10,
|
||||
.target_residency = 30,
|
||||
@ -279,6 +281,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.desc = "MPU ON + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 50 + 50,
|
||||
.target_residency = 300,
|
||||
@ -286,6 +289,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.desc = "MPU RET + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 1500 + 1800,
|
||||
.target_residency = 4000,
|
||||
@ -293,6 +297,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.desc = "MPU OFF + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 2500 + 7500,
|
||||
.target_residency = 12000,
|
||||
@ -300,6 +305,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.desc = "MPU RET + CORE RET",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 3000 + 8500,
|
||||
.target_residency = 15000,
|
||||
@ -307,6 +313,7 @@ static struct cpuidle_driver omap3_idle_driver = {
|
||||
.desc = "MPU OFF + CORE RET",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 10000 + 30000,
|
||||
.target_residency = 30000,
|
||||
@ -328,6 +335,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.owner = THIS_MODULE,
|
||||
.states = {
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 110 + 162,
|
||||
.target_residency = 5,
|
||||
@ -335,6 +343,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.desc = "MPU ON + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 106 + 180,
|
||||
.target_residency = 309,
|
||||
@ -342,6 +351,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.desc = "MPU ON + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 107 + 410,
|
||||
.target_residency = 46057,
|
||||
@ -349,6 +359,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.desc = "MPU RET + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 121 + 3374,
|
||||
.target_residency = 46057,
|
||||
@ -356,6 +367,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.desc = "MPU OFF + CORE ON",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 855 + 1146,
|
||||
.target_residency = 46057,
|
||||
@ -363,6 +375,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.desc = "MPU RET + CORE RET",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 7580 + 4134,
|
||||
.target_residency = 484329,
|
||||
@ -370,6 +383,7 @@ static struct cpuidle_driver omap3430_idle_driver = {
|
||||
.desc = "MPU OFF + CORE RET",
|
||||
},
|
||||
{
|
||||
.flags = CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap3_enter_idle_bm,
|
||||
.exit_latency = 7505 + 15274,
|
||||
.target_residency = 484329,
|
||||
|
@ -105,7 +105,7 @@ static int omap_enter_idle_smp(struct cpuidle_device *dev,
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&mpu_lock, flag);
|
||||
|
||||
omap4_enter_lowpower(dev->cpu, cx->cpu_state);
|
||||
omap4_enter_lowpower(dev->cpu, cx->cpu_state, true);
|
||||
|
||||
raw_spin_lock_irqsave(&mpu_lock, flag);
|
||||
if (cx->mpu_state_vote == num_online_cpus())
|
||||
@ -151,10 +151,10 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
|
||||
(cx->mpu_logic_state == PWRDM_POWER_OFF);
|
||||
|
||||
/* Enter broadcast mode for periodic timers */
|
||||
RCU_NONIDLE(tick_broadcast_enable());
|
||||
tick_broadcast_enable();
|
||||
|
||||
/* Enter broadcast mode for one-shot timers */
|
||||
RCU_NONIDLE(tick_broadcast_enter());
|
||||
tick_broadcast_enter();
|
||||
|
||||
/*
|
||||
* Call idle CPU PM enter notifier chain so that
|
||||
@ -166,7 +166,7 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
|
||||
|
||||
if (dev->cpu == 0) {
|
||||
pwrdm_set_logic_retst(mpu_pd, cx->mpu_logic_state);
|
||||
RCU_NONIDLE(omap_set_pwrdm_state(mpu_pd, cx->mpu_state));
|
||||
omap_set_pwrdm_state(mpu_pd, cx->mpu_state);
|
||||
|
||||
/*
|
||||
* Call idle CPU cluster PM enter notifier chain
|
||||
@ -178,13 +178,13 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
|
||||
index = 0;
|
||||
cx = state_ptr + index;
|
||||
pwrdm_set_logic_retst(mpu_pd, cx->mpu_logic_state);
|
||||
RCU_NONIDLE(omap_set_pwrdm_state(mpu_pd, cx->mpu_state));
|
||||
omap_set_pwrdm_state(mpu_pd, cx->mpu_state);
|
||||
mpuss_can_lose_context = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
omap4_enter_lowpower(dev->cpu, cx->cpu_state);
|
||||
omap4_enter_lowpower(dev->cpu, cx->cpu_state, true);
|
||||
cpu_done[dev->cpu] = true;
|
||||
|
||||
/* Wakeup CPU1 only if it is not offlined */
|
||||
@ -194,9 +194,9 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
|
||||
mpuss_can_lose_context)
|
||||
gic_dist_disable();
|
||||
|
||||
RCU_NONIDLE(clkdm_deny_idle(cpu_clkdm[1]));
|
||||
RCU_NONIDLE(omap_set_pwrdm_state(cpu_pd[1], PWRDM_POWER_ON));
|
||||
RCU_NONIDLE(clkdm_allow_idle(cpu_clkdm[1]));
|
||||
clkdm_deny_idle(cpu_clkdm[1]);
|
||||
omap_set_pwrdm_state(cpu_pd[1], PWRDM_POWER_ON);
|
||||
clkdm_allow_idle(cpu_clkdm[1]);
|
||||
|
||||
if (IS_PM44XX_ERRATUM(PM_OMAP4_ROM_SMP_BOOT_ERRATUM_GICD) &&
|
||||
mpuss_can_lose_context) {
|
||||
@ -222,7 +222,7 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
|
||||
cpu_pm_exit();
|
||||
|
||||
cpu_pm_out:
|
||||
RCU_NONIDLE(tick_broadcast_exit());
|
||||
tick_broadcast_exit();
|
||||
|
||||
fail:
|
||||
cpuidle_coupled_parallel_barrier(dev, &abort_barrier);
|
||||
@ -247,7 +247,8 @@ static struct cpuidle_driver omap4_idle_driver = {
|
||||
/* C2 - CPU0 OFF + CPU1 OFF + MPU CSWR */
|
||||
.exit_latency = 328 + 440,
|
||||
.target_residency = 960,
|
||||
.flags = CPUIDLE_FLAG_COUPLED,
|
||||
.flags = CPUIDLE_FLAG_COUPLED |
|
||||
CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap_enter_idle_coupled,
|
||||
.name = "C2",
|
||||
.desc = "CPUx OFF, MPUSS CSWR",
|
||||
@ -256,7 +257,8 @@ static struct cpuidle_driver omap4_idle_driver = {
|
||||
/* C3 - CPU0 OFF + CPU1 OFF + MPU OSWR */
|
||||
.exit_latency = 460 + 518,
|
||||
.target_residency = 1100,
|
||||
.flags = CPUIDLE_FLAG_COUPLED,
|
||||
.flags = CPUIDLE_FLAG_COUPLED |
|
||||
CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap_enter_idle_coupled,
|
||||
.name = "C3",
|
||||
.desc = "CPUx OFF, MPUSS OSWR",
|
||||
@ -282,7 +284,8 @@ static struct cpuidle_driver omap5_idle_driver = {
|
||||
/* C2 - CPU0 RET + CPU1 RET + MPU CSWR */
|
||||
.exit_latency = 48 + 60,
|
||||
.target_residency = 100,
|
||||
.flags = CPUIDLE_FLAG_TIMER_STOP,
|
||||
.flags = CPUIDLE_FLAG_TIMER_STOP |
|
||||
CPUIDLE_FLAG_RCU_IDLE,
|
||||
.enter = omap_enter_idle_smp,
|
||||
.name = "C2",
|
||||
.desc = "CPUx CSWR, MPUSS CSWR",
|
||||
|
@ -33,6 +33,7 @@
|
||||
* and first to wake-up when MPUSS low power states are excercised
|
||||
*/
|
||||
|
||||
#include <linux/cpuidle.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/errno.h>
|
||||
@ -214,6 +215,7 @@ static void __init save_l2x0_context(void)
|
||||
* of OMAP4 MPUSS subsystem
|
||||
* @cpu : CPU ID
|
||||
* @power_state: Low power state.
|
||||
* @rcuidle: RCU needs to be idled
|
||||
*
|
||||
* MPUSS states for the context save:
|
||||
* save_state =
|
||||
@ -222,7 +224,8 @@ static void __init save_l2x0_context(void)
|
||||
* 2 - CPUx L1 and logic lost + GIC lost: MPUSS OSWR
|
||||
* 3 - CPUx L1 and logic lost + GIC + L2 lost: DEVICE OFF
|
||||
*/
|
||||
int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state)
|
||||
__cpuidle int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state,
|
||||
bool rcuidle)
|
||||
{
|
||||
struct omap4_cpu_pm_info *pm_info = &per_cpu(omap4_pm_info, cpu);
|
||||
unsigned int save_state = 0, cpu_logic_state = PWRDM_POWER_RET;
|
||||
@ -268,6 +271,10 @@ int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state)
|
||||
cpu_clear_prev_logic_pwrst(cpu);
|
||||
pwrdm_set_next_pwrst(pm_info->pwrdm, power_state);
|
||||
pwrdm_set_logic_retst(pm_info->pwrdm, cpu_logic_state);
|
||||
|
||||
if (rcuidle)
|
||||
ct_cpuidle_enter();
|
||||
|
||||
set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.resume));
|
||||
omap_pm_ops.scu_prepare(cpu, power_state);
|
||||
l2x0_pwrst_prepare(cpu, save_state);
|
||||
@ -283,6 +290,9 @@ int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state)
|
||||
if (IS_PM44XX_ERRATUM(PM_OMAP4_ROM_SMP_BOOT_ERRATUM_GICD) && cpu)
|
||||
gic_dist_enable();
|
||||
|
||||
if (rcuidle)
|
||||
ct_cpuidle_exit();
|
||||
|
||||
/*
|
||||
* Restore the CPUx power state to ON otherwise CPUx
|
||||
* power domain can transitions to programmed low power
|
||||
|
@ -29,7 +29,7 @@ static inline int omap4_idle_init(void)
|
||||
|
||||
extern void *omap3_secure_ram_storage;
|
||||
extern void omap3_pm_off_mode_enable(int);
|
||||
extern void omap_sram_idle(void);
|
||||
extern void omap_sram_idle(bool rcuidle);
|
||||
extern int omap_pm_clkdms_setup(struct clockdomain *clkdm, void *unused);
|
||||
|
||||
extern int omap3_pm_get_suspend_state(struct powerdomain *pwrdm);
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include <linux/delay.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/of.h>
|
||||
#include <linux/cpuidle.h>
|
||||
|
||||
#include <trace/events/power.h>
|
||||
|
||||
@ -174,7 +175,7 @@ static int omap34xx_do_sram_idle(unsigned long save_state)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void omap_sram_idle(void)
|
||||
__cpuidle void omap_sram_idle(bool rcuidle)
|
||||
{
|
||||
/* Variable to tell what needs to be saved and restored
|
||||
* in omap_sram_idle*/
|
||||
@ -254,11 +255,18 @@ void omap_sram_idle(void)
|
||||
*/
|
||||
if (save_state)
|
||||
omap34xx_save_context(omap3_arm_context);
|
||||
|
||||
if (rcuidle)
|
||||
ct_cpuidle_enter();
|
||||
|
||||
if (save_state == 1 || save_state == 3)
|
||||
cpu_suspend(save_state, omap34xx_do_sram_idle);
|
||||
else
|
||||
omap34xx_do_sram_idle(save_state);
|
||||
|
||||
if (rcuidle)
|
||||
ct_cpuidle_exit();
|
||||
|
||||
/* Restore normal SDRC POWER settings */
|
||||
if (cpu_is_omap3430() && omap_rev() >= OMAP3430_REV_ES3_0 &&
|
||||
(omap_type() == OMAP2_DEVICE_TYPE_EMU ||
|
||||
@ -294,7 +302,7 @@ static void omap3_pm_idle(void)
|
||||
if (omap_irq_pending())
|
||||
return;
|
||||
|
||||
omap_sram_idle();
|
||||
omap3_do_wfi();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SUSPEND
|
||||
@ -316,7 +324,7 @@ static int omap3_pm_suspend(void)
|
||||
|
||||
omap3_intc_suspend();
|
||||
|
||||
omap_sram_idle();
|
||||
omap_sram_idle(false);
|
||||
|
||||
restore:
|
||||
/* Restore next_pwrsts */
|
||||
|
@ -76,7 +76,7 @@ static int omap4_pm_suspend(void)
|
||||
* domain CSWR is not supported by hardware.
|
||||
* More details can be found in OMAP4430 TRM section 4.3.4.2.
|
||||
*/
|
||||
omap4_enter_lowpower(cpu_id, cpu_suspend_state);
|
||||
omap4_enter_lowpower(cpu_id, cpu_suspend_state, false);
|
||||
|
||||
/* Restore next powerdomain state */
|
||||
list_for_each_entry(pwrst, &pwrst_list, node) {
|
||||
|
@ -187,9 +187,9 @@ static int _pwrdm_state_switch(struct powerdomain *pwrdm, int flag)
|
||||
trace_state = (PWRDM_TRACE_STATES_FLAG |
|
||||
((next & OMAP_POWERSTATE_MASK) << 8) |
|
||||
((prev & OMAP_POWERSTATE_MASK) << 0));
|
||||
trace_power_domain_target_rcuidle(pwrdm->name,
|
||||
trace_state,
|
||||
raw_smp_processor_id());
|
||||
trace_power_domain_target(pwrdm->name,
|
||||
trace_state,
|
||||
raw_smp_processor_id());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@ -541,8 +541,8 @@ int pwrdm_set_next_pwrst(struct powerdomain *pwrdm, u8 pwrst)
|
||||
|
||||
if (arch_pwrdm && arch_pwrdm->pwrdm_set_next_pwrst) {
|
||||
/* Trace the pwrdm desired target state */
|
||||
trace_power_domain_target_rcuidle(pwrdm->name, pwrst,
|
||||
raw_smp_processor_id());
|
||||
trace_power_domain_target(pwrdm->name, pwrst,
|
||||
raw_smp_processor_id());
|
||||
/* Program the pwrdm desired target state */
|
||||
ret = arch_pwrdm->pwrdm_set_next_pwrst(pwrdm, pwrst);
|
||||
}
|
||||
|
@ -19,9 +19,8 @@
|
||||
#include "regs-sys-s3c64xx.h"
|
||||
#include "regs-syscon-power-s3c64xx.h"
|
||||
|
||||
static int s3c64xx_enter_idle(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv,
|
||||
int index)
|
||||
static __cpuidle int s3c64xx_enter_idle(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
unsigned long tmp;
|
||||
|
||||
|
@ -62,15 +62,15 @@ int acpi_processor_ffh_lpi_probe(unsigned int cpu)
|
||||
return psci_acpi_cpu_init_idle(cpu);
|
||||
}
|
||||
|
||||
int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
|
||||
__cpuidle int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
|
||||
{
|
||||
u32 state = lpi->address;
|
||||
|
||||
if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags))
|
||||
return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(psci_cpu_suspend_enter,
|
||||
return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM_RCU(psci_cpu_suspend_enter,
|
||||
lpi->index, state);
|
||||
else
|
||||
return CPU_PM_CPU_IDLE_ENTER_PARAM(psci_cpu_suspend_enter,
|
||||
return CPU_PM_CPU_IDLE_ENTER_PARAM_RCU(psci_cpu_suspend_enter,
|
||||
lpi->index, state);
|
||||
}
|
||||
#endif
|
||||
|
@ -42,5 +42,4 @@ void noinstr arch_cpu_idle(void)
|
||||
* tricks
|
||||
*/
|
||||
cpu_do_idle();
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
@ -865,7 +865,7 @@ static void do_handle_IPI(int ipinr)
|
||||
unsigned int cpu = smp_processor_id();
|
||||
|
||||
if ((unsigned)ipinr < NR_IPI)
|
||||
trace_ipi_entry_rcuidle(ipi_types[ipinr]);
|
||||
trace_ipi_entry(ipi_types[ipinr]);
|
||||
|
||||
switch (ipinr) {
|
||||
case IPI_RESCHEDULE:
|
||||
@ -914,7 +914,7 @@ static void do_handle_IPI(int ipinr)
|
||||
}
|
||||
|
||||
if ((unsigned)ipinr < NR_IPI)
|
||||
trace_ipi_exit_rcuidle(ipi_types[ipinr]);
|
||||
trace_ipi_exit(ipi_types[ipinr]);
|
||||
}
|
||||
|
||||
static irqreturn_t ipi_handler(int irq, void *data)
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/pgtable.h>
|
||||
#include <linux/cpuidle.h>
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/cpufeature.h>
|
||||
@ -104,6 +105,10 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
|
||||
* From this point debug exceptions are disabled to prevent
|
||||
* updates to mdscr register (saved and restored along with
|
||||
* general purpose registers) from kernel debuggers.
|
||||
*
|
||||
* Strictly speaking the trace_hardirqs_off() here is superfluous,
|
||||
* hardirqs should be firmly off by now. This really ought to use
|
||||
* something like raw_local_daif_save().
|
||||
*/
|
||||
flags = local_daif_save();
|
||||
|
||||
@ -120,6 +125,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
|
||||
*/
|
||||
arm_cpuidle_save_irq_context(&context);
|
||||
|
||||
ct_cpuidle_enter();
|
||||
|
||||
if (__cpu_suspend_enter(&state)) {
|
||||
/* Call the suspend finisher */
|
||||
ret = fn(arg);
|
||||
@ -133,8 +140,11 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
|
||||
*/
|
||||
if (!ret)
|
||||
ret = -EOPNOTSUPP;
|
||||
|
||||
ct_cpuidle_exit();
|
||||
} else {
|
||||
RCU_NONIDLE(__cpu_suspend_exit());
|
||||
ct_cpuidle_exit();
|
||||
__cpu_suspend_exit();
|
||||
}
|
||||
|
||||
arm_cpuidle_restore_irq_context(&context);
|
||||
|
@ -175,7 +175,6 @@ SECTIONS
|
||||
ENTRY_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
HYPERVISOR_TEXT
|
||||
|
@ -100,6 +100,5 @@ void arch_cpu_idle(void)
|
||||
#ifdef CONFIG_CPU_PM_STOP
|
||||
asm volatile("stop\n");
|
||||
#endif
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
#endif
|
||||
|
@ -309,7 +309,7 @@ void arch_cpu_idle_dead(void)
|
||||
while (!secondary_stack)
|
||||
arch_cpu_idle();
|
||||
|
||||
local_irq_disable();
|
||||
raw_local_irq_disable();
|
||||
|
||||
asm volatile(
|
||||
"mov sp, %0\n"
|
||||
|
@ -34,7 +34,6 @@ SECTIONS
|
||||
SOFTIRQENTRY_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
*(.fixup)
|
||||
|
@ -44,7 +44,6 @@ void arch_cpu_idle(void)
|
||||
{
|
||||
__vmwait();
|
||||
/* interrupts wake us up, but irqs are still disabled */
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -41,7 +41,6 @@ SECTIONS
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
*(.fixup)
|
||||
|
@ -242,6 +242,7 @@ void arch_cpu_idle(void)
|
||||
(*mark_idle)(1);
|
||||
|
||||
raw_safe_halt();
|
||||
raw_local_irq_disable();
|
||||
|
||||
if (mark_idle)
|
||||
(*mark_idle)(0);
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
|
||||
#include <asm/cputime.h>
|
||||
#include <asm/delay.h>
|
||||
#include <asm/efi.h>
|
||||
#include <asm/hw_irq.h>
|
||||
|
@ -51,7 +51,6 @@ SECTIONS {
|
||||
__end_ivt_text = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -13,4 +13,5 @@ void __cpuidle arch_cpu_idle(void)
|
||||
{
|
||||
raw_local_irq_enable();
|
||||
__arch_cpu_idle(); /* idle instruction needs irq enabled */
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
@ -43,7 +43,6 @@ SECTIONS
|
||||
.text : {
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -48,7 +48,6 @@ SECTIONS {
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
*(.fixup)
|
||||
. = ALIGN(16);
|
||||
|
@ -19,7 +19,6 @@ SECTIONS
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
*(.fixup)
|
||||
*(.gnu.warning)
|
||||
|
@ -19,7 +19,6 @@ SECTIONS
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
*(.fixup)
|
||||
*(.gnu.warning)
|
||||
|
@ -140,5 +140,4 @@ int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
|
||||
|
||||
void arch_cpu_idle(void)
|
||||
{
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
@ -36,7 +36,6 @@ SECTIONS {
|
||||
EXIT_TEXT
|
||||
EXIT_CALL
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -33,13 +33,13 @@ static void __cpuidle r3081_wait(void)
|
||||
{
|
||||
unsigned long cfg = read_c0_conf();
|
||||
write_c0_conf(cfg | R30XX_CONF_HALT);
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
void __cpuidle r4k_wait(void)
|
||||
{
|
||||
raw_local_irq_enable();
|
||||
__r4k_wait();
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -57,7 +57,6 @@ void __cpuidle r4k_wait_irqoff(void)
|
||||
" .set arch=r4000 \n"
|
||||
" wait \n"
|
||||
" .set pop \n");
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -77,7 +76,6 @@ static void __cpuidle rm7k_wait_irqoff(void)
|
||||
" wait \n"
|
||||
" mtc0 $1, $12 # stalls until W stage \n"
|
||||
" .set pop \n");
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -103,6 +101,8 @@ static void __cpuidle au1k_wait(void)
|
||||
" nop \n"
|
||||
" .set pop \n"
|
||||
: : "r" (au1k_wait), "r" (c0status));
|
||||
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
static int __initdata nowait;
|
||||
@ -241,18 +241,16 @@ void __init check_wait(void)
|
||||
}
|
||||
}
|
||||
|
||||
void arch_cpu_idle(void)
|
||||
__cpuidle void arch_cpu_idle(void)
|
||||
{
|
||||
if (cpu_wait)
|
||||
cpu_wait();
|
||||
else
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CPU_IDLE
|
||||
|
||||
int mips_cpuidle_wait_enter(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
__cpuidle int mips_cpuidle_wait_enter(struct cpuidle_device *dev,
|
||||
struct cpuidle_driver *drv, int index)
|
||||
{
|
||||
arch_cpu_idle();
|
||||
return index;
|
||||
|
@ -61,7 +61,6 @@ SECTIONS
|
||||
.text : {
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -33,7 +33,6 @@ EXPORT_SYMBOL(pm_power_off);
|
||||
|
||||
void arch_cpu_idle(void)
|
||||
{
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -24,7 +24,6 @@ SECTIONS
|
||||
.text : {
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
|
@ -102,6 +102,7 @@ void arch_cpu_idle(void)
|
||||
raw_local_irq_enable();
|
||||
if (mfspr(SPR_UPR) & SPR_UPR_PMP)
|
||||
mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME);
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
void (*pm_power_off)(void) = NULL;
|
||||
|
@ -52,7 +52,6 @@ SECTIONS
|
||||
_stext = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -183,8 +183,6 @@ void arch_cpu_idle_dead(void)
|
||||
|
||||
void __cpuidle arch_cpu_idle(void)
|
||||
{
|
||||
raw_local_irq_enable();
|
||||
|
||||
/* nop on real hardware, qemu will idle sleep. */
|
||||
asm volatile("or %%r10,%%r10,%%r10\n":::);
|
||||
}
|
||||
|
@ -86,7 +86,6 @@ SECTIONS
|
||||
TEXT_TEXT
|
||||
LOCK_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
|
@ -51,10 +51,9 @@ void arch_cpu_idle(void)
|
||||
* Some power_save functions return with
|
||||
* interrupts enabled, some don't.
|
||||
*/
|
||||
if (irqs_disabled())
|
||||
raw_local_irq_enable();
|
||||
if (!irqs_disabled())
|
||||
raw_local_irq_disable();
|
||||
} else {
|
||||
raw_local_irq_enable();
|
||||
/*
|
||||
* Go into low thread priority and possibly
|
||||
* low power mode.
|
||||
|
@ -112,7 +112,6 @@ SECTIONS
|
||||
#endif
|
||||
NOINSTR_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -39,7 +39,6 @@ extern asmlinkage void ret_from_kernel_thread(void);
|
||||
void arch_cpu_idle(void)
|
||||
{
|
||||
cpu_do_idle();
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
void __show_regs(struct pt_regs *regs)
|
||||
|
@ -39,7 +39,6 @@ SECTIONS
|
||||
_stext = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
ENTRY_TEXT
|
||||
|
@ -42,7 +42,6 @@ SECTIONS
|
||||
_stext = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
ENTRY_TEXT
|
||||
|
@ -12,9 +12,9 @@
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
#include <trace/events/power.h>
|
||||
#include <asm/cpu_mf.h>
|
||||
#include <asm/cputime.h>
|
||||
#include <asm/nmi.h>
|
||||
#include <asm/smp.h>
|
||||
#include "entry.h"
|
||||
@ -66,7 +66,6 @@ void arch_cpu_idle(void)
|
||||
idle->idle_count++;
|
||||
account_idle_time(cputime_to_nsecs(idle_time));
|
||||
raw_write_seqcount_end(&idle->seqcount);
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
static ssize_t show_idle_count(struct device *dev,
|
||||
|
@ -44,7 +44,6 @@ SECTIONS
|
||||
HEAD_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -7,13 +7,13 @@
|
||||
*/
|
||||
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/timex.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/time.h>
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/cputime.h>
|
||||
#include <asm/vtimer.h>
|
||||
#include <asm/vtime.h>
|
||||
#include <asm/cpu_mf.h>
|
||||
|
@ -25,6 +25,7 @@ void default_idle(void)
|
||||
raw_local_irq_enable();
|
||||
/* Isn't this racy ? */
|
||||
cpu_sleep();
|
||||
raw_local_irq_disable();
|
||||
clear_bl_bit();
|
||||
}
|
||||
|
||||
|
@ -30,7 +30,6 @@ SECTIONS
|
||||
HEAD_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -57,6 +57,8 @@ static void pmc_leon_idle_fixup(void)
|
||||
"lda [%0] %1, %%g0\n"
|
||||
:
|
||||
: "r"(address), "i"(ASI_LEON_BYPASS));
|
||||
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -70,6 +72,8 @@ static void pmc_leon_idle(void)
|
||||
|
||||
/* For systems without power-down, this will be no-op */
|
||||
__asm__ __volatile__ ("wr %g0, %asr19\n\t");
|
||||
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
|
||||
/* Install LEON Power Down function */
|
||||
|
@ -71,7 +71,6 @@ void arch_cpu_idle(void)
|
||||
{
|
||||
if (sparc_idle)
|
||||
(*sparc_idle)();
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
/* XXX cli/sti -> local_irq_xxx here, check this works once SMP is fixed. */
|
||||
|
@ -59,7 +59,6 @@ void arch_cpu_idle(void)
|
||||
{
|
||||
if (tlb_type != hypervisor) {
|
||||
touch_nmi_watchdog();
|
||||
raw_local_irq_enable();
|
||||
} else {
|
||||
unsigned long pstate;
|
||||
|
||||
@ -90,6 +89,8 @@ void arch_cpu_idle(void)
|
||||
"wrpr %0, %%g0, %%pstate"
|
||||
: "=&r" (pstate)
|
||||
: "i" (PSTATE_IE));
|
||||
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -50,7 +50,6 @@ SECTIONS
|
||||
HEAD_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
IRQENTRY_TEXT
|
||||
|
@ -74,7 +74,6 @@ SECTIONS
|
||||
_stext = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
|
@ -218,7 +218,6 @@ void arch_cpu_idle(void)
|
||||
{
|
||||
cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
|
||||
um_idle_sleep();
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
int __cant_sleep(void) {
|
||||
|
@ -35,7 +35,6 @@ SECTIONS
|
||||
_stext = .;
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
IRQENTRY_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
|
@ -34,6 +34,7 @@ SECTIONS
|
||||
_text = .; /* Text */
|
||||
*(.text)
|
||||
*(.text.*)
|
||||
*(.noinstr.text)
|
||||
_etext = . ;
|
||||
}
|
||||
.rodata : {
|
||||
|
@ -31,6 +31,8 @@
|
||||
TDX_R12 | TDX_R13 | \
|
||||
TDX_R14 | TDX_R15 )
|
||||
|
||||
.section .noinstr.text, "ax"
|
||||
|
||||
/*
|
||||
* __tdx_module_call() - Used by TDX guests to request services from
|
||||
* the TDX module (does not include VMM services) using TDCALL instruction.
|
||||
@ -139,19 +141,6 @@ SYM_FUNC_START(__tdx_hypercall)
|
||||
|
||||
movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
|
||||
|
||||
/*
|
||||
* For the idle loop STI needs to be called directly before the TDCALL
|
||||
* that enters idle (EXIT_REASON_HLT case). STI instruction enables
|
||||
* interrupts only one instruction later. If there is a window between
|
||||
* STI and the instruction that emulates the HALT state, there is a
|
||||
* chance for interrupts to happen in this window, which can delay the
|
||||
* HLT operation indefinitely. Since this is the not the desired
|
||||
* result, conditionally call STI before TDCALL.
|
||||
*/
|
||||
testq $TDX_HCALL_ISSUE_STI, %rsi
|
||||
jz .Lskip_sti
|
||||
sti
|
||||
.Lskip_sti:
|
||||
tdcall
|
||||
|
||||
/*
|
||||
|
@ -64,8 +64,9 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
|
||||
}
|
||||
|
||||
/* Called from __tdx_hypercall() for unrecoverable failure */
|
||||
void __tdx_hypercall_failed(void)
|
||||
noinstr void __tdx_hypercall_failed(void)
|
||||
{
|
||||
instrumentation_begin();
|
||||
panic("TDVMCALL failed. TDX module bug?");
|
||||
}
|
||||
|
||||
@ -75,7 +76,7 @@ void __tdx_hypercall_failed(void)
|
||||
* Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
|
||||
* guest sides of these calls.
|
||||
*/
|
||||
static u64 hcall_func(u64 exit_reason)
|
||||
static __always_inline u64 hcall_func(u64 exit_reason)
|
||||
{
|
||||
return exit_reason;
|
||||
}
|
||||
@ -220,7 +221,7 @@ static int ve_instr_len(struct ve_info *ve)
|
||||
}
|
||||
}
|
||||
|
||||
static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
|
||||
static u64 __cpuidle __halt(const bool irq_disabled)
|
||||
{
|
||||
struct tdx_hypercall_args args = {
|
||||
.r10 = TDX_HYPERCALL_STANDARD,
|
||||
@ -240,20 +241,14 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
|
||||
* can keep the vCPU in virtual HLT, even if an IRQ is
|
||||
* pending, without hanging/breaking the guest.
|
||||
*/
|
||||
return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
|
||||
return __tdx_hypercall(&args, 0);
|
||||
}
|
||||
|
||||
static int handle_halt(struct ve_info *ve)
|
||||
{
|
||||
/*
|
||||
* Since non safe halt is mainly used in CPU offlining
|
||||
* and the guest will always stay in the halt state, don't
|
||||
* call the STI instruction (set do_sti as false).
|
||||
*/
|
||||
const bool irq_disabled = irqs_disabled();
|
||||
const bool do_sti = false;
|
||||
|
||||
if (__halt(irq_disabled, do_sti))
|
||||
if (__halt(irq_disabled))
|
||||
return -EIO;
|
||||
|
||||
return ve_instr_len(ve);
|
||||
@ -261,18 +256,12 @@ static int handle_halt(struct ve_info *ve)
|
||||
|
||||
void __cpuidle tdx_safe_halt(void)
|
||||
{
|
||||
/*
|
||||
* For do_sti=true case, __tdx_hypercall() function enables
|
||||
* interrupts using the STI instruction before the TDCALL. So
|
||||
* set irq_disabled as false.
|
||||
*/
|
||||
const bool irq_disabled = false;
|
||||
const bool do_sti = true;
|
||||
|
||||
/*
|
||||
* Use WARN_ONCE() to report the failure.
|
||||
*/
|
||||
if (__halt(irq_disabled, do_sti))
|
||||
if (__halt(irq_disabled))
|
||||
WARN_ONCE(1, "HLT instruction emulation failed\n");
|
||||
}
|
||||
|
||||
|
@ -41,18 +41,15 @@ static inline unsigned int brs_to(int idx)
|
||||
return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
|
||||
}
|
||||
|
||||
static inline void set_debug_extn_cfg(u64 val)
|
||||
static __always_inline void set_debug_extn_cfg(u64 val)
|
||||
{
|
||||
/* bits[4:3] must always be set to 11b */
|
||||
wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3);
|
||||
__wrmsr(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3, val >> 32);
|
||||
}
|
||||
|
||||
static inline u64 get_debug_extn_cfg(void)
|
||||
static __always_inline u64 get_debug_extn_cfg(void)
|
||||
{
|
||||
u64 val;
|
||||
|
||||
rdmsrl(MSR_AMD_DBG_EXTN_CFG, val);
|
||||
return val;
|
||||
return __rdmsr(MSR_AMD_DBG_EXTN_CFG);
|
||||
}
|
||||
|
||||
static bool __init amd_brs_detect(void)
|
||||
@ -405,7 +402,7 @@ void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_i
|
||||
* called from ACPI processor_idle.c or acpi_pad.c
|
||||
* with interrupts disabled
|
||||
*/
|
||||
void perf_amd_brs_lopwr_cb(bool lopwr_in)
|
||||
void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
union amd_debug_extn_cfg cfg;
|
||||
|
@ -71,7 +71,7 @@ ATOMIC64_DECL(add_unless);
|
||||
* the old value.
|
||||
*/
|
||||
|
||||
static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
|
||||
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
|
||||
{
|
||||
return arch_cmpxchg64(&v->counter, o, n);
|
||||
}
|
||||
@ -85,7 +85,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
|
||||
* Atomically xchgs the value of @v to @n and returns
|
||||
* the old value.
|
||||
*/
|
||||
static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
|
||||
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
|
||||
{
|
||||
s64 o;
|
||||
unsigned high = (unsigned)(n >> 32);
|
||||
@ -104,7 +104,7 @@ static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
|
||||
*
|
||||
* Atomically sets the value of @v to @n.
|
||||
*/
|
||||
static inline void arch_atomic64_set(atomic64_t *v, s64 i)
|
||||
static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
|
||||
{
|
||||
unsigned high = (unsigned)(i >> 32);
|
||||
unsigned low = (unsigned)i;
|
||||
@ -119,7 +119,7 @@ static inline void arch_atomic64_set(atomic64_t *v, s64 i)
|
||||
*
|
||||
* Atomically reads the value of @v and returns it.
|
||||
*/
|
||||
static inline s64 arch_atomic64_read(const atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
|
||||
{
|
||||
s64 r;
|
||||
alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
|
||||
@ -133,7 +133,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v)
|
||||
*
|
||||
* Atomically adds @i to @v and returns @i + *@v
|
||||
*/
|
||||
static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
|
||||
{
|
||||
alternative_atomic64(add_return,
|
||||
ASM_OUTPUT2("+A" (i), "+c" (v)),
|
||||
@ -145,7 +145,7 @@ static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
|
||||
/*
|
||||
* Other variants with different arithmetic operators:
|
||||
*/
|
||||
static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
|
||||
{
|
||||
alternative_atomic64(sub_return,
|
||||
ASM_OUTPUT2("+A" (i), "+c" (v)),
|
||||
@ -154,7 +154,7 @@ static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_sub_return arch_atomic64_sub_return
|
||||
|
||||
static inline s64 arch_atomic64_inc_return(atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
|
||||
{
|
||||
s64 a;
|
||||
alternative_atomic64(inc_return, "=&A" (a),
|
||||
@ -163,7 +163,7 @@ static inline s64 arch_atomic64_inc_return(atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_inc_return arch_atomic64_inc_return
|
||||
|
||||
static inline s64 arch_atomic64_dec_return(atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
|
||||
{
|
||||
s64 a;
|
||||
alternative_atomic64(dec_return, "=&A" (a),
|
||||
@ -179,7 +179,7 @@ static inline s64 arch_atomic64_dec_return(atomic64_t *v)
|
||||
*
|
||||
* Atomically adds @i to @v.
|
||||
*/
|
||||
static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
|
||||
{
|
||||
__alternative_atomic64(add, add_return,
|
||||
ASM_OUTPUT2("+A" (i), "+c" (v)),
|
||||
@ -194,7 +194,7 @@ static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
|
||||
*
|
||||
* Atomically subtracts @i from @v.
|
||||
*/
|
||||
static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
|
||||
{
|
||||
__alternative_atomic64(sub, sub_return,
|
||||
ASM_OUTPUT2("+A" (i), "+c" (v)),
|
||||
@ -208,7 +208,7 @@ static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
|
||||
*
|
||||
* Atomically increments @v by 1.
|
||||
*/
|
||||
static inline void arch_atomic64_inc(atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_inc(atomic64_t *v)
|
||||
{
|
||||
__alternative_atomic64(inc, inc_return, /* no output */,
|
||||
"S" (v) : "memory", "eax", "ecx", "edx");
|
||||
@ -221,7 +221,7 @@ static inline void arch_atomic64_inc(atomic64_t *v)
|
||||
*
|
||||
* Atomically decrements @v by 1.
|
||||
*/
|
||||
static inline void arch_atomic64_dec(atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_dec(atomic64_t *v)
|
||||
{
|
||||
__alternative_atomic64(dec, dec_return, /* no output */,
|
||||
"S" (v) : "memory", "eax", "ecx", "edx");
|
||||
@ -237,7 +237,7 @@ static inline void arch_atomic64_dec(atomic64_t *v)
|
||||
* Atomically adds @a to @v, so long as it was not @u.
|
||||
* Returns non-zero if the add was done, zero otherwise.
|
||||
*/
|
||||
static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
|
||||
static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
|
||||
{
|
||||
unsigned low = (unsigned)u;
|
||||
unsigned high = (unsigned)(u >> 32);
|
||||
@ -248,7 +248,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
|
||||
}
|
||||
#define arch_atomic64_add_unless arch_atomic64_add_unless
|
||||
|
||||
static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
|
||||
static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
|
||||
{
|
||||
int r;
|
||||
alternative_atomic64(inc_not_zero, "=&a" (r),
|
||||
@ -257,7 +257,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
|
||||
|
||||
static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
|
||||
{
|
||||
s64 r;
|
||||
alternative_atomic64(dec_if_positive, "=&A" (r),
|
||||
@ -269,7 +269,7 @@ static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
|
||||
#undef alternative_atomic64
|
||||
#undef __alternative_atomic64
|
||||
|
||||
static inline void arch_atomic64_and(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
@ -277,7 +277,7 @@ static inline void arch_atomic64_and(s64 i, atomic64_t *v)
|
||||
c = old;
|
||||
}
|
||||
|
||||
static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
@ -288,7 +288,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
|
||||
|
||||
static inline void arch_atomic64_or(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
@ -296,7 +296,7 @@ static inline void arch_atomic64_or(s64 i, atomic64_t *v)
|
||||
c = old;
|
||||
}
|
||||
|
||||
static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
@ -307,7 +307,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
|
||||
|
||||
static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
@ -315,7 +315,7 @@ static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
|
||||
c = old;
|
||||
}
|
||||
|
||||
static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
@ -326,7 +326,7 @@ static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
|
||||
|
||||
static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 old, c = 0;
|
||||
|
||||
|
@ -17,7 +17,7 @@
|
||||
* Atomically reads the value of @v.
|
||||
* Doesn't imply a read memory barrier.
|
||||
*/
|
||||
static inline s64 arch_atomic64_read(const atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
|
||||
{
|
||||
return __READ_ONCE((v)->counter);
|
||||
}
|
||||
@ -29,7 +29,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v)
|
||||
*
|
||||
* Atomically sets the value of @v to @i.
|
||||
*/
|
||||
static inline void arch_atomic64_set(atomic64_t *v, s64 i)
|
||||
static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
|
||||
{
|
||||
__WRITE_ONCE(v->counter, i);
|
||||
}
|
||||
@ -55,7 +55,7 @@ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
|
||||
*
|
||||
* Atomically subtracts @i from @v.
|
||||
*/
|
||||
static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
|
||||
{
|
||||
asm volatile(LOCK_PREFIX "subq %1,%0"
|
||||
: "=m" (v->counter)
|
||||
@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
|
||||
* true if the result is zero, or false for all
|
||||
* other cases.
|
||||
*/
|
||||
static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
|
||||
static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
|
||||
{
|
||||
return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
|
||||
}
|
||||
@ -113,7 +113,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
|
||||
* returns true if the result is 0, or false for all other
|
||||
* cases.
|
||||
*/
|
||||
static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
|
||||
static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
|
||||
{
|
||||
return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
|
||||
}
|
||||
@ -127,7 +127,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
|
||||
* and returns true if the result is zero, or false for all
|
||||
* other cases.
|
||||
*/
|
||||
static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
|
||||
static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
|
||||
{
|
||||
return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
|
||||
}
|
||||
@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
|
||||
* if the result is negative, or false when
|
||||
* result is greater than or equal to zero.
|
||||
*/
|
||||
static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
|
||||
static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
|
||||
{
|
||||
return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
|
||||
}
|
||||
@ -161,25 +161,25 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_add_return arch_atomic64_add_return
|
||||
|
||||
static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
|
||||
{
|
||||
return arch_atomic64_add_return(-i, v);
|
||||
}
|
||||
#define arch_atomic64_sub_return arch_atomic64_sub_return
|
||||
|
||||
static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
|
||||
{
|
||||
return xadd(&v->counter, i);
|
||||
}
|
||||
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
|
||||
|
||||
static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
|
||||
{
|
||||
return xadd(&v->counter, -i);
|
||||
}
|
||||
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
|
||||
|
||||
static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
|
||||
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
|
||||
{
|
||||
return arch_cmpxchg(&v->counter, old, new);
|
||||
}
|
||||
@ -191,13 +191,13 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s
|
||||
}
|
||||
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
|
||||
|
||||
static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
|
||||
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
|
||||
{
|
||||
return arch_xchg(&v->counter, new);
|
||||
}
|
||||
#define arch_atomic64_xchg arch_atomic64_xchg
|
||||
|
||||
static inline void arch_atomic64_and(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
|
||||
{
|
||||
asm volatile(LOCK_PREFIX "andq %1,%0"
|
||||
: "+m" (v->counter)
|
||||
@ -205,7 +205,7 @@ static inline void arch_atomic64_and(s64 i, atomic64_t *v)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 val = arch_atomic64_read(v);
|
||||
|
||||
@ -215,7 +215,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
|
||||
|
||||
static inline void arch_atomic64_or(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
|
||||
{
|
||||
asm volatile(LOCK_PREFIX "orq %1,%0"
|
||||
: "+m" (v->counter)
|
||||
@ -223,7 +223,7 @@ static inline void arch_atomic64_or(s64 i, atomic64_t *v)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 val = arch_atomic64_read(v);
|
||||
|
||||
@ -233,7 +233,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
|
||||
}
|
||||
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
|
||||
|
||||
static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
|
||||
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
|
||||
{
|
||||
asm volatile(LOCK_PREFIX "xorq %1,%0"
|
||||
: "+m" (v->counter)
|
||||
@ -241,7 +241,7 @@ static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
|
||||
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
|
||||
{
|
||||
s64 val = arch_atomic64_read(v);
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
|
||||
#define XCR_XFEATURE_IN_USE_MASK 0x00000001
|
||||
|
||||
static inline u64 xgetbv(u32 index)
|
||||
static __always_inline u64 xgetbv(u32 index)
|
||||
{
|
||||
u32 eax, edx;
|
||||
|
||||
@ -27,7 +27,7 @@ static inline void xsetbv(u32 index, u64 value)
|
||||
*
|
||||
* Callers should check X86_FEATURE_XGETBV1.
|
||||
*/
|
||||
static inline u64 xfeatures_in_use(void)
|
||||
static __always_inline u64 xfeatures_in_use(void)
|
||||
{
|
||||
return xgetbv(XCR_XFEATURE_IN_USE_MASK);
|
||||
}
|
||||
|
@ -8,9 +8,6 @@
|
||||
|
||||
#include <asm/nospec-branch.h>
|
||||
|
||||
/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
|
||||
#define __cpuidle __section(".cpuidle.text")
|
||||
|
||||
/*
|
||||
* Interrupt control:
|
||||
*/
|
||||
@ -45,13 +42,13 @@ static __always_inline void native_irq_enable(void)
|
||||
asm volatile("sti": : :"memory");
|
||||
}
|
||||
|
||||
static inline __cpuidle void native_safe_halt(void)
|
||||
static __always_inline void native_safe_halt(void)
|
||||
{
|
||||
mds_idle_clear_cpu_buffers();
|
||||
asm volatile("sti; hlt": : :"memory");
|
||||
}
|
||||
|
||||
static inline __cpuidle void native_halt(void)
|
||||
static __always_inline void native_halt(void)
|
||||
{
|
||||
mds_idle_clear_cpu_buffers();
|
||||
asm volatile("hlt": : :"memory");
|
||||
@ -84,7 +81,7 @@ static __always_inline void arch_local_irq_enable(void)
|
||||
* Used in the idle loop; sti takes one instruction cycle
|
||||
* to complete:
|
||||
*/
|
||||
static inline __cpuidle void arch_safe_halt(void)
|
||||
static __always_inline void arch_safe_halt(void)
|
||||
{
|
||||
native_safe_halt();
|
||||
}
|
||||
@ -93,7 +90,7 @@ static inline __cpuidle void arch_safe_halt(void)
|
||||
* Used when interrupts are already enabled or to
|
||||
* shutdown the processor:
|
||||
*/
|
||||
static inline __cpuidle void halt(void)
|
||||
static __always_inline void halt(void)
|
||||
{
|
||||
native_halt();
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ extern struct clocksource kvm_clock;
|
||||
|
||||
DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
|
||||
|
||||
static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
|
||||
static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
|
||||
{
|
||||
return &this_cpu_read(hv_clock_per_cpu)->pvti;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@
|
||||
#define TPAUSE_C01_STATE 1
|
||||
#define TPAUSE_C02_STATE 0
|
||||
|
||||
static inline void __monitor(const void *eax, unsigned long ecx,
|
||||
static __always_inline void __monitor(const void *eax, unsigned long ecx,
|
||||
unsigned long edx)
|
||||
{
|
||||
/* "monitor %eax, %ecx, %edx;" */
|
||||
@ -34,7 +34,7 @@ static inline void __monitor(const void *eax, unsigned long ecx,
|
||||
:: "a" (eax), "c" (ecx), "d"(edx));
|
||||
}
|
||||
|
||||
static inline void __monitorx(const void *eax, unsigned long ecx,
|
||||
static __always_inline void __monitorx(const void *eax, unsigned long ecx,
|
||||
unsigned long edx)
|
||||
{
|
||||
/* "monitorx %eax, %ecx, %edx;" */
|
||||
@ -42,7 +42,7 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
|
||||
:: "a" (eax), "c" (ecx), "d"(edx));
|
||||
}
|
||||
|
||||
static inline void __mwait(unsigned long eax, unsigned long ecx)
|
||||
static __always_inline void __mwait(unsigned long eax, unsigned long ecx)
|
||||
{
|
||||
mds_idle_clear_cpu_buffers();
|
||||
|
||||
@ -77,8 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
|
||||
* EAX (logical) address to monitor
|
||||
* ECX #GP if not zero
|
||||
*/
|
||||
static inline void __mwaitx(unsigned long eax, unsigned long ebx,
|
||||
unsigned long ecx)
|
||||
static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
|
||||
unsigned long ecx)
|
||||
{
|
||||
/* No MDS buffer clear as this is AMD/HYGON only */
|
||||
|
||||
@ -87,7 +87,7 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
|
||||
:: "a" (eax), "b" (ebx), "c" (ecx));
|
||||
}
|
||||
|
||||
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
|
||||
static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
|
||||
{
|
||||
mds_idle_clear_cpu_buffers();
|
||||
/* "mwait %eax, %ecx;" */
|
||||
@ -105,7 +105,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
|
||||
* New with Core Duo processors, MWAIT can take some hints based on CPU
|
||||
* capability.
|
||||
*/
|
||||
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
|
||||
static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
|
||||
{
|
||||
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
|
||||
if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
|
||||
|
@ -564,7 +564,7 @@ static __always_inline void mds_user_clear_cpu_buffers(void)
|
||||
*
|
||||
* Clear CPU buffers if the corresponding static key is enabled
|
||||
*/
|
||||
static inline void mds_idle_clear_cpu_buffers(void)
|
||||
static __always_inline void mds_idle_clear_cpu_buffers(void)
|
||||
{
|
||||
if (static_branch_likely(&mds_idle_clear))
|
||||
mds_clear_cpu_buffers();
|
||||
|
@ -26,7 +26,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
|
||||
|
||||
void paravirt_set_sched_clock(u64 (*func)(void));
|
||||
|
||||
static inline u64 paravirt_sched_clock(void)
|
||||
static __always_inline u64 paravirt_sched_clock(void)
|
||||
{
|
||||
return static_call(pv_sched_clock)();
|
||||
}
|
||||
@ -168,7 +168,7 @@ static inline void __write_cr4(unsigned long x)
|
||||
PVOP_VCALL1(cpu.write_cr4, x);
|
||||
}
|
||||
|
||||
static inline void arch_safe_halt(void)
|
||||
static __always_inline void arch_safe_halt(void)
|
||||
{
|
||||
PVOP_VCALL0(irq.safe_halt);
|
||||
}
|
||||
@ -178,7 +178,9 @@ static inline void halt(void)
|
||||
PVOP_VCALL0(irq.halt);
|
||||
}
|
||||
|
||||
static inline void wbinvd(void)
|
||||
extern noinstr void pv_native_wbinvd(void);
|
||||
|
||||
static __always_inline void wbinvd(void)
|
||||
{
|
||||
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
|
||||
}
|
||||
|
@ -586,7 +586,7 @@ extern void perf_amd_brs_lopwr_cb(bool lopwr_in);
|
||||
|
||||
DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
|
||||
|
||||
static inline void perf_lopwr_cb(bool lopwr_in)
|
||||
static __always_inline void perf_lopwr_cb(bool lopwr_in)
|
||||
{
|
||||
static_call_mod(perf_lopwr_cb)(lopwr_in);
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
/* some helper functions for xen and kvm pv clock sources */
|
||||
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
|
||||
u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src);
|
||||
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
|
||||
void pvclock_set_flags(u8 flags);
|
||||
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
|
||||
@ -39,7 +40,7 @@ bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
|
||||
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
|
||||
* yielding a 64-bit result.
|
||||
*/
|
||||
static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
|
||||
static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
|
||||
{
|
||||
u64 product;
|
||||
#ifdef __i386__
|
||||
|
@ -5,13 +5,13 @@
|
||||
#include <linux/types.h>
|
||||
|
||||
#define BUILDIO(bwl, bw, type) \
|
||||
static inline void __out##bwl(type value, u16 port) \
|
||||
static __always_inline void __out##bwl(type value, u16 port) \
|
||||
{ \
|
||||
asm volatile("out" #bwl " %" #bw "0, %w1" \
|
||||
: : "a"(value), "Nd"(port)); \
|
||||
} \
|
||||
\
|
||||
static inline type __in##bwl(u16 port) \
|
||||
static __always_inline type __in##bwl(u16 port) \
|
||||
{ \
|
||||
type value; \
|
||||
asm volatile("in" #bwl " %w1, %" #bw "0" \
|
||||
|
@ -8,7 +8,6 @@
|
||||
#define TDX_HYPERCALL_STANDARD 0
|
||||
|
||||
#define TDX_HCALL_HAS_OUTPUT BIT(0)
|
||||
#define TDX_HCALL_ISSUE_STI BIT(1)
|
||||
|
||||
#define TDX_CPUID_LEAF_ID 0x21
|
||||
#define TDX_IDENT "IntelTDX "
|
||||
|
@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void native_wbinvd(void)
|
||||
static __always_inline void native_wbinvd(void)
|
||||
{
|
||||
asm volatile("wbinvd": : :"memory");
|
||||
}
|
||||
@ -179,7 +179,7 @@ static inline void __write_cr4(unsigned long x)
|
||||
native_write_cr4(x);
|
||||
}
|
||||
|
||||
static inline void wbinvd(void)
|
||||
static __always_inline void wbinvd(void)
|
||||
{
|
||||
native_wbinvd();
|
||||
}
|
||||
@ -196,7 +196,7 @@ static inline void load_gs_index(unsigned int selector)
|
||||
|
||||
#endif /* CONFIG_PARAVIRT_XXL */
|
||||
|
||||
static inline void clflush(volatile void *__p)
|
||||
static __always_inline void clflush(volatile void *__p)
|
||||
{
|
||||
asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
|
||||
}
|
||||
@ -295,7 +295,7 @@ static inline int enqcmds(void __iomem *dst, const void *src)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void tile_release(void)
|
||||
static __always_inline void tile_release(void)
|
||||
{
|
||||
/*
|
||||
* Instruction opcode for TILERELEASE; supported in binutils
|
||||
|
@ -382,7 +382,7 @@ MULTI_stack_switch(struct multicall_entry *mcl,
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int
|
||||
static __always_inline int
|
||||
HYPERVISOR_sched_op(int cmd, void *arg)
|
||||
{
|
||||
return _hypercall2(int, sched_op, cmd, arg);
|
||||
|
@ -86,7 +86,7 @@ void update_spec_ctrl_cond(u64 val)
|
||||
wrmsrl(MSR_IA32_SPEC_CTRL, val);
|
||||
}
|
||||
|
||||
u64 spec_ctrl_current(void)
|
||||
noinstr u64 spec_ctrl_current(void)
|
||||
{
|
||||
return this_cpu_read(x86_spec_ctrl_current);
|
||||
}
|
||||
|
@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char *arg)
|
||||
}
|
||||
early_param("no-steal-acc", parse_no_stealacc);
|
||||
|
||||
static unsigned long long notrace vmware_sched_clock(void)
|
||||
static noinstr u64 vmware_sched_clock(void)
|
||||
{
|
||||
unsigned long long ns;
|
||||
|
||||
|
@ -853,12 +853,12 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
|
||||
* Initialize register state that may prevent from entering low-power idle.
|
||||
* This function will be invoked from the cpuidle driver only when needed.
|
||||
*/
|
||||
void fpu_idle_fpregs(void)
|
||||
noinstr void fpu_idle_fpregs(void)
|
||||
{
|
||||
/* Note: AMX_TILE being enabled implies XGETBV1 support */
|
||||
if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) &&
|
||||
(xfeatures_in_use() & XFEATURE_MASK_XTILE)) {
|
||||
tile_release();
|
||||
fpregs_deactivate(&current->thread.fpu);
|
||||
__this_cpu_write(fpu_fpregs_owner_ctx, NULL);
|
||||
}
|
||||
}
|
||||
|
@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struct timespec64 *now)
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
static u64 kvm_clock_read(void)
|
||||
static noinstr u64 kvm_clock_read(void)
|
||||
{
|
||||
u64 ret;
|
||||
|
||||
preempt_disable_notrace();
|
||||
ret = pvclock_clocksource_read(this_cpu_pvti());
|
||||
ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
|
||||
preempt_enable_notrace();
|
||||
return ret;
|
||||
}
|
||||
@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
|
||||
return kvm_clock_read();
|
||||
}
|
||||
|
||||
static u64 kvm_sched_clock_read(void)
|
||||
static noinstr u64 kvm_sched_clock_read(void)
|
||||
{
|
||||
return kvm_clock_read() - kvm_sched_clock_offset;
|
||||
}
|
||||
|
@ -216,6 +216,11 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
|
||||
native_set_debugreg(regno, val);
|
||||
}
|
||||
|
||||
noinstr void pv_native_wbinvd(void)
|
||||
{
|
||||
native_wbinvd();
|
||||
}
|
||||
|
||||
static noinstr void pv_native_irq_enable(void)
|
||||
{
|
||||
native_irq_enable();
|
||||
@ -225,6 +230,11 @@ static noinstr void pv_native_irq_disable(void)
|
||||
{
|
||||
native_irq_disable();
|
||||
}
|
||||
|
||||
static noinstr void pv_native_safe_halt(void)
|
||||
{
|
||||
native_safe_halt();
|
||||
}
|
||||
#endif
|
||||
|
||||
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
|
||||
@ -256,7 +266,7 @@ struct paravirt_patch_template pv_ops = {
|
||||
.cpu.read_cr0 = native_read_cr0,
|
||||
.cpu.write_cr0 = native_write_cr0,
|
||||
.cpu.write_cr4 = native_write_cr4,
|
||||
.cpu.wbinvd = native_wbinvd,
|
||||
.cpu.wbinvd = pv_native_wbinvd,
|
||||
.cpu.read_msr = native_read_msr,
|
||||
.cpu.write_msr = native_write_msr,
|
||||
.cpu.read_msr_safe = native_read_msr_safe,
|
||||
@ -290,7 +300,7 @@ struct paravirt_patch_template pv_ops = {
|
||||
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
|
||||
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
|
||||
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
|
||||
.irq.safe_halt = native_safe_halt,
|
||||
.irq.safe_halt = pv_native_safe_halt,
|
||||
.irq.halt = native_halt,
|
||||
#endif /* CONFIG_PARAVIRT_XXL */
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <linux/cpuidle.h>
|
||||
#include <linux/acpi.h>
|
||||
#include <linux/elf-randomize.h>
|
||||
#include <linux/static_call.h>
|
||||
#include <trace/events/power.h>
|
||||
#include <linux/hw_breakpoint.h>
|
||||
#include <asm/cpu.h>
|
||||
@ -694,7 +695,24 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
|
||||
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
|
||||
EXPORT_SYMBOL(boot_option_idle_override);
|
||||
|
||||
static void (*x86_idle)(void);
|
||||
/*
|
||||
* We use this if we don't have any better idle routine..
|
||||
*/
|
||||
void __cpuidle default_idle(void)
|
||||
{
|
||||
raw_safe_halt();
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
|
||||
EXPORT_SYMBOL(default_idle);
|
||||
#endif
|
||||
|
||||
DEFINE_STATIC_CALL_NULL(x86_idle, default_idle);
|
||||
|
||||
static bool x86_idle_set(void)
|
||||
{
|
||||
return !!static_call_query(x86_idle);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_SMP
|
||||
static inline void play_dead(void)
|
||||
@ -717,28 +735,17 @@ void arch_cpu_idle_dead(void)
|
||||
/*
|
||||
* Called from the generic idle code.
|
||||
*/
|
||||
void arch_cpu_idle(void)
|
||||
void __cpuidle arch_cpu_idle(void)
|
||||
{
|
||||
x86_idle();
|
||||
static_call(x86_idle)();
|
||||
}
|
||||
|
||||
/*
|
||||
* We use this if we don't have any better idle routine..
|
||||
*/
|
||||
void __cpuidle default_idle(void)
|
||||
{
|
||||
raw_safe_halt();
|
||||
}
|
||||
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
|
||||
EXPORT_SYMBOL(default_idle);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_XEN
|
||||
bool xen_set_default_idle(void)
|
||||
{
|
||||
bool ret = !!x86_idle;
|
||||
bool ret = x86_idle_set();
|
||||
|
||||
x86_idle = default_idle;
|
||||
static_call_update(x86_idle, default_idle);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -800,13 +807,7 @@ static void amd_e400_idle(void)
|
||||
|
||||
default_idle();
|
||||
|
||||
/*
|
||||
* The switch back from broadcast mode needs to be called with
|
||||
* interrupts disabled.
|
||||
*/
|
||||
raw_local_irq_disable();
|
||||
tick_broadcast_exit();
|
||||
raw_local_irq_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -864,12 +865,10 @@ static __cpuidle void mwait_idle(void)
|
||||
}
|
||||
|
||||
__monitor((void *)&current_thread_info()->flags, 0, 0);
|
||||
if (!need_resched())
|
||||
if (!need_resched()) {
|
||||
__sti_mwait(0, 0);
|
||||
else
|
||||
raw_local_irq_enable();
|
||||
} else {
|
||||
raw_local_irq_enable();
|
||||
raw_local_irq_disable();
|
||||
}
|
||||
}
|
||||
__current_clr_polling();
|
||||
}
|
||||
@ -880,20 +879,20 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
|
||||
if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
|
||||
pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
|
||||
#endif
|
||||
if (x86_idle || boot_option_idle_override == IDLE_POLL)
|
||||
if (x86_idle_set() || boot_option_idle_override == IDLE_POLL)
|
||||
return;
|
||||
|
||||
if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
|
||||
pr_info("using AMD E400 aware idle routine\n");
|
||||
x86_idle = amd_e400_idle;
|
||||
static_call_update(x86_idle, amd_e400_idle);
|
||||
} else if (prefer_mwait_c1_over_halt(c)) {
|
||||
pr_info("using mwait in idle threads\n");
|
||||
x86_idle = mwait_idle;
|
||||
static_call_update(x86_idle, mwait_idle);
|
||||
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
|
||||
pr_info("using TDX aware idle routine\n");
|
||||
x86_idle = tdx_safe_halt;
|
||||
static_call_update(x86_idle, tdx_safe_halt);
|
||||
} else
|
||||
x86_idle = default_idle;
|
||||
static_call_update(x86_idle, default_idle);
|
||||
}
|
||||
|
||||
void amd_e400_c1e_apic_setup(void)
|
||||
@ -946,7 +945,7 @@ static int __init idle_setup(char *str)
|
||||
* To continue to load the CPU idle driver, don't touch
|
||||
* the boot_option_idle_override.
|
||||
*/
|
||||
x86_idle = default_idle;
|
||||
static_call_update(x86_idle, default_idle);
|
||||
boot_option_idle_override = IDLE_HALT;
|
||||
} else if (!strcmp(str, "nomwait")) {
|
||||
/*
|
||||
|
@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
|
||||
return flags & valid_flags;
|
||||
}
|
||||
|
||||
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
|
||||
static __always_inline
|
||||
u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
|
||||
{
|
||||
unsigned version;
|
||||
u64 ret;
|
||||
@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
|
||||
flags = src->flags;
|
||||
} while (pvclock_read_retry(src, version));
|
||||
|
||||
if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
|
||||
if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
|
||||
src->flags &= ~PVCLOCK_GUEST_STOPPED;
|
||||
pvclock_touch_watchdogs();
|
||||
}
|
||||
@ -100,16 +101,25 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
|
||||
* updating at the same time, and one of them could be slightly behind,
|
||||
* making the assumption that last_value always go forward fail to hold.
|
||||
*/
|
||||
last = atomic64_read(&last_value);
|
||||
last = arch_atomic64_read(&last_value);
|
||||
do {
|
||||
if (ret < last)
|
||||
if (ret <= last)
|
||||
return last;
|
||||
last = atomic64_cmpxchg(&last_value, last, ret);
|
||||
} while (unlikely(last != ret));
|
||||
} while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
|
||||
{
|
||||
return __pvclock_clocksource_read(src, true);
|
||||
}
|
||||
|
||||
noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src)
|
||||
{
|
||||
return __pvclock_clocksource_read(src, false);
|
||||
}
|
||||
|
||||
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
|
||||
struct pvclock_vcpu_time_info *vcpu_time,
|
||||
struct timespec64 *ts)
|
||||
|
@ -215,7 +215,7 @@ static void __init cyc2ns_init_secondary_cpus(void)
|
||||
/*
|
||||
* Scheduler clock - returns current time in nanosec units.
|
||||
*/
|
||||
u64 native_sched_clock(void)
|
||||
noinstr u64 native_sched_clock(void)
|
||||
{
|
||||
if (static_branch_likely(&__use_tsc)) {
|
||||
u64 tsc_now = rdtsc();
|
||||
@ -248,7 +248,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
|
||||
/* We need to define a real function for sched_clock, to override the
|
||||
weak default version */
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
unsigned long long sched_clock(void)
|
||||
noinstr u64 sched_clock(void)
|
||||
{
|
||||
return paravirt_sched_clock();
|
||||
}
|
||||
@ -258,8 +258,7 @@ bool using_native_sched_clock(void)
|
||||
return static_call_query(pv_sched_clock) == native_sched_clock;
|
||||
}
|
||||
#else
|
||||
unsigned long long
|
||||
sched_clock(void) __attribute__((alias("native_sched_clock")));
|
||||
u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
|
||||
|
||||
bool using_native_sched_clock(void) { return true; }
|
||||
#endif
|
||||
|
@ -129,7 +129,6 @@ SECTIONS
|
||||
HEAD_TEXT
|
||||
TEXT_TEXT
|
||||
SCHED_TEXT
|
||||
CPUIDLE_TEXT
|
||||
LOCK_TEXT
|
||||
KPROBES_TEXT
|
||||
SOFTIRQENTRY_TEXT
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/export.h>
|
||||
|
||||
.pushsection .noinstr.text, "ax"
|
||||
.section .noinstr.text, "ax"
|
||||
|
||||
/*
|
||||
* We build a jump to memcpy_orig by default which gets NOPped out on
|
||||
@ -43,7 +43,7 @@ SYM_TYPED_FUNC_START(__memcpy)
|
||||
SYM_FUNC_END(__memcpy)
|
||||
EXPORT_SYMBOL(__memcpy)
|
||||
|
||||
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
|
||||
SYM_FUNC_ALIAS(memcpy, __memcpy)
|
||||
EXPORT_SYMBOL(memcpy)
|
||||
|
||||
/*
|
||||
@ -184,4 +184,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
|
||||
RET
|
||||
SYM_FUNC_END(memcpy_orig)
|
||||
|
||||
.popsection
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user