f3232321db
On Power9, a pair of SMT4 cores can be presented by the firmware as a SMT8 core for backward compatibility reasons, with the fusion of two SMT4 cores. Powerpc allows LPARs to be live migrated from Power8 to Power9. Existing software developed/configured for Power8, expects to see a SMT8 core. In order to maintain userspace backward compatibility (with Power8 chips in case of Power9) in enterprise Linux systems, the topology_sibling_cpumask has to be set to SMT8 core. cpu_smt_mask() should generally point to the cpu mask of the SMT4 core. Hence override the default cpu_smt_mask() to be powerpc specific allowing for better scheduling behaviour on Power. schbench (latency measured in usecs, so lesser is better) Without patch With patch Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 34 50.0000th: 38 75.0000th: 47 75.0000th: 52 90.0000th: 54 90.0000th: 60 95.0000th: 57 95.0000th: 64 *99.0000th: 62 *99.0000th: 72 99.5000th: 65 99.5000th: 75 99.9000th: 76 99.9000th: 3452 min=0, max=9205 min=0, max=9344 schbench (With Cede disabled) Without patch With patch Latency percentiles (usec) Latency percentiles (usec) 50.0000th: 20 50.0000th: 21 75.0000th: 28 75.0000th: 29 90.0000th: 33 90.0000th: 34 95.0000th: 35 95.0000th: 37 *99.0000th: 40 *99.0000th: 40 99.5000th: 48 99.5000th: 42 99.9000th: 94 99.9000th: 79 min=0, max=791 min=0, max=791 perf bench sched pipe usec/ops : lesser is better Without patch N Min Max Median Avg Stddev 101 5.095113 5.595269 5.204842 5.2298776 0.10762713 5.10 - 5.15 : ################################################## 23% (24) 5.15 - 5.20 : ############################################# 21% (22) 5.20 - 5.25 : ################################################## 23% (24) 5.25 - 5.30 : ######################### 11% (12) 5.30 - 5.35 : ########## 4% (5) 5.35 - 5.40 : ######## 3% (4) 5.40 - 5.45 : ######## 3% (4) 5.45 - 5.50 : #### 1% (2) 5.50 - 5.55 : ## 0% (1) 5.55 - 5.60 : #### 1% (2) With patch N Min Max Median Avg Stddev 101 5.134675 8.524719 5.207658 5.2780985 0.34911969 5.1 - 5.5 : ################################################## 94% (95) 5.5 - 5.8 : ## 3% (4) 5.8 - 6.2 : 0% (1) 6.2 - 6.5 : 6.5 - 6.8 : 6.8 - 7.2 : 7.2 - 7.5 : 7.5 - 7.8 : 7.8 - 8.2 : 8.2 - 8.5 : perf bench sched pipe (cede disabled) usec/ops : lesser is better Without patch N Min Max Median Avg Stddev 101 7.884227 12.576538 7.956474 8.0170722 0.46159054 7.9 - 8.4 : ################################################## 99% (100) 8.4 - 8.8 : 8.8 - 9.3 : 9.3 - 9.8 : 9.8 - 10.2 : 10.2 - 10.7 : 10.7 - 11.2 : 11.2 - 11.6 : 11.6 - 12.1 : 12.1 - 12.6 : With patch N Min Max Median Avg Stddev 101 7.956021 8.217284 8.015615 8.0283866 0.049844967 7.96 - 7.98 : ###################### 12% (13) 7.98 - 8.01 : ################################################## 28% (29) 8.01 - 8.03 : #################################### 20% (21) 8.03 - 8.06 : ######################### 14% (15) 8.06 - 8.09 : ###################### 12% (13) 8.09 - 8.11 : ###### 3% (4) 8.11 - 8.14 : ### 1% (2) 8.14 - 8.17 : ### 1% (2) 8.17 - 8.19 : 8.19 - 8.22 : # 0% (1) Observations: With the patch, the initial run/iteration takes a slight longer time. This can be attributed to the fact that now we pick a CPU from a idle core which could be sleep mode. Once we remove the cede, state the numbers improve in favour of the patch. ebizzy: transactions per second (higher is better) without patch N Min Max Median Avg Stddev 100 1018433 1304470 1193208 1182315.7 60018.733 1018433 - 1047037 : ###### 3% (3) 1047037 - 1075640 : ######## 4% (4) 1075640 - 1104244 : ######## 4% (4) 1104244 - 1132848 : ############### 7% (7) 1132848 - 1161452 : #################################### 17% (17) 1161452 - 1190055 : ########################## 12% (12) 1190055 - 1218659 : ############################################# 21% (21) 1218659 - 1247263 : ################################################## 23% (23) 1247263 - 1275866 : ######## 4% (4) 1275866 - 1304470 : ######## 4% (4) with patch N Min Max Median Avg Stddev 100 967014 1292938 1208819 1185281.8 69815.851 967014 - 999606 : ## 1% (1) 999606 - 1032199 : ## 1% (1) 1032199 - 1064791 : ############ 6% (6) 1064791 - 1097384 : ########## 5% (5) 1097384 - 1129976 : ################## 9% (9) 1129976 - 1162568 : #################### 10% (10) 1162568 - 1195161 : ########################## 13% (13) 1195161 - 1227753 : ############################################ 22% (22) 1227753 - 1260346 : ################################################## 25% (25) 1260346 - 1292938 : ############## 7% (7) Observations: Not much changes, ebizzy is not much impacted. Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200807074517.27957-2-srikar@linux.vnet.ibm.com
119 lines
2.9 KiB
C
119 lines
2.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_POWERPC_CPUTHREADS_H
|
|
#define _ASM_POWERPC_CPUTHREADS_H
|
|
|
|
#ifndef __ASSEMBLY__
|
|
#include <linux/cpumask.h>
|
|
#include <asm/cpu_has_feature.h>
|
|
|
|
/*
|
|
* Mapping of threads to cores
|
|
*
|
|
* Note: This implementation is limited to a power of 2 number of
|
|
* threads per core and the same number for each core in the system
|
|
* (though it would work if some processors had less threads as long
|
|
* as the CPU numbers are still allocated, just not brought online).
|
|
*
|
|
* However, the API allows for a different implementation in the future
|
|
* if needed, as long as you only use the functions and not the variables
|
|
* directly.
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
extern int threads_per_core;
|
|
extern int threads_per_subcore;
|
|
extern int threads_shift;
|
|
extern cpumask_t threads_core_mask;
|
|
#else
|
|
#define threads_per_core 1
|
|
#define threads_per_subcore 1
|
|
#define threads_shift 0
|
|
#define has_big_cores 0
|
|
#define threads_core_mask (*get_cpu_mask(0))
|
|
#endif
|
|
|
|
/* cpu_thread_mask_to_cores - Return a cpumask of one per cores
|
|
* hit by the argument
|
|
*
|
|
* @threads: a cpumask of online threads
|
|
*
|
|
* This function returns a cpumask which will have one online cpu's
|
|
* bit set for each core that has at least one thread set in the argument.
|
|
*
|
|
* This can typically be used for things like IPI for tlb invalidations
|
|
* since those need to be done only once per core/TLB
|
|
*/
|
|
static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads)
|
|
{
|
|
cpumask_t tmp, res;
|
|
int i, cpu;
|
|
|
|
cpumask_clear(&res);
|
|
for (i = 0; i < NR_CPUS; i += threads_per_core) {
|
|
cpumask_shift_left(&tmp, &threads_core_mask, i);
|
|
if (cpumask_intersects(threads, &tmp)) {
|
|
cpu = cpumask_next_and(-1, &tmp, cpu_online_mask);
|
|
if (cpu < nr_cpu_ids)
|
|
cpumask_set_cpu(cpu, &res);
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
static inline int cpu_nr_cores(void)
|
|
{
|
|
return nr_cpu_ids >> threads_shift;
|
|
}
|
|
|
|
static inline cpumask_t cpu_online_cores_map(void)
|
|
{
|
|
return cpu_thread_mask_to_cores(cpu_online_mask);
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
int cpu_core_index_of_thread(int cpu);
|
|
int cpu_first_thread_of_core(int core);
|
|
#else
|
|
static inline int cpu_core_index_of_thread(int cpu) { return cpu; }
|
|
static inline int cpu_first_thread_of_core(int core) { return core; }
|
|
#endif
|
|
|
|
static inline int cpu_thread_in_core(int cpu)
|
|
{
|
|
return cpu & (threads_per_core - 1);
|
|
}
|
|
|
|
static inline int cpu_thread_in_subcore(int cpu)
|
|
{
|
|
return cpu & (threads_per_subcore - 1);
|
|
}
|
|
|
|
static inline int cpu_first_thread_sibling(int cpu)
|
|
{
|
|
return cpu & ~(threads_per_core - 1);
|
|
}
|
|
|
|
static inline int cpu_last_thread_sibling(int cpu)
|
|
{
|
|
return cpu | (threads_per_core - 1);
|
|
}
|
|
|
|
static inline u32 get_tensr(void)
|
|
{
|
|
#ifdef CONFIG_BOOKE
|
|
if (cpu_has_feature(CPU_FTR_SMT))
|
|
return mfspr(SPRN_TENSR);
|
|
#endif
|
|
return 1;
|
|
}
|
|
|
|
void book3e_start_thread(int thread, unsigned long addr);
|
|
void book3e_stop_thread(int thread);
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
|
|
#define INVALID_THREAD_HWID 0x0fff
|
|
|
|
#endif /* _ASM_POWERPC_CPUTHREADS_H */
|
|
|