|
|
|
@ -38,7 +38,7 @@
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor",
|
|
|
|
|
"MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1",
|
|
|
|
|
"MetricGroup": "SMT",
|
|
|
|
|
"MetricGroup": "SMT;TmaL1",
|
|
|
|
|
"MetricName": "Slots_Utilization"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@ -61,24 +61,18 @@
|
|
|
|
|
"MetricName": "FLOPc"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width)",
|
|
|
|
|
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
|
|
|
|
|
"MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )",
|
|
|
|
|
"MetricGroup": "Cor;Flops;HPC",
|
|
|
|
|
"MetricName": "FP_Arith_Utilization",
|
|
|
|
|
"PublicDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). Values > 1 are possible due to Fused-Multiply Add (FMA) counting."
|
|
|
|
|
"PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
|
|
|
|
|
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
|
|
|
|
|
"MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
|
|
|
|
|
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
|
|
|
|
|
"MetricName": "ILP"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
|
|
|
|
|
"MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
|
|
|
|
|
"MetricGroup": "Bad;BadSpec;BrMispredicts",
|
|
|
|
|
"MetricName": "IpMispredict"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
|
|
|
|
|
"MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
|
|
|
|
@ -169,12 +163,24 @@
|
|
|
|
|
"MetricName": "IpArith_AVX512",
|
|
|
|
|
"PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)",
|
|
|
|
|
"MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@",
|
|
|
|
|
"MetricGroup": "Prefetches",
|
|
|
|
|
"MetricName": "IpSWPF"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST",
|
|
|
|
|
"MetricExpr": "INST_RETIRED.ANY",
|
|
|
|
|
"MetricGroup": "Summary;TmaL1",
|
|
|
|
|
"MetricName": "Instructions"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average number of uops executed per cycle in which this Logical Processor executed at least one uop",
|
|
|
|
|
"MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
|
|
|
|
|
"MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
|
|
|
|
|
"MetricName": "Execute"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average number of Uops issued by front-end when it issued something",
|
|
|
|
|
"MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@",
|
|
|
|
@ -194,11 +200,23 @@
|
|
|
|
|
"MetricName": "DSB_Coverage"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Number of Instructions per non-speculative DSB miss",
|
|
|
|
|
"BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.",
|
|
|
|
|
"MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@",
|
|
|
|
|
"MetricGroup": "DSBmiss",
|
|
|
|
|
"MetricName": "DSB_Switch_Cost"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)",
|
|
|
|
|
"MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS",
|
|
|
|
|
"MetricGroup": "DSBmiss;Fed",
|
|
|
|
|
"MetricName": "IpDSB_Miss_Ret"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
|
|
|
|
|
"MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
|
|
|
|
|
"MetricGroup": "Bad;BadSpec;BrMispredicts",
|
|
|
|
|
"MetricName": "IpMispredict"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Fraction of branches that are non-taken conditionals",
|
|
|
|
|
"MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES",
|
|
|
|
@ -230,11 +248,10 @@
|
|
|
|
|
"MetricName": "Other_Branches"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles)",
|
|
|
|
|
"BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
|
|
|
|
|
"MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBound;MemoryLat",
|
|
|
|
|
"MetricName": "Load_Miss_Real_Latency",
|
|
|
|
|
"PublicDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles). Latency may be overestimated for multi-load instructions - e.g. repeat strings."
|
|
|
|
|
"MetricName": "Load_Miss_Real_Latency"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand loads when there is at least one such miss. Per-Logical Processor)",
|
|
|
|
@ -242,30 +259,6 @@
|
|
|
|
|
"MetricGroup": "Mem;MemoryBound;MemoryBW",
|
|
|
|
|
"MetricName": "MLP"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L1D_Cache_Fill_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L2_Cache_Fill_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L3_Cache_Fill_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW;Offcore",
|
|
|
|
|
"MetricName": "L3_Cache_Access_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
|
|
|
|
|
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
|
|
|
|
@ -285,13 +278,13 @@
|
|
|
|
|
"MetricName": "L2MPKI"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
|
|
|
|
|
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
|
|
|
|
|
"MetricExpr": "1000 * ( ( OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD ) + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS ) / INST_RETIRED.ANY",
|
|
|
|
|
"MetricGroup": "Mem;CacheMisses;Offcore",
|
|
|
|
|
"MetricName": "L2MPKI_All"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "L2 cache misses per kilo instruction for all demand loads (including speculative)",
|
|
|
|
|
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
|
|
|
|
|
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
|
|
|
|
|
"MetricGroup": "Mem;CacheMisses",
|
|
|
|
|
"MetricName": "L2MPKI_Load"
|
|
|
|
@ -309,7 +302,7 @@
|
|
|
|
|
"MetricName": "L3MPKI"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Fill Buffer (FB) true hits per kilo instructions for retired demand loads",
|
|
|
|
|
"BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
|
|
|
|
|
"MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
|
|
|
|
|
"MetricGroup": "Mem;CacheMisses",
|
|
|
|
|
"MetricName": "FB_HPKI"
|
|
|
|
@ -321,6 +314,54 @@
|
|
|
|
|
"MetricGroup": "Mem;MemoryTLB",
|
|
|
|
|
"MetricName": "Page_Walks_Utilization"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L1D_Cache_Fill_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L2_Cache_Fill_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L3_Cache_Fill_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW;Offcore",
|
|
|
|
|
"MetricName": "L3_Cache_Access_BW"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L1D_Cache_Fill_BW_1T"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L2_Cache_Fill_BW_1T"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW",
|
|
|
|
|
"MetricName": "L3_Cache_Fill_BW_1T"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
|
|
|
|
|
"MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)",
|
|
|
|
|
"MetricGroup": "Mem;MemoryBW;Offcore",
|
|
|
|
|
"MetricName": "L3_Cache_Access_BW_1T"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average CPU Utilization",
|
|
|
|
|
"MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
|
|
|
|
@ -337,7 +378,8 @@
|
|
|
|
|
"BriefDescription": "Giga Floating Point Operations Per Second",
|
|
|
|
|
"MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * ( FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE ) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE ) / 1000000000 ) / duration_time",
|
|
|
|
|
"MetricGroup": "Cor;Flops;HPC",
|
|
|
|
|
"MetricName": "GFLOPs"
|
|
|
|
|
"MetricName": "GFLOPs",
|
|
|
|
|
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"BriefDescription": "Average Frequency Utilization relative to nominal frequency",
|
|
|
|
|