Add a delay, configurable via debugfs (default 34ms), to disable scheduling of a context after the pin count goes to zero. Disabling scheduling is a costly operation as it requires synchronizing with the GuC. So the idea is that a delay allows the user to resubmit something before doing this operation. This delay is only done if the context isn't closed and less than a given threshold (default is 3/4) of the guc_ids are in use. Alan Previn: Matt Brost first introduced this patch back in Oct 2021. However, no real world workload with measured performance impact was available to prove the intended results. Today, this series is being republished in response to a real world workload that benefited greatly from it along with measured performance improvement. Workload description: 36 containers were created on a DG2 device where each container was performing a combination of 720p 3d game rendering and 30fps video encoding. The workload density was configured in a way that guaranteed each container to ALWAYS be able to render and encode no less than 30fps with a predefined maximum render + encode latency time. That means the totality of all 36 containers and their workloads were not saturating the engines to their max (in order to maintain just enough headroom to meet the min fps and max latencies of incoming container submissions). Problem statement: It was observed that the CPU core processing the i915 soft IRQ work was experiencing severe load. Using tracelogs and an instrumentation patch to count specific i915 IRQ events, it was confirmed that the majority of the CPU cycles were caused by the gen11_other_irq_handler() -> guc_irq_handler() code path. The vast majority of the cycles was determined to be processing a specific G2H IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent by GuC in response to i915 KMD sending H2G requests: INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET.
Those H2G requests are sent whenever a context goes idle so that we can unpin the context from GuC. The high CPU utilization % symptom was limiting density scaling. Root Cause Analysis: Because the incoming execution buffers were spread across 36 different containers (each with multiple contexts) but the system in totality was NOT saturated to the max, it was assumed that each context was constantly idling between submissions. This was causing a thrashing of unpinning contexts from GuC at one moment, followed quickly by repinning them due to incoming workload the very next moment. These event-pairs were being triggered across multiple contexts per container, across all containers at the rate of > 30 times per sec per context. Metrics: When running this workload without this patch, we measured an average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10 seconds or ~10 million times over ~25+ mins. With this patch, the count reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The improvement observed is ~99% for the average counts per 10 seconds. Design awareness: Selftest impact. As a temporary workaround (WA), disable this feature for the selftests. Selftests are very timing sensitive and any change in timing can cause failure. A follow-up patch will fix up the selftests to understand this delay. Design awareness: Race between guc_request_alloc and guc_context_close. If a context close is issued while there is a request submission in flight and a delayed schedule disable is pending, guc_context_close and guc_request_alloc will race to cancel the delayed disable. To close the race, make sure that guc_request_alloc waits for guc_context_close to finish running before checking any state. Design awareness: GT Reset event. If a gt reset is triggered, as preparation steps, add an additional step to ensure all contexts that have a pending delay-disable-schedule task be flushed of it. Move them directly into the closed state after cancelling the worker.
This is okay because the existing flow flushes all yet-to-arrive G2Hs, dropping them anyway. Signed-off-by: Matthew Brost <matthew.brost@intel.com> Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Reviewed-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
143 lines
4.5 KiB
C
143 lines
4.5 KiB
C
/*
|
|
* Copyright © 2016 Intel Corporation
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef __I915_SELFTEST_H__
|
|
#define __I915_SELFTEST_H__
|
|
|
|
#include <linux/types.h>
|
|
|
|
struct pci_dev;
|
|
struct drm_i915_private;
|
|
|
|
/*
 * Selftest run configuration.
 *
 * Only timeout_jiffies is consumed in this header: IGT_TIMEOUT() adds it to
 * the current jiffies value to compute a deadline. The remaining fields are
 * used elsewhere; the notes below are assumptions to be confirmed against
 * the code that fills them in.
 */
struct i915_selftest {
	unsigned long timeout_jiffies;	/* timeout span, in jiffies */
	unsigned int timeout_ms;	/* presumably the same timeout in ms - TODO confirm */
	unsigned int random_seed;	/* presumably seeds test randomisation - TODO confirm */
	char *filter;			/* presumably selects which tests run - TODO confirm */
	int mock;			/* presumably controls the mock selftests - TODO confirm */
	int live;			/* presumably controls the live selftests - TODO confirm */
	int perf;			/* presumably controls the perf selftests - TODO confirm */
};
|
|
|
|
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
|
|
#include <linux/fault-inject.h>
|
|
|
|
/* Shared selftest configuration; the object itself is defined outside this header. */
extern struct i915_selftest i915_selftest;

/*
 * Entry points for the three selftest categories. The mock variant takes no
 * device, while the live and perf variants are handed the PCI device.
 * When CONFIG_DRM_I915_SELFTEST is disabled these are replaced by inline
 * stubs that return 0. The meaning of non-zero return values is not visible
 * from this header.
 */
int i915_mock_selftests(void);
int i915_live_selftests(struct pci_dev *pdev);
int i915_perf_selftests(struct pci_dev *pdev);
|
|
|
|
/* We extract the function declarations from i915_mock_selftests.h,
 * i915_live_selftests.h and i915_perf_selftests.h. Add your unit test
 * declarations there!
 *
 * Mock unit tests are run very early upon module load, before the driver
 * is probed. All hardware interactions, as well as other subsystems, must
 * be "mocked".
 *
 * Live unit tests are run after the driver is loaded - all hardware
 * interactions are real.
 */

/* Each selftest() entry in the mock list becomes "int func(void);". */
#define selftest(name, func) int func(void);
#include "selftests/i915_mock_selftests.h"
#undef selftest

/* Live and perf entries become "int func(struct drm_i915_private *i915);". */
#define selftest(name, func) int func(struct drm_i915_private *i915);
#include "selftests/i915_live_selftests.h"
#include "selftests/i915_perf_selftests.h"
#undef selftest
|
|
|
|
/*
 * One entry of a subtest table, normally built with SUBTEST(), which fills
 * the members positionally in this order: function first, then its name.
 */
struct i915_subtest {
	int (*func)(void *data);	/* subtest body; receives an opaque data pointer */
	const char *name;		/* name string; SUBTEST() uses the stringified function */
};
|
|
|
|
/*
 * Setup/teardown hook pairs handed to __i915_subtests() by the wrapper
 * macros in this header:
 *   - the nop pair by i915_subtests(),
 *   - the i915 live pair by i915_live_subtests(),
 *   - the gt live pair by intel_gt_live_subtests().
 *
 * Each teardown hook also receives an int error code. Presumably @data is
 * the same pointer that was given to __i915_subtests() and @err is the
 * result so far - TODO confirm against the definitions.
 */
int __i915_nop_setup(void *data);
int __i915_nop_teardown(int err, void *data);

int __i915_live_setup(void *data);
int __i915_live_teardown(int err, void *data);

int __intel_gt_live_setup(void *data);
int __intel_gt_live_teardown(int err, void *data);
|
|
|
|
/*
 * Common subtest runner behind the *_subtests() wrapper macros.
 *
 * @caller:   name string; the wrappers pass __func__
 * @setup:    hook taking @data
 * @teardown: hook taking an error code and @data
 * @st:       array of subtests
 * @count:    number of entries in @st; the wrappers pass ARRAY_SIZE()
 * @data:     opaque pointer forwarded by the wrappers
 *
 * The exact call ordering and the meaning of the return value are defined
 * by the implementation, which is not part of this header.
 */
int __i915_subtests(const char *caller,
		    int (*setup)(void *data),
		    int (*teardown)(int err, void *data),
		    const struct i915_subtest *st,
		    unsigned int count,
		    void *data);
|
|
/*
 * Run the subtest table @T through __i915_subtests() with the no-op
 * setup/teardown hooks. @T must be a real array rather than a pointer,
 * because its length is taken with ARRAY_SIZE().
 */
#define i915_subtests(T, data) \
	__i915_subtests(__func__, \
			__i915_nop_setup, __i915_nop_teardown, \
			T, ARRAY_SIZE(T), data)
|
|
/*
 * Run the subtest table @T with the i915 live setup/teardown hooks.
 * @data must be a struct drm_i915_private pointer (enforced by typecheck()).
 *
 * Before running, the GuC delayed schedule-disable is switched off by
 * zeroing sched_disable_delay_ms, as a temporary workaround because the
 * selftests are timing sensitive.
 *
 * NOTE(review): only gt[0] is touched here; any other GT keeps its
 * configured delay - confirm this is intended.
 * NOTE: @data is expanded more than once, so it must be free of side effects.
 */
#define i915_live_subtests(T, data) ({ \
	typecheck(struct drm_i915_private *, data); \
	(data)->gt[0]->uc.guc.submission_state.sched_disable_delay_ms = 0; \
	__i915_subtests(__func__, \
			__i915_live_setup, __i915_live_teardown, \
			T, ARRAY_SIZE(T), data); \
})
|
|
/*
 * Run the subtest table @T with the gt live setup/teardown hooks.
 * @data must be a struct intel_gt pointer (enforced by typecheck()).
 *
 * Before running, the GuC delayed schedule-disable of that GT is switched
 * off by zeroing sched_disable_delay_ms, as a temporary workaround because
 * the selftests are timing sensitive.
 *
 * NOTE: @data is expanded more than once, so it must be free of side effects.
 */
#define intel_gt_live_subtests(T, data) ({ \
	typecheck(struct intel_gt *, data); \
	(data)->uc.guc.submission_state.sched_disable_delay_ms = 0; \
	__i915_subtests(__func__, \
			__intel_gt_live_setup, __intel_gt_live_teardown, \
			T, ARRAY_SIZE(T), data); \
})
|
|
|
|
/* Expand to a positional { function, "function" } initializer pair. */
#define SUBTEST(fn) { fn, #fn }
|
|
|
|
/*
 * Selftest-enabled variants of the helper macros:
 *  - I915_SELFTEST_DECLARE(x) keeps @x in the build,
 *  - I915_SELFTEST_ONLY(x) evaluates @x, hinted as unlikely,
 *  - I915_SELFTEST_EXPORT expands to nothing, so the annotated symbol keeps
 *    external linkage.
 */
#define I915_SELFTEST_DECLARE(x) x
#define I915_SELFTEST_ONLY(x) unlikely(x)
#define I915_SELFTEST_EXPORT
|
|
|
|
#else /* !IS_ENABLED(CONFIG_DRM_I915_SELFTEST) */
|
|
|
|
/*
 * Stubs used when CONFIG_DRM_I915_SELFTEST is disabled: no selftests are
 * run and every entry point returns 0.
 */
static inline int i915_mock_selftests(void) { return 0; }
static inline int i915_live_selftests(struct pci_dev *pdev) { return 0; }
static inline int i915_perf_selftests(struct pci_dev *pdev) { return 0; }
|
|
|
|
/*
 * Selftest-disabled variants of the helper macros:
 *  - I915_SELFTEST_DECLARE(x) drops @x entirely,
 *  - I915_SELFTEST_ONLY(x) is the constant 0 and never evaluates @x,
 *  - I915_SELFTEST_EXPORT makes the annotated symbol static.
 */
#define I915_SELFTEST_DECLARE(x)
#define I915_SELFTEST_ONLY(x) 0
#define I915_SELFTEST_EXPORT static
|
|
|
|
#endif
|
|
|
|
/* Using the i915_selftest_ prefix becomes a little unwieldy with the helpers.
 * Instead we use the igt_ shorthand, in reference to the intel-gpu-tools
 * suite of uabi test cases (which includes a test runner for our selftests).
 */

/*
 * Declare an unsigned long called @name__ and initialise it to the current
 * jiffies value plus i915_selftest.timeout_jiffies, i.e. the deadline for
 * the running test. This expands to a declaration, so it must appear where
 * a declaration is allowed.
 */
#define IGT_TIMEOUT(name__) \
	unsigned long name__ = jiffies + i915_selftest.timeout_jiffies
|
|
|
|
/*
 * Timeout helper taking a deadline plus a printf-style message; __printf(2, 3)
 * marks argument 2 as the format string and the variadic arguments as
 * starting at position 3.
 *
 * NOTE(review): presumably returns true once @timeout (a jiffies deadline,
 * e.g. from IGT_TIMEOUT()) has passed and emits the message in that case -
 * confirm against the definition.
 */
__printf(2, 3)
bool __igt_timeout(unsigned long timeout, const char *fmt, ...);
|
|
|
|
/*
 * Convenience wrapper around __igt_timeout(): the format string is prefixed
 * with KERN_DEBUG and passed through pr_fmt() before being forwarded along
 * with any variadic arguments.
 */
#define igt_timeout(t, fmt, ...) \
	__igt_timeout((t), KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
|
|
|
|
/*
 * NOTE(review): presumably prints @len bytes starting at @buf as a hex dump -
 * confirm against the definition, which is not part of this header.
 */
void igt_hexdump(const void *buf, size_t len);
|
|
|
|
#endif /* !__I915_SELFTEST_H__ */
|