linux/drivers/gpu/drm/i915/i915_debugfs.c

792 lines
21 KiB
C
Raw Normal View History

/*
* Copyright © 2008 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
* Keith Packard <keithp@keithp.com>
*
*/
#include <linux/sched/mm.h>
#include <linux/sort.h>
drm: Split out drm_probe_helper.h Having the probe helper stuff (which pretty much everyone needs) in the drm_crtc_helper.h file (which atomic drivers should never need) is confusing. Split them out. To make sure I actually achieved the goal here I went through all drivers. And indeed, all atomic drivers are now free of drm_crtc_helper.h includes. v2: Make it compile. There was so much compile fail on arm drivers that I figured I'll better not include any of the acks on v1. v3: Massive rebase because i915 has lost a lot of drmP.h includes, but not all: Through drm_crtc_helper.h > drm_modeset_helper.h -> drmP.h there was still one, which this patch largely removes. Which means rolling out lots more includes all over. This will also conflict with ongoing drmP.h cleanup by others I expect. v3: Rebase on top of atomic bochs. v4: Review from Laurent for bridge/rcar/omap/shmob/core bits: - (re)move some of the added includes, use the better include files in other places (all suggested from Laurent adopted unchanged). - sort alphabetically v5: Actually try to sort them, and while at it, sort all the ones I touch. v6: Rebase onto i915 changes. v7: Rebase once more. Acked-by: Harry Wentland <harry.wentland@amd.com> Acked-by: Sam Ravnborg <sam@ravnborg.org> Cc: Sam Ravnborg <sam@ravnborg.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com> Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Acked-by: Benjamin Gaignard <benjamin.gaignard@linaro.org> Acked-by: Jani Nikula <jani.nikula@intel.com> Acked-by: Neil Armstrong <narmstrong@baylibre.com> Acked-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> Acked-by: CK Hu <ck.hu@mediatek.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Sam Ravnborg <sam@ravnborg.org> Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com> Acked-by: Liviu Dudau <liviu.dudau@arm.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Cc: linux-arm-kernel@lists.infradead.org Cc: virtualization@lists.linux-foundation.org Cc: etnaviv@lists.freedesktop.org Cc: linux-samsung-soc@vger.kernel.org Cc: intel-gfx@lists.freedesktop.org Cc: linux-mediatek@lists.infradead.org Cc: linux-amlogic@lists.infradead.org Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Cc: nouveau@lists.freedesktop.org Cc: spice-devel@lists.freedesktop.org Cc: amd-gfx@lists.freedesktop.org Cc: linux-renesas-soc@vger.kernel.org Cc: linux-rockchip@lists.infradead.org Cc: linux-stm32@st-md-mailman.stormreply.com Cc: linux-tegra@vger.kernel.org Cc: xen-devel@lists.xen.org Link: https://patchwork.freedesktop.org/patch/msgid/20190117210334.13234-1-daniel.vetter@ffwll.ch
2019-01-17 22:03:34 +01:00
#include <drm/drm_debugfs.h>
#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_buffer_pool.h"
#include "gt/intel_gt_clock_utils.h"
#include "gt/intel_gt_debugfs.h"
drm/i915: Defer final intel_wakeref_put to process context As we need to acquire a mutex to serialise the final intel_wakeref_put, we need to ensure that we are in process context at that time. However, we want to allow operation on the intel_wakeref from inside timer and other hardirq context, which means that need to defer that final put to a workqueue. Inside the final wakeref puts, we are safe to operate in any context, as we are simply marking up the HW and state tracking for the potential sleep. It's only the serialisation with the potential sleeping getting that requires careful wait avoidance. This allows us to retain the immediate processing as before (we only need to sleep over the same races as the current mutex_lock). v2: Add a selftest to ensure we exercise the code while lockdep watches. v3: That test was extremely loud and complained about many things! v4: Not a whale! Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=111295 References: https://bugs.freedesktop.org/show_bug.cgi?id=111245 References: https://bugs.freedesktop.org/show_bug.cgi?id=111256 Fixes: 18398904ca9e ("drm/i915: Only recover active engines") Fixes: 51fbd8de87dc ("drm/i915/pmu: Atomically acquire the gt_pm wakeref") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190808202758.10453-1-chris@chris-wilson.co.uk
2019-08-08 21:27:58 +01:00
#include "gt/intel_gt_pm.h"
#include "gt/intel_gt_pm_debugfs.h"
#include "gt/intel_gt_regs.h"
#include "gt/intel_gt_requests.h"
#include "gt/intel_rc6.h"
#include "gt/intel_reset.h"
#include "gt/intel_rps.h"
#include "gt/intel_sseu_debugfs.h"
#include "i915_debugfs.h"
#include "i915_debugfs_params.h"
#include "i915_irq.h"
#include "i915_scheduler.h"
#include "intel_pm.h"
static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
{
return to_i915(node->minor->dev);
}
static int i915_capabilities(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct drm_printer p = drm_seq_file_printer(m);
seq_printf(m, "pch: %d\n", INTEL_PCH_TYPE(i915));
intel_device_info_print_static(INTEL_INFO(i915), &p);
intel_device_info_print_runtime(RUNTIME_INFO(i915), &p);
drm/i915: Use per device iommu check With both integrated and discrete Intel GPUs in a system, the current global check of intel_iommu_gfx_mapped, as done from intel_vtd_active() may not be completely accurate. In this patch we add i915 parameter to intel_vtd_active() in order to prepare it for multiple GPUs and we also change the check away from Intel specific intel_iommu_gfx_mapped (global exported by the Intel IOMMU driver) to probing the presence of IOMMU on a specific device using device_iommu_mapped(). This will return true both for IOMMU pass-through and address translation modes which matches the current behaviour. If in the future we wanted to distinguish between these two modes we could either use iommu_get_domain_for_dev() and check for __IOMMU_DOMAIN_PAGING bit indicating address translation, or ask for a new API to be exported from the IOMMU core code. v2: * Check for dmar translation specifically, not just iommu domain. (Baolu) v3: * Go back to plain "any domain" check for now, rewrite commit message. v4: * Use device_iommu_mapped. (Robin, Baolu) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Lu Baolu <baolu.lu@linux.intel.com> Cc: Lucas De Marchi <lucas.demarchi@intel.com> Cc: Robin Murphy <robin.murphy@arm.com> Acked-by: Robin Murphy <robin.murphy@arm.com> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com> Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20211126141424.493753-1-tvrtko.ursulin@linux.intel.com
2021-11-26 14:14:24 +00:00
i915_print_iommu_status(i915, &p);
intel_gt_info_print(&to_gt(i915)->info, &p);
intel_driver_caps_print(&i915->caps, &p);
kernel_param_lock(THIS_MODULE);
i915_params_dump(&i915->params, &p);
kernel_param_unlock(THIS_MODULE);
return 0;
}
static char get_tiling_flag(struct drm_i915_gem_object *obj)
{
switch (i915_gem_object_get_tiling(obj)) {
default:
case I915_TILING_NONE: return ' ';
case I915_TILING_X: return 'X';
case I915_TILING_Y: return 'Y';
}
}
static char get_global_flag(struct drm_i915_gem_object *obj)
{
return READ_ONCE(obj->userfault_count) ? 'g' : ' ';
}
static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
{
return obj->mm.mapping ? 'M' : ' ';
}
static const char *
stringify_page_sizes(unsigned int page_sizes, char *buf, size_t len)
{
size_t x = 0;
switch (page_sizes) {
case 0:
return "";
case I915_GTT_PAGE_SIZE_4K:
return "4K";
case I915_GTT_PAGE_SIZE_64K:
return "64K";
case I915_GTT_PAGE_SIZE_2M:
return "2M";
default:
if (!buf)
return "M";
if (page_sizes & I915_GTT_PAGE_SIZE_2M)
x += snprintf(buf + x, len - x, "2M, ");
if (page_sizes & I915_GTT_PAGE_SIZE_64K)
x += snprintf(buf + x, len - x, "64K, ");
if (page_sizes & I915_GTT_PAGE_SIZE_4K)
x += snprintf(buf + x, len - x, "4K, ");
buf[x-2] = '\0';
return buf;
}
}
static const char *stringify_vma_type(const struct i915_vma *vma)
{
if (i915_vma_is_ggtt(vma))
return "ggtt";
if (i915_vma_is_dpt(vma))
return "dpt";
return "ppgtt";
}
static const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
{
switch (type) {
case I915_CACHE_NONE: return " uncached";
case I915_CACHE_LLC: return HAS_LLC(i915) ? " LLC" : " snooped";
case I915_CACHE_L3_LLC: return " L3+LLC";
case I915_CACHE_WT: return " WT";
default: return "";
}
}
void
i915_debugfs_describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
{
drm/i915: Implement inter-engine read-read optimisations Currently, we only track the last request globally across all engines. This prevents us from issuing concurrent read requests on e.g. the RCS and BCS engines (or more likely the render and media engines). Without semaphores, we incur costly stalls as we synchronise between rings - greatly impacting the current performance of Broadwell versus Haswell in certain workloads (like video decode). With the introduction of reference counted requests, it is much easier to track the last request per ring, as well as the last global write request so that we can optimise inter-engine read read requests (as well as better optimise certain CPU waits). v2: Fix inverted readonly condition for nonblocking waits. v3: Handle non-continguous engine array after waits v4: Rebase, tidy, rewrite ring list debugging v5: Use obj->active as a bitfield, it looks cool v6: Micro-optimise, mostly involving moving code around v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf) v8: Rebase v9: Refactor i915_gem_object_sync() to allow the compiler to better optimise it. Benchmark: igt/gem_read_read_speed hsw:gt3e (with semaphores): Before: Time to read-read 1024k: 275.794µs After: Time to read-read 1024k: 123.260µs hsw:gt3e (w/o semaphores): Before: Time to read-read 1024k: 230.433µs After: Time to read-read 1024k: 124.593µs bdw-u (w/o semaphores): Before After Time to read-read 1x1: 26.274µs 10.350µs Time to read-read 128x128: 40.097µs 21.366µs Time to read-read 256x256: 77.087µs 42.608µs Time to read-read 512x512: 281.999µs 181.155µs Time to read-read 1024x1024: 1196.141µs 1118.223µs Time to read-read 2048x2048: 5639.072µs 5225.837µs Time to read-read 4096x4096: 22401.662µs 21137.067µs Time to read-read 8192x8192: 89617.735µs 85637.681µs Testcase: igt/gem_concurrent_blit (read-read and friends) Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8] [danvet: s/\<rq\>/req/g] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-27 13:41:17 +01:00
struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct i915_vma *vma;
int pin_count = 0;
drm/i915: Replace obj->pin_global with obj->frontbuffer obj->pin_global was originally used as a means to keep the shrinker off the active scanout, but we use the vma->pin_count itself for that and the obj->frontbuffer to delay shrinking active framebuffers. The other role that obj->pin_global gained was for spotting display objects inside GEM and working harder to keep those coherent; for which we can again simply inspect obj->frontbuffer directly. Coming up next, we will want to manipulate the pin_global counter outside of the principle locks, so would need to make pin_global atomic. However, since obj->frontbuffer is already managed atomically, it makes sense to use that the primary key for display objects instead of having pin_global. Ville pointed out the principle difference is that obj->frontbuffer is set for as long as an intel_framebuffer is attached to an object, but obj->pin_global was only raised for as long as the object was active. In practice, this means that we consider the object as being on the scanout for longer than is strictly required, causing us to be more proactive in flushing -- though it should be true that we would have flushed eventually when the back became the front, except that on the flip path that flush is async but when hit from another ioctl it will be synchronous. v2: i915_gem_object_is_framebuffer() Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190902040303.14195-5-chris@chris-wilson.co.uk
2019-09-02 05:02:47 +01:00
seq_printf(m, "%pK: %c%c%c %8zdKiB %02x %02x %s%s%s",
&obj->base,
get_tiling_flag(obj),
get_global_flag(obj),
get_pin_mapped_flag(obj),
obj->base.size / 1024,
obj->read_domains,
obj->write_domain,
i915_cache_level_str(dev_priv, obj->cache_level),
obj->mm.dirty ? " dirty" : "",
obj->mm.madv == I915_MADV_DONTNEED ? " purgeable" : "");
if (obj->base.name)
seq_printf(m, " (name: %d)", obj->base.name);
spin_lock(&obj->vma.lock);
list_for_each_entry(vma, &obj->vma.list, obj_link) {
if (!drm_mm_node_allocated(&vma->node))
continue;
spin_unlock(&obj->vma.lock);
if (i915_vma_is_pinned(vma))
pin_count++;
seq_printf(m, " (%s offset: %08llx, size: %08llx, pages: %s",
stringify_vma_type(vma),
vma->node.start, vma->node.size,
stringify_page_sizes(vma->page_sizes.gtt, NULL, 0));
if (i915_vma_is_ggtt(vma) || i915_vma_is_dpt(vma)) {
switch (vma->ggtt_view.type) {
case I915_GGTT_VIEW_NORMAL:
seq_puts(m, ", normal");
break;
case I915_GGTT_VIEW_PARTIAL:
seq_printf(m, ", partial [%08llx+%x]",
vma->ggtt_view.partial.offset << PAGE_SHIFT,
vma->ggtt_view.partial.size << PAGE_SHIFT);
break;
case I915_GGTT_VIEW_ROTATED:
seq_printf(m, ", rotated [(%ux%u, src_stride=%u, dst_stride=%u, offset=%u), (%ux%u, src_stride=%u, dst_stride=%u, offset=%u)]",
vma->ggtt_view.rotated.plane[0].width,
vma->ggtt_view.rotated.plane[0].height,
vma->ggtt_view.rotated.plane[0].src_stride,
vma->ggtt_view.rotated.plane[0].dst_stride,
vma->ggtt_view.rotated.plane[0].offset,
vma->ggtt_view.rotated.plane[1].width,
vma->ggtt_view.rotated.plane[1].height,
vma->ggtt_view.rotated.plane[1].src_stride,
vma->ggtt_view.rotated.plane[1].dst_stride,
vma->ggtt_view.rotated.plane[1].offset);
break;
case I915_GGTT_VIEW_REMAPPED:
seq_printf(m, ", remapped [(%ux%u, src_stride=%u, dst_stride=%u, offset=%u), (%ux%u, src_stride=%u, dst_stride=%u, offset=%u)]",
vma->ggtt_view.remapped.plane[0].width,
vma->ggtt_view.remapped.plane[0].height,
vma->ggtt_view.remapped.plane[0].src_stride,
vma->ggtt_view.remapped.plane[0].dst_stride,
vma->ggtt_view.remapped.plane[0].offset,
vma->ggtt_view.remapped.plane[1].width,
vma->ggtt_view.remapped.plane[1].height,
vma->ggtt_view.remapped.plane[1].src_stride,
vma->ggtt_view.remapped.plane[1].dst_stride,
vma->ggtt_view.remapped.plane[1].offset);
break;
default:
MISSING_CASE(vma->ggtt_view.type);
break;
}
}
if (vma->fence)
seq_printf(m, " , fence: %d", vma->fence->id);
seq_puts(m, ")");
spin_lock(&obj->vma.lock);
}
spin_unlock(&obj->vma.lock);
seq_printf(m, " (pinned x %d)", pin_count);
if (i915_gem_object_is_stolen(obj))
seq_printf(m, " (stolen: %08llx)", obj->stolen->start);
drm/i915: Replace obj->pin_global with obj->frontbuffer obj->pin_global was originally used as a means to keep the shrinker off the active scanout, but we use the vma->pin_count itself for that and the obj->frontbuffer to delay shrinking active framebuffers. The other role that obj->pin_global gained was for spotting display objects inside GEM and working harder to keep those coherent; for which we can again simply inspect obj->frontbuffer directly. Coming up next, we will want to manipulate the pin_global counter outside of the principle locks, so would need to make pin_global atomic. However, since obj->frontbuffer is already managed atomically, it makes sense to use that the primary key for display objects instead of having pin_global. Ville pointed out the principle difference is that obj->frontbuffer is set for as long as an intel_framebuffer is attached to an object, but obj->pin_global was only raised for as long as the object was active. In practice, this means that we consider the object as being on the scanout for longer than is strictly required, causing us to be more proactive in flushing -- though it should be true that we would have flushed eventually when the back became the front, except that on the flip path that flush is async but when hit from another ioctl it will be synchronous. v2: i915_gem_object_is_framebuffer() Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190902040303.14195-5-chris@chris-wilson.co.uk
2019-09-02 05:02:47 +01:00
if (i915_gem_object_is_framebuffer(obj))
seq_printf(m, " (fb)");
}
static int i915_gem_object_info(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct drm_printer p = drm_seq_file_printer(m);
struct intel_memory_region *mr;
enum intel_region_id id;
seq_printf(m, "%u shrinkable [%u free] objects, %llu bytes\n",
i915->mm.shrink_count,
atomic_read(&i915->mm.free_count),
i915->mm.shrink_memory);
for_each_memory_region(mr, i915, id)
intel_memory_region_debug(mr, &p);
return 0;
}
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
size_t count, loff_t *pos)
{
struct i915_gpu_coredump *error;
ssize_t ret;
void *buf;
error = file->private_data;
if (!error)
return 0;
/* Bounce buffer required because of kernfs __user API convenience. */
buf = kmalloc(count, GFP_KERNEL);
if (!buf)
return -ENOMEM;
ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count);
if (ret <= 0)
goto out;
if (!copy_to_user(ubuf, buf, ret))
*pos += ret;
else
ret = -EFAULT;
out:
kfree(buf);
return ret;
}
static int gpu_state_release(struct inode *inode, struct file *file)
{
i915_gpu_coredump_put(file->private_data);
return 0;
}
static int i915_gpu_info_open(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct i915_gpu_coredump *gpu;
intel_wakeref_t wakeref;
gpu = NULL;
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES);
if (IS_ERR(gpu))
return PTR_ERR(gpu);
file->private_data = gpu;
return 0;
}
static const struct file_operations i915_gpu_info_fops = {
.owner = THIS_MODULE,
.open = i915_gpu_info_open,
.read = gpu_state_read,
.llseek = default_llseek,
.release = gpu_state_release,
};
static ssize_t
i915_error_state_write(struct file *filp,
const char __user *ubuf,
size_t cnt,
loff_t *ppos)
{
struct i915_gpu_coredump *error = filp->private_data;
if (!error)
return 0;
drm_dbg(&error->i915->drm, "Resetting error state\n");
i915_reset_error_state(error->i915);
return cnt;
}
static int i915_error_state_open(struct inode *inode, struct file *file)
{
struct i915_gpu_coredump *error;
error = i915_first_error_state(inode->i_private);
if (IS_ERR(error))
return PTR_ERR(error);
file->private_data = error;
return 0;
}
static const struct file_operations i915_error_state_fops = {
.owner = THIS_MODULE,
.open = i915_error_state_open,
.read = gpu_state_read,
.write = i915_error_state_write,
.llseek = default_llseek,
.release = gpu_state_release,
};
#endif
static int i915_frequency_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_gt *gt = to_gt(i915);
struct drm_printer p = drm_seq_file_printer(m);
intel_gt_pm_frequency_dump(gt, &p);
return 0;
}
static const char *swizzle_string(unsigned swizzle)
{
switch (swizzle) {
case I915_BIT_6_SWIZZLE_NONE:
return "none";
case I915_BIT_6_SWIZZLE_9:
return "bit9";
case I915_BIT_6_SWIZZLE_9_10:
return "bit9/bit10";
case I915_BIT_6_SWIZZLE_9_11:
return "bit9/bit11";
case I915_BIT_6_SWIZZLE_9_10_11:
return "bit9/bit10/bit11";
case I915_BIT_6_SWIZZLE_9_17:
return "bit9/bit17";
case I915_BIT_6_SWIZZLE_9_10_17:
return "bit9/bit10/bit17";
case I915_BIT_6_SWIZZLE_UNKNOWN:
return "unknown";
}
return "bug";
}
static int i915_swizzle_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_uncore *uncore = &dev_priv->uncore;
intel_wakeref_t wakeref;
seq_printf(m, "bit6 swizzle for X-tiling = %s\n",
swizzle_string(dev_priv->ggtt.bit_6_swizzle_x));
seq_printf(m, "bit6 swizzle for Y-tiling = %s\n",
swizzle_string(dev_priv->ggtt.bit_6_swizzle_y));
if (dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES)
seq_puts(m, "L-shaped memory detected\n");
/* On BDW+, swizzling is not used. See detect_bit_6_swizzle() */
if (GRAPHICS_VER(dev_priv) >= 8 || IS_VALLEYVIEW(dev_priv))
return 0;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
if (IS_GRAPHICS_VER(dev_priv, 3, 4)) {
seq_printf(m, "DDC = 0x%08x\n",
intel_uncore_read(uncore, DCC));
seq_printf(m, "DDC2 = 0x%08x\n",
intel_uncore_read(uncore, DCC2));
seq_printf(m, "C0DRB3 = 0x%04x\n",
intel_uncore_read16(uncore, C0DRB3_BW));
seq_printf(m, "C1DRB3 = 0x%04x\n",
intel_uncore_read16(uncore, C1DRB3_BW));
} else if (GRAPHICS_VER(dev_priv) >= 6) {
seq_printf(m, "MAD_DIMM_C0 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C0));
seq_printf(m, "MAD_DIMM_C1 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C1));
seq_printf(m, "MAD_DIMM_C2 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C2));
seq_printf(m, "TILECTL = 0x%08x\n",
intel_uncore_read(uncore, TILECTL));
if (GRAPHICS_VER(dev_priv) >= 8)
seq_printf(m, "GAMTARBMODE = 0x%08x\n",
intel_uncore_read(uncore, GAMTARBMODE));
else
seq_printf(m, "ARB_MODE = 0x%08x\n",
intel_uncore_read(uncore, ARB_MODE));
seq_printf(m, "DISP_ARB_CTL = 0x%08x\n",
intel_uncore_read(uncore, DISP_ARB_CTL));
}
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
static int i915_rps_boost_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_rps *rps = &to_gt(dev_priv)->rps;
seq_printf(m, "RPS enabled? %s\n", yesno(intel_rps_is_enabled(rps)));
seq_printf(m, "RPS active? %s\n", yesno(intel_rps_is_active(rps)));
seq_printf(m, "GPU busy? %s\n", yesno(to_gt(dev_priv)->awake));
drm/i915: Avoid keeping waitboost active for signaling threads Once a client has requested a waitboost, we keep that waitboost active until all clients are no longer waiting. This is because we don't distinguish which waiter deserves the boost. However, with the advent of fence signaling, the signaler threads appear as waiters to the RPS interrupt handler. So instead of using a single boolean to track when to keep the waitboost active, use a counter of all outstanding waitboosted requests. At this point, I have removed all vestiges of the rate limiting on clients. Whilst this means that compositors should remain more fluid, it also means that boosts are more prevalent. See commit b29c19b64528 ("drm/i915: Boost RPS frequency for CPU stalls") for a longer discussion on the pros and cons of both approaches. A drawback of this implementation is that it requires constant request submission to keep the waitboost trimmed (as it is now cancelled when the request is completed). This will be fine for a busy system, but near idle the boosts may be kept for longer than desired (effectively tens of vblanks worstcase) and there is a reliance on rc6 instead. v2: Remove defunct rps.client_lock Reported-by: Michał Winiarski <michal.winiarski@intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Michał Winiarski <michal.winiarski@intel.com> Reviewed-by: Michał Winiarski <michal.winiarski@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170628123548.9236-1-chris@chris-wilson.co.uk
2017-06-28 13:35:48 +01:00
seq_printf(m, "Boosts outstanding? %d\n",
atomic_read(&rps->num_waiters));
drm/i915: Interactive RPS mode RPS provides a feedback loop where we use the load during the previous evaluation interval to decide whether to up or down clock the GPU frequency. Our responsiveness is split into 3 regimes, a high and low plateau with the intent to keep the gpu clocked high to cover occasional stalls under high load, and low despite occasional glitches under steady low load, and inbetween. However, we run into situations like kodi where we want to stay at low power (video decoding is done efficiently inside the fixed function HW and doesn't need high clocks even for high bitrate streams), but just occasionally the pipeline is more complex than a video decode and we need a smidgen of extra GPU power to present on time. In the high power regime, we sample at sub frame intervals with a bias to upclocking, and conversely at low power we sample over a few frames worth to provide what we consider to be the right levels of responsiveness respectively. At low power, we more or less expect to be kicked out to high power at the start of a busy sequence by waitboosting. Prior to commit e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") whenever we missed the frame or stalled, we would immediate go full throttle and upclock the GPU to max. But in commit e9af4ea2b9e7, we relaxed the waitboosting to only apply if the pipeline was deep to avoid over-committing resources for a near miss. Sadly though, a near miss is still a miss, and perceptible as jitter in the frame delivery. To try and prevent the near miss before having to resort to boosting after the fact, we use the pageflip queue as an indication that we are in an "interactive" regime and so should sample the load more frequently to provide power before the frame misses it vblank. This will make us more favorable to providing a small power increase (one or two bins) as required rather than going all the way to maximum and then having to work back down again. (We still keep the waitboosting mechanism around just in case a dramatic change in system load requires urgent uplocking, faster than we can provide in a few evaluation intervals.) v2: Reduce rps_set_interactive to a boolean parameter to avoid the confusion of what if they wanted a new power mode after pinning to a different mode (which to choose?) v3: Only reprogram RPS while the GT is awake, it will be set when we wake the GT, and while off warns about being used outside of rpm. v4: Fix deferred application of interactive mode v5: s/state/interactive/ v6: Group the mutex with its principle in a substruct Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107111 Fixes: e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Radoslaw Szwichtenberg <radoslaw.szwichtenberg@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180731132629.3381-1-chris@chris-wilson.co.uk
2018-07-31 14:26:29 +01:00
seq_printf(m, "Interactive? %d\n", READ_ONCE(rps->power.interactive));
seq_printf(m, "Frequency requested %d, actual %d\n",
intel_gpu_freq(rps, rps->cur_freq),
intel_rps_read_actual_frequency(rps));
seq_printf(m, " min hard:%d, soft:%d; max soft:%d, hard:%d\n",
intel_gpu_freq(rps, rps->min_freq),
intel_gpu_freq(rps, rps->min_freq_softlimit),
intel_gpu_freq(rps, rps->max_freq_softlimit),
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, " idle:%d, efficient:%d, boost:%d\n",
intel_gpu_freq(rps, rps->idle_freq),
intel_gpu_freq(rps, rps->efficient_freq),
intel_gpu_freq(rps, rps->boost_freq));
seq_printf(m, "Wait boosts: %d\n", READ_ONCE(rps->boosts));
return 0;
}
static int i915_runtime_pm_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct pci_dev *pdev = to_pci_dev(dev_priv->drm.dev);
if (!HAS_RUNTIME_PM(dev_priv))
seq_puts(m, "Runtime power management not supported\n");
seq_printf(m, "Runtime power status: %s\n",
enableddisabled(!dev_priv->power_domains.init_wakeref));
seq_printf(m, "GPU idle: %s\n", yesno(!to_gt(dev_priv)->awake));
seq_printf(m, "IRQs disabled: %s\n",
yesno(!intel_irqs_enabled(dev_priv)));
#ifdef CONFIG_PM
seq_printf(m, "Usage count: %d\n",
atomic_read(&dev_priv->drm.dev->power.usage_count));
#else
seq_printf(m, "Device Power Management (CONFIG_PM) disabled\n");
#endif
seq_printf(m, "PCI device power state: %s [%d]\n",
pci_power_name(pdev->current_state),
pdev->current_state);
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_RUNTIME_PM)) {
struct drm_printer p = drm_seq_file_printer(m);
print_intel_runtime_pm_wakeref(&dev_priv->runtime_pm, &p);
}
return 0;
}
static int i915_engine_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_engine_cs *engine;
intel_wakeref_t wakeref;
struct drm_printer p;
wakeref = intel_runtime_pm_get(&i915->runtime_pm);
seq_printf(m, "GT awake? %s [%d], %llums\n",
yesno(to_gt(i915)->awake),
atomic_read(&to_gt(i915)->wakeref.count),
ktime_to_ms(intel_gt_get_awake_time(to_gt(i915))));
seq_printf(m, "CS timestamp frequency: %u Hz, %d ns\n",
to_gt(i915)->clock_frequency,
to_gt(i915)->clock_period_ns);
p = drm_seq_file_printer(m);
for_each_uabi_engine(engine, i915)
intel_engine_dump(engine, &p, "%s\n", engine->name);
intel_gt_show_timelines(to_gt(i915), &p, i915_request_show_with_schedule);
intel_runtime_pm_put(&i915->runtime_pm, wakeref);
return 0;
}
static int i915_wa_registers(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_engine_cs *engine;
for_each_uabi_engine(engine, i915) {
const struct i915_wa_list *wal = &engine->ctx_wa_list;
const struct i915_wa *wa;
unsigned int count;
count = wal->count;
if (!count)
continue;
seq_printf(m, "%s: Workarounds applied: %u\n",
engine->name, count);
for (wa = wal->list; count--; wa++)
seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X\n",
i915_mmio_reg_offset(wa->reg),
wa->set, wa->clr);
seq_printf(m, "\n");
}
return 0;
}
static int i915_wedged_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
return intel_gt_debugfs_reset_show(to_gt(i915), val);
}
static int i915_wedged_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
return intel_gt_debugfs_reset_store(to_gt(i915), val);
}
DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
static int
i915_perf_noa_delay_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
/*
* This would lead to infinite waits as we're doing timestamp
* difference on the CS with only 32bits.
*/
if (intel_gt_ns_to_clock_interval(to_gt(i915), val) > U32_MAX)
return -EINVAL;
atomic64_set(&i915->perf.noa_programming_delay, val);
return 0;
}
static int
i915_perf_noa_delay_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
*val = atomic64_read(&i915->perf.noa_programming_delay);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
i915_perf_noa_delay_get,
i915_perf_noa_delay_set,
"%llu\n");
#define DROP_UNBOUND BIT(0)
#define DROP_BOUND BIT(1)
#define DROP_RETIRE BIT(2)
#define DROP_ACTIVE BIT(3)
#define DROP_FREED BIT(4)
#define DROP_SHRINK_ALL BIT(5)
#define DROP_IDLE BIT(6)
#define DROP_RESET_ACTIVE BIT(7)
#define DROP_RESET_SEQNO BIT(8)
#define DROP_RCU BIT(9)
#define DROP_ALL (DROP_UNBOUND | \
DROP_BOUND | \
DROP_RETIRE | \
DROP_ACTIVE | \
DROP_FREED | \
DROP_SHRINK_ALL |\
DROP_IDLE | \
DROP_RESET_ACTIVE | \
DROP_RESET_SEQNO | \
DROP_RCU)
static int
i915_drop_caches_get(void *data, u64 *val)
{
*val = DROP_ALL;
return 0;
}
static int
gt_drop_caches(struct intel_gt *gt, u64 val)
{
int ret;
if (val & DROP_RESET_ACTIVE &&
wait_for(intel_engines_are_idle(gt), I915_IDLE_ENGINES_TIMEOUT))
intel_gt_set_wedged(gt);
if (val & DROP_RETIRE)
intel_gt_retire_requests(gt);
if (val & (DROP_IDLE | DROP_ACTIVE)) {
ret = intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
}
if (val & DROP_IDLE) {
ret = intel_gt_pm_wait_for_idle(gt);
if (ret)
return ret;
}
if (val & DROP_RESET_ACTIVE && intel_gt_terminally_wedged(gt))
intel_gt_handle_error(gt, ALL_ENGINES, 0, NULL);
if (val & DROP_FREED)
intel_gt_flush_buffer_pool(gt);
return 0;
}
static int
i915_drop_caches_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
drm/i915/debugfs: add noreclaim annotations We have a debugfs hook to directly call into i915_gem_shrink() with the fs_reclaim acquire annotations to simulate hitting direct reclaim. However we should also annotate this with memalloc_noreclaim, which will set PF_MEMALLOC for us on the current context, to ensure we can't re-enter direct reclaim(just like "real" direct reclaim does). This is an issue now that ttm_bo_validate could potentially be called here, which might try to allocate a tiny amount of memory to hold the new ttm_resource struct, as per the below splat: [ 2507.913844] WARNING: possible recursive locking detected [ 2507.913848] 5.16.0-rc4+ #5 Tainted: G U [ 2507.913853] -------------------------------------------- [ 2507.913856] gem_exec_captur/1825 is trying to acquire lock: [ 2507.913861] ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: kmem_cache_alloc_trace+0x30/0x390 [ 2507.913875] but task is already holding lock: [ 2507.913879] ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: i915_drop_caches_set+0x1c9/0x2c0 [i915] [ 2507.913962] other info that might help us debug this: [ 2507.913966] Possible unsafe locking scenario: [ 2507.913970] CPU0 [ 2507.913973] ---- [ 2507.913975] lock(fs_reclaim); [ 2507.913979] lock(fs_reclaim); [ 2507.913983] DEADLOCK *** [ 2507.913988] May be due to missing lock nesting notation [ 2507.913992] 4 locks held by gem_exec_captur/1825: [ 2507.913997] #0: ffff888101f6e460 (sb_writers#17){..}-{0:0}, at: ksys_write+0xe9/0x1b0 [ 2507.914009] #1: ffff88812d99e2b8 (&attr->mutex){..}-{3:3}, at: simple_attr_write+0xbb/0x220 [ 2507.914019] #2: ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: i915_drop_caches_set+0x1c9/0x2c0 [i915] [ 2507.914085] #3: ffff8881b4a11b20 (reservation_ww_class_mutex){..}-{3:3}, at: ww_mutex_trylock+0x43f/0xcb0 [ 2507.914097] stack backtrace: [ 2507.914102] CPU: 0 PID: 1825 Comm: gem_exec_captur Tainted: G U 5.16.0-rc4+ #5 [ 2507.914109] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 0403 01/26/2021 [ 2507.914115] Call Trace: [ 2507.914118] <TASK> [ 2507.914121] dump_stack_lvl+0x59/0x73 [ 2507.914128] __lock_acquire.cold+0x227/0x3b0 [ 2507.914135] ? lockdep_hardirqs_on_prepare+0x410/0x410 [ 2507.914141] ? __lock_acquire+0x23ca/0x5000 [ 2507.914147] lock_acquire+0x19c/0x4b0 [ 2507.914152] ? kmem_cache_alloc_trace+0x30/0x390 [ 2507.914157] ? lock_release+0x690/0x690 [ 2507.914163] ? lock_is_held_type+0xe4/0x140 [ 2507.914170] ? ttm_sys_man_alloc+0x47/0xb0 [ttm] [ 2507.914178] fs_reclaim_acquire+0x11a/0x160 [ 2507.914183] ? kmem_cache_alloc_trace+0x30/0x390 [ 2507.914188] kmem_cache_alloc_trace+0x30/0x390 [ 2507.914192] ? lock_release+0x37f/0x690 [ 2507.914198] ttm_sys_man_alloc+0x47/0xb0 [ttm] [ 2507.914206] ttm_bo_pipeline_gutting+0x70/0x440 [ttm] [ 2507.914214] ? ttm_mem_io_free+0x150/0x150 [ttm] [ 2507.914221] ? lock_is_held_type+0xe4/0x140 [ 2507.914227] ttm_bo_validate+0x2fb/0x370 [ttm] [ 2507.914234] ? lock_acquire+0x19c/0x4b0 [ 2507.914239] ? ttm_bo_bounce_temp_buffer.constprop.0+0xf0/0xf0 [ttm] [ 2507.914246] ? lock_acquire+0x131/0x4b0 [ 2507.914251] ? lock_is_held_type+0xe4/0x140 [ 2507.914257] i915_ttm_shrinker_release_pages+0x2bc/0x490 [i915] [ 2507.914339] ? i915_ttm_swap_notify+0x130/0x130 [i915] [ 2507.914429] ? i915_gem_object_release_mmap_offset+0x32/0x250 [i915] [ 2507.914529] i915_gem_shrink+0xb14/0x1290 [i915] [ 2507.914616] ? ___i915_gem_object_make_shrinkable+0x3e0/0x3e0 [i915] [ 2507.914698] ? _raw_spin_unlock_irqrestore+0x2d/0x60 [ 2507.914705] ? track_intel_runtime_pm_wakeref+0x180/0x230 [i915] [ 2507.914777] i915_gem_shrink_all+0x4b/0x70 [i915] [ 2507.914857] i915_drop_caches_set+0x227/0x2c0 [i915] Reported-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20211213125530.3960007-1-matthew.auld@intel.com
2021-12-13 12:55:30 +00:00
unsigned int flags;
int ret;
DRM_DEBUG("Dropping caches: 0x%08llx [0x%08llx]\n",
val, val & DROP_ALL);
ret = gt_drop_caches(to_gt(i915), val);
if (ret)
return ret;
fs_reclaim_acquire(GFP_KERNEL);
drm/i915/debugfs: add noreclaim annotations We have a debugfs hook to directly call into i915_gem_shrink() with the fs_reclaim acquire annotations to simulate hitting direct reclaim. However we should also annotate this with memalloc_noreclaim, which will set PF_MEMALLOC for us on the current context, to ensure we can't re-enter direct reclaim(just like "real" direct reclaim does). This is an issue now that ttm_bo_validate could potentially be called here, which might try to allocate a tiny amount of memory to hold the new ttm_resource struct, as per the below splat: [ 2507.913844] WARNING: possible recursive locking detected [ 2507.913848] 5.16.0-rc4+ #5 Tainted: G U [ 2507.913853] -------------------------------------------- [ 2507.913856] gem_exec_captur/1825 is trying to acquire lock: [ 2507.913861] ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: kmem_cache_alloc_trace+0x30/0x390 [ 2507.913875] but task is already holding lock: [ 2507.913879] ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: i915_drop_caches_set+0x1c9/0x2c0 [i915] [ 2507.913962] other info that might help us debug this: [ 2507.913966] Possible unsafe locking scenario: [ 2507.913970] CPU0 [ 2507.913973] ---- [ 2507.913975] lock(fs_reclaim); [ 2507.913979] lock(fs_reclaim); [ 2507.913983] DEADLOCK *** [ 2507.913988] May be due to missing lock nesting notation [ 2507.913992] 4 locks held by gem_exec_captur/1825: [ 2507.913997] #0: ffff888101f6e460 (sb_writers#17){..}-{0:0}, at: ksys_write+0xe9/0x1b0 [ 2507.914009] #1: ffff88812d99e2b8 (&attr->mutex){..}-{3:3}, at: simple_attr_write+0xbb/0x220 [ 2507.914019] #2: ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: i915_drop_caches_set+0x1c9/0x2c0 [i915] [ 2507.914085] #3: ffff8881b4a11b20 (reservation_ww_class_mutex){..}-{3:3}, at: ww_mutex_trylock+0x43f/0xcb0 [ 2507.914097] stack backtrace: [ 2507.914102] CPU: 0 PID: 1825 Comm: gem_exec_captur Tainted: G U 5.16.0-rc4+ #5 [ 2507.914109] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 0403 01/26/2021 [ 2507.914115] Call Trace: [ 2507.914118] <TASK> [ 2507.914121] dump_stack_lvl+0x59/0x73 [ 2507.914128] __lock_acquire.cold+0x227/0x3b0 [ 2507.914135] ? lockdep_hardirqs_on_prepare+0x410/0x410 [ 2507.914141] ? __lock_acquire+0x23ca/0x5000 [ 2507.914147] lock_acquire+0x19c/0x4b0 [ 2507.914152] ? kmem_cache_alloc_trace+0x30/0x390 [ 2507.914157] ? lock_release+0x690/0x690 [ 2507.914163] ? lock_is_held_type+0xe4/0x140 [ 2507.914170] ? ttm_sys_man_alloc+0x47/0xb0 [ttm] [ 2507.914178] fs_reclaim_acquire+0x11a/0x160 [ 2507.914183] ? kmem_cache_alloc_trace+0x30/0x390 [ 2507.914188] kmem_cache_alloc_trace+0x30/0x390 [ 2507.914192] ? lock_release+0x37f/0x690 [ 2507.914198] ttm_sys_man_alloc+0x47/0xb0 [ttm] [ 2507.914206] ttm_bo_pipeline_gutting+0x70/0x440 [ttm] [ 2507.914214] ? ttm_mem_io_free+0x150/0x150 [ttm] [ 2507.914221] ? lock_is_held_type+0xe4/0x140 [ 2507.914227] ttm_bo_validate+0x2fb/0x370 [ttm] [ 2507.914234] ? lock_acquire+0x19c/0x4b0 [ 2507.914239] ? ttm_bo_bounce_temp_buffer.constprop.0+0xf0/0xf0 [ttm] [ 2507.914246] ? lock_acquire+0x131/0x4b0 [ 2507.914251] ? lock_is_held_type+0xe4/0x140 [ 2507.914257] i915_ttm_shrinker_release_pages+0x2bc/0x490 [i915] [ 2507.914339] ? i915_ttm_swap_notify+0x130/0x130 [i915] [ 2507.914429] ? i915_gem_object_release_mmap_offset+0x32/0x250 [i915] [ 2507.914529] i915_gem_shrink+0xb14/0x1290 [i915] [ 2507.914616] ? ___i915_gem_object_make_shrinkable+0x3e0/0x3e0 [i915] [ 2507.914698] ? _raw_spin_unlock_irqrestore+0x2d/0x60 [ 2507.914705] ? track_intel_runtime_pm_wakeref+0x180/0x230 [i915] [ 2507.914777] i915_gem_shrink_all+0x4b/0x70 [i915] [ 2507.914857] i915_drop_caches_set+0x227/0x2c0 [i915] Reported-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20211213125530.3960007-1-matthew.auld@intel.com
2021-12-13 12:55:30 +00:00
flags = memalloc_noreclaim_save();
if (val & DROP_BOUND)
i915_gem_shrink(NULL, i915, LONG_MAX, NULL, I915_SHRINK_BOUND);
if (val & DROP_UNBOUND)
i915_gem_shrink(NULL, i915, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
if (val & DROP_SHRINK_ALL)
i915_gem_shrink_all(i915);
drm/i915/debugfs: add noreclaim annotations We have a debugfs hook to directly call into i915_gem_shrink() with the fs_reclaim acquire annotations to simulate hitting direct reclaim. However we should also annotate this with memalloc_noreclaim, which will set PF_MEMALLOC for us on the current context, to ensure we can't re-enter direct reclaim(just like "real" direct reclaim does). This is an issue now that ttm_bo_validate could potentially be called here, which might try to allocate a tiny amount of memory to hold the new ttm_resource struct, as per the below splat: [ 2507.913844] WARNING: possible recursive locking detected [ 2507.913848] 5.16.0-rc4+ #5 Tainted: G U [ 2507.913853] -------------------------------------------- [ 2507.913856] gem_exec_captur/1825 is trying to acquire lock: [ 2507.913861] ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: kmem_cache_alloc_trace+0x30/0x390 [ 2507.913875] but task is already holding lock: [ 2507.913879] ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: i915_drop_caches_set+0x1c9/0x2c0 [i915] [ 2507.913962] other info that might help us debug this: [ 2507.913966] Possible unsafe locking scenario: [ 2507.913970] CPU0 [ 2507.913973] ---- [ 2507.913975] lock(fs_reclaim); [ 2507.913979] lock(fs_reclaim); [ 2507.913983] DEADLOCK *** [ 2507.913988] May be due to missing lock nesting notation [ 2507.913992] 4 locks held by gem_exec_captur/1825: [ 2507.913997] #0: ffff888101f6e460 (sb_writers#17){..}-{0:0}, at: ksys_write+0xe9/0x1b0 [ 2507.914009] #1: ffff88812d99e2b8 (&attr->mutex){..}-{3:3}, at: simple_attr_write+0xbb/0x220 [ 2507.914019] #2: ffffffffb9df2500 (fs_reclaim){..}-{0:0}, at: i915_drop_caches_set+0x1c9/0x2c0 [i915] [ 2507.914085] #3: ffff8881b4a11b20 (reservation_ww_class_mutex){..}-{3:3}, at: ww_mutex_trylock+0x43f/0xcb0 [ 2507.914097] stack backtrace: [ 2507.914102] CPU: 0 PID: 1825 Comm: gem_exec_captur Tainted: G U 5.16.0-rc4+ #5 [ 2507.914109] Hardware name: ASUS System Product Name/PRIME B560M-A AC, BIOS 0403 01/26/2021 [ 2507.914115] Call Trace: [ 2507.914118] <TASK> [ 2507.914121] dump_stack_lvl+0x59/0x73 [ 2507.914128] __lock_acquire.cold+0x227/0x3b0 [ 2507.914135] ? lockdep_hardirqs_on_prepare+0x410/0x410 [ 2507.914141] ? __lock_acquire+0x23ca/0x5000 [ 2507.914147] lock_acquire+0x19c/0x4b0 [ 2507.914152] ? kmem_cache_alloc_trace+0x30/0x390 [ 2507.914157] ? lock_release+0x690/0x690 [ 2507.914163] ? lock_is_held_type+0xe4/0x140 [ 2507.914170] ? ttm_sys_man_alloc+0x47/0xb0 [ttm] [ 2507.914178] fs_reclaim_acquire+0x11a/0x160 [ 2507.914183] ? kmem_cache_alloc_trace+0x30/0x390 [ 2507.914188] kmem_cache_alloc_trace+0x30/0x390 [ 2507.914192] ? lock_release+0x37f/0x690 [ 2507.914198] ttm_sys_man_alloc+0x47/0xb0 [ttm] [ 2507.914206] ttm_bo_pipeline_gutting+0x70/0x440 [ttm] [ 2507.914214] ? ttm_mem_io_free+0x150/0x150 [ttm] [ 2507.914221] ? lock_is_held_type+0xe4/0x140 [ 2507.914227] ttm_bo_validate+0x2fb/0x370 [ttm] [ 2507.914234] ? lock_acquire+0x19c/0x4b0 [ 2507.914239] ? ttm_bo_bounce_temp_buffer.constprop.0+0xf0/0xf0 [ttm] [ 2507.914246] ? lock_acquire+0x131/0x4b0 [ 2507.914251] ? lock_is_held_type+0xe4/0x140 [ 2507.914257] i915_ttm_shrinker_release_pages+0x2bc/0x490 [i915] [ 2507.914339] ? i915_ttm_swap_notify+0x130/0x130 [i915] [ 2507.914429] ? i915_gem_object_release_mmap_offset+0x32/0x250 [i915] [ 2507.914529] i915_gem_shrink+0xb14/0x1290 [i915] [ 2507.914616] ? ___i915_gem_object_make_shrinkable+0x3e0/0x3e0 [i915] [ 2507.914698] ? _raw_spin_unlock_irqrestore+0x2d/0x60 [ 2507.914705] ? track_intel_runtime_pm_wakeref+0x180/0x230 [i915] [ 2507.914777] i915_gem_shrink_all+0x4b/0x70 [i915] [ 2507.914857] i915_drop_caches_set+0x227/0x2c0 [i915] Reported-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20211213125530.3960007-1-matthew.auld@intel.com
2021-12-13 12:55:30 +00:00
memalloc_noreclaim_restore(flags);
fs_reclaim_release(GFP_KERNEL);
if (val & DROP_RCU)
rcu_barrier();
if (val & DROP_FREED)
i915_gem_drain_freed_objects(i915);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_drop_caches_fops,
i915_drop_caches_get, i915_drop_caches_set,
"0x%08llx\n");
static int i915_sseu_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_gt *gt = to_gt(i915);
return intel_sseu_status(m, gt);
}
static int i915_forcewake_open(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
return intel_gt_pm_debugfs_forcewake_user_open(to_gt(i915));
}
static int i915_forcewake_release(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
return intel_gt_pm_debugfs_forcewake_user_release(to_gt(i915));
}
static const struct file_operations i915_forcewake_fops = {
.owner = THIS_MODULE,
.open = i915_forcewake_open,
.release = i915_forcewake_release,
};
static const struct drm_info_list i915_debugfs_list[] = {
{"i915_capabilities", i915_capabilities, 0},
{"i915_gem_objects", i915_gem_object_info, 0},
{"i915_frequency_info", i915_frequency_info, 0},
{"i915_swizzle_info", i915_swizzle_info, 0},
{"i915_runtime_pm_status", i915_runtime_pm_status, 0},
{"i915_engine_info", i915_engine_info, 0},
{"i915_wa_registers", i915_wa_registers, 0},
{"i915_sseu_status", i915_sseu_status, 0},
{"i915_rps_boost_info", i915_rps_boost_info, 0},
};
#define I915_DEBUGFS_ENTRIES ARRAY_SIZE(i915_debugfs_list)
static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
} i915_debugfs_files[] = {
{"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
{"i915_wedged", &i915_wedged_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops},
{"i915_gpu_info", &i915_gpu_info_fops},
#endif
};
void i915_debugfs_register(struct drm_i915_private *dev_priv)
{
struct drm_minor *minor = dev_priv->drm.primary;
int i;
i915_debugfs_params(dev_priv);
debugfs_create_file("i915_forcewake_user", S_IRUSR, minor->debugfs_root,
to_i915(minor->dev), &i915_forcewake_fops);
for (i = 0; i < ARRAY_SIZE(i915_debugfs_files); i++) {
debugfs_create_file(i915_debugfs_files[i].name,
S_IRUGO | S_IWUSR,
minor->debugfs_root,
to_i915(minor->dev),
i915_debugfs_files[i].fops);
}
drm_debugfs_create_files(i915_debugfs_list,
I915_DEBUGFS_ENTRIES,
minor->debugfs_root, minor);
}