mirror of https://github.com/hak5/openwrt.git
898 lines
28 KiB
Diff
898 lines
28 KiB
Diff
From 22dbf1420a552d1952d22b92d8c30f8162b026b5 Mon Sep 17 00:00:00 2001
|
|
From: Eric Anholt <eric@anholt.net>
|
|
Date: Tue, 16 Apr 2019 15:58:54 -0700
|
|
Subject: [PATCH 609/806] drm/v3d: Add support for compute shader dispatch.
|
|
|
|
The compute shader dispatch interface is pretty simple -- just pass in
|
|
the regs that userspace has passed us, with no CLs to run. However,
|
|
with no CL to run it means that we need to do manual cache flushing of
|
|
the L2 after the HW execution completes (for SSBO, atomic, and
|
|
image_load_store writes that are the output of compute shaders).
|
|
|
|
This doesn't yet expose the L2 cache's ability to have a region of the
|
|
address space not write back to memory (which could be used for
|
|
shared_var storage).
|
|
|
|
So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing
|
|
the ES31 tests), and on the kernel side on 7278 (failing atomic
|
|
compswap tests in a way that doesn't reproduce on simpenrose).
|
|
|
|
v2: Fix excessive allocation for the clean_job (reported by Dan
|
|
Carpenter). Keep refs on jobs until clean_job is finished, to
|
|
avoid spurious MMU errors if the output BOs are freed by userspace
|
|
before L2 cleaning is finished.
|
|
|
|
Signed-off-by: Eric Anholt <eric@anholt.net>
|
|
Link: https://patchwork.freedesktop.org/patch/msgid/20190416225856.20264-4-eric@anholt.net
|
|
Acked-by: Rob Clark <robdclark@gmail.com>
|
|
---
|
|
drivers/gpu/drm/v3d/v3d_debugfs.c | 22 +++++
|
|
drivers/gpu/drm/v3d/v3d_drv.c | 10 +-
|
|
drivers/gpu/drm/v3d/v3d_drv.h | 28 +++++-
|
|
drivers/gpu/drm/v3d/v3d_fence.c | 2 +
|
|
drivers/gpu/drm/v3d/v3d_gem.c | 156 +++++++++++++++++++++++++++++-
|
|
drivers/gpu/drm/v3d/v3d_irq.c | 16 ++-
|
|
drivers/gpu/drm/v3d/v3d_regs.h | 73 ++++++++++++++
|
|
drivers/gpu/drm/v3d/v3d_sched.c | 121 +++++++++++++++++++++--
|
|
drivers/gpu/drm/v3d/v3d_trace.h | 94 ++++++++++++++++++
|
|
include/uapi/drm/v3d_drm.h | 28 ++++++
|
|
10 files changed, 531 insertions(+), 19 deletions(-)
|
|
|
|
--- a/drivers/gpu/drm/v3d/v3d_debugfs.c
|
|
+++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
|
|
@@ -57,6 +57,17 @@ static const struct v3d_reg_def v3d_core
|
|
REGDEF(V3D_GMP_VIO_ADDR),
|
|
};
|
|
|
|
+static const struct v3d_reg_def v3d_csd_reg_defs[] = {
|
|
+ REGDEF(V3D_CSD_STATUS),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG0),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG1),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG2),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG3),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG4),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG5),
|
|
+ REGDEF(V3D_CSD_CURRENT_CFG6),
|
|
+};
|
|
+
|
|
static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused)
|
|
{
|
|
struct drm_info_node *node = (struct drm_info_node *)m->private;
|
|
@@ -88,6 +99,17 @@ static int v3d_v3d_debugfs_regs(struct s
|
|
V3D_CORE_READ(core,
|
|
v3d_core_reg_defs[i].reg));
|
|
}
|
|
+
|
|
+ if (v3d_has_csd(v3d)) {
|
|
+ for (i = 0; i < ARRAY_SIZE(v3d_csd_reg_defs); i++) {
|
|
+ seq_printf(m, "core %d %s (0x%04x): 0x%08x\n",
|
|
+ core,
|
|
+ v3d_csd_reg_defs[i].name,
|
|
+ v3d_csd_reg_defs[i].reg,
|
|
+ V3D_CORE_READ(core,
|
|
+ v3d_csd_reg_defs[i].reg));
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
return 0;
|
|
--- a/drivers/gpu/drm/v3d/v3d_drv.c
|
|
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
|
|
@@ -7,9 +7,9 @@
|
|
* This driver supports the Broadcom V3D 3.3 and 4.1 OpenGL ES GPUs.
|
|
* For V3D 2.x support, see the VC4 driver.
|
|
*
|
|
- * Currently only single-core rendering using the binner and renderer,
|
|
- * along with TFU (texture formatting unit) rendering is supported.
|
|
- * V3D 4.x's CSD (compute shader dispatch) is not yet supported.
|
|
+ * The V3D GPU includes a tiled renderer (composed of bin and render
|
|
+ * pipelines), the TFU (texture formatting unit), and the CSD (compute
|
|
+ * shader dispatch).
|
|
*/
|
|
|
|
#include <linux/clk.h>
|
|
@@ -114,6 +114,9 @@ static int v3d_get_param_ioctl(struct dr
|
|
case DRM_V3D_PARAM_SUPPORTS_TFU:
|
|
args->value = 1;
|
|
return 0;
|
|
+ case DRM_V3D_PARAM_SUPPORTS_CSD:
|
|
+ args->value = v3d_has_csd(v3d);
|
|
+ return 0;
|
|
default:
|
|
DRM_DEBUG("Unknown parameter %d\n", args->param);
|
|
return -EINVAL;
|
|
@@ -183,6 +186,7 @@ static const struct drm_ioctl_desc v3d_d
|
|
DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
|
|
DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
|
|
DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
|
|
+ DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
|
|
};
|
|
|
|
static const struct vm_operations_struct v3d_vm_ops = {
|
|
--- a/drivers/gpu/drm/v3d/v3d_drv.h
|
|
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
|
|
@@ -16,9 +16,11 @@ enum v3d_queue {
|
|
V3D_BIN,
|
|
V3D_RENDER,
|
|
V3D_TFU,
|
|
+ V3D_CSD,
|
|
+ V3D_CACHE_CLEAN,
|
|
};
|
|
|
|
-#define V3D_MAX_QUEUES (V3D_TFU + 1)
|
|
+#define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1)
|
|
|
|
struct v3d_queue_state {
|
|
struct drm_gpu_scheduler sched;
|
|
@@ -70,6 +72,7 @@ struct v3d_dev {
|
|
struct v3d_bin_job *bin_job;
|
|
struct v3d_render_job *render_job;
|
|
struct v3d_tfu_job *tfu_job;
|
|
+ struct v3d_csd_job *csd_job;
|
|
|
|
struct v3d_queue_state queue[V3D_MAX_QUEUES];
|
|
|
|
@@ -92,6 +95,12 @@ struct v3d_dev {
|
|
*/
|
|
struct mutex sched_lock;
|
|
|
|
+ /* Lock taken during a cache clean and when initiating an L2
|
|
+ * flush, to keep L2 flushes from interfering with the
|
|
+ * synchronous L2 cleans.
|
|
+ */
|
|
+ struct mutex cache_clean_lock;
|
|
+
|
|
struct {
|
|
u32 num_allocated;
|
|
u32 pages_allocated;
|
|
@@ -104,6 +113,12 @@ to_v3d_dev(struct drm_device *dev)
|
|
return (struct v3d_dev *)dev->dev_private;
|
|
}
|
|
|
|
+static inline bool
|
|
+v3d_has_csd(struct v3d_dev *v3d)
|
|
+{
|
|
+ return v3d->ver >= 41;
|
|
+}
|
|
+
|
|
/* The per-fd struct, which tracks the MMU mappings. */
|
|
struct v3d_file_priv {
|
|
struct v3d_dev *v3d;
|
|
@@ -237,6 +252,14 @@ struct v3d_tfu_job {
|
|
struct drm_v3d_submit_tfu args;
|
|
};
|
|
|
|
+struct v3d_csd_job {
|
|
+ struct v3d_job base;
|
|
+
|
|
+ u32 timedout_batches;
|
|
+
|
|
+ struct drm_v3d_submit_csd args;
|
|
+};
|
|
+
|
|
/**
|
|
* _wait_for - magic (register) wait macro
|
|
*
|
|
@@ -302,11 +325,14 @@ int v3d_submit_cl_ioctl(struct drm_devic
|
|
struct drm_file *file_priv);
|
|
int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
|
|
struct drm_file *file_priv);
|
|
+int v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
|
|
+ struct drm_file *file_priv);
|
|
int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
|
|
struct drm_file *file_priv);
|
|
void v3d_job_put(struct v3d_job *job);
|
|
void v3d_reset(struct v3d_dev *v3d);
|
|
void v3d_invalidate_caches(struct v3d_dev *v3d);
|
|
+void v3d_clean_caches(struct v3d_dev *v3d);
|
|
|
|
/* v3d_irq.c */
|
|
int v3d_irq_init(struct v3d_dev *v3d);
|
|
--- a/drivers/gpu/drm/v3d/v3d_fence.c
|
|
+++ b/drivers/gpu/drm/v3d/v3d_fence.c
|
|
@@ -36,6 +36,8 @@ static const char *v3d_fence_get_timelin
|
|
return "v3d-render";
|
|
case V3D_TFU:
|
|
return "v3d-tfu";
|
|
+ case V3D_CSD:
|
|
+ return "v3d-csd";
|
|
default:
|
|
return NULL;
|
|
}
|
|
--- a/drivers/gpu/drm/v3d/v3d_gem.c
|
|
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
|
|
@@ -162,10 +162,52 @@ v3d_flush_l2t(struct v3d_dev *v3d, int c
|
|
/* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't
|
|
* need to wait for completion before dispatching the job --
|
|
* L2T accesses will be stalled until the flush has completed.
|
|
+ * However, we do need to make sure we don't try to trigger a
|
|
+ * new flush while the CACHE_CLEAN queue is trying to
|
|
+ * synchronously clean after a job.
|
|
*/
|
|
+ mutex_lock(&v3d->cache_clean_lock);
|
|
V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
|
|
V3D_L2TCACTL_L2TFLS |
|
|
V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
|
|
+ mutex_unlock(&v3d->cache_clean_lock);
|
|
+}
|
|
+
|
|
+/* Cleans texture L1 and L2 cachelines (writing back dirty data).
|
|
+ *
|
|
+ * For cleaning, which happens from the CACHE_CLEAN queue after CSD has
|
|
+ * executed, we need to make sure that the clean is done before
|
|
+ * signaling job completion. So, we synchronously wait before
|
|
+ * returning, and we make sure that L2 invalidates don't happen in the
|
|
+ * meantime to confuse our are-we-done checks.
|
|
+ */
|
|
+void
|
|
+v3d_clean_caches(struct v3d_dev *v3d)
|
|
+{
|
|
+ struct drm_device *dev = &v3d->drm;
|
|
+ int core = 0;
|
|
+
|
|
+ trace_v3d_cache_clean_begin(dev);
|
|
+ /* Poll the bit we set: TMUWCF is expected to self-clear when done. */
|
|
+ V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF);
|
|
+ if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
|
|
+ V3D_L2TCACTL_TMUWCF), 100)) {
|
|
+ DRM_ERROR("Timeout waiting for L1T write combiner flush\n");
|
|
+ }
|
|
+
|
|
+ mutex_lock(&v3d->cache_clean_lock);
|
|
+ V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
|
|
+ V3D_L2TCACTL_L2TFLS |
|
|
+ V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAN, V3D_L2TCACTL_FLM));
|
|
+
|
|
+ if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
|
|
+ V3D_L2TCACTL_L2TFLS), 100)) {
|
|
+ DRM_ERROR("Timeout waiting for L2T clean\n");
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&v3d->cache_clean_lock);
|
|
+
|
|
+ trace_v3d_cache_clean_end(dev);
|
|
 }
|
|
|
|
/* Invalidates the slice caches. These are read-only caches. */
|
|
@@ -584,7 +626,8 @@ static void
|
|
v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv,
|
|
struct v3d_job *job,
|
|
struct ww_acquire_ctx *acquire_ctx,
|
|
- u32 out_sync)
|
|
+ u32 out_sync,
|
|
+ struct dma_fence *done_fence)
|
|
{
|
|
struct drm_syncobj *sync_out;
|
|
|
|
@@ -594,7 +637,7 @@ v3d_attach_fences_and_unlock_reservation
|
|
/* Update the return sync object for the job */
|
|
sync_out = drm_syncobj_find(file_priv, out_sync);
|
|
if (sync_out) {
|
|
- drm_syncobj_replace_fence(sync_out, job->done_fence);
|
|
+ drm_syncobj_replace_fence(sync_out, done_fence);
|
|
drm_syncobj_put(sync_out);
|
|
}
|
|
}
|
|
@@ -691,8 +734,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
|
|
mutex_unlock(&v3d->sched_lock);
|
|
|
|
v3d_attach_fences_and_unlock_reservation(file_priv,
|
|
- &render->base, &acquire_ctx,
|
|
- args->out_sync);
|
|
+ &render->base,
|
|
+ &acquire_ctx,
|
|
+ args->out_sync,
|
|
+ render->base.done_fence);
|
|
|
|
if (bin)
|
|
v3d_job_put(&bin->base);
|
|
@@ -785,7 +830,8 @@ v3d_submit_tfu_ioctl(struct drm_device *
|
|
|
|
v3d_attach_fences_and_unlock_reservation(file_priv,
|
|
&job->base, &acquire_ctx,
|
|
- args->out_sync);
|
|
+ args->out_sync,
|
|
+ job->base.done_fence);
|
|
|
|
v3d_job_put(&job->base);
|
|
|
|
@@ -801,6 +847,105 @@ fail:
|
|
return ret;
|
|
}
|
|
|
|
+/**
|
|
+ * v3d_submit_csd_ioctl() - Submits a CSD (compute shader dispatch) job to the V3D.
|
|
+ * @dev: DRM device
|
|
+ * @data: ioctl argument
|
|
+ * @file_priv: DRM file for this fd
|
|
+ *
|
|
+ * Userspace provides the register setup for the CSD, which we don't
|
|
+ * need to validate since the CSD is behind the MMU.
|
|
+ */
|
|
+int
|
|
+v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
|
|
+ struct drm_file *file_priv)
|
|
+{
|
|
+ struct v3d_dev *v3d = to_v3d_dev(dev);
|
|
+ struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
|
|
+ struct drm_v3d_submit_csd *args = data;
|
|
+ struct v3d_csd_job *job;
|
|
+ struct v3d_job *clean_job;
|
|
+ struct ww_acquire_ctx acquire_ctx;
|
|
+ int ret;
|
|
+
|
|
+ trace_v3d_submit_csd_ioctl(&v3d->drm, args->cfg[5], args->cfg[6]);
|
|
+
|
|
+ if (!v3d_has_csd(v3d)) {
|
|
+ DRM_DEBUG("Attempting CSD submit on non-CSD hardware\n");
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ job = kcalloc(1, sizeof(*job), GFP_KERNEL);
|
|
+ if (!job)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ret = v3d_job_init(v3d, file_priv, &job->base,
|
|
+ v3d_job_free, args->in_sync);
|
|
+ if (ret) {
|
|
+ kfree(job);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ clean_job = kcalloc(1, sizeof(*clean_job), GFP_KERNEL);
|
|
+ if (!clean_job) {
|
|
+ /* job was initialized above, so drop its ref instead of kfree(). */
|
|
+ v3d_job_put(&job->base);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ ret = v3d_job_init(v3d, file_priv, clean_job, v3d_job_free, 0);
|
|
+ if (ret) {
|
|
+ v3d_job_put(&job->base);
|
|
+ kfree(clean_job);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ job->args = *args;
|
|
+ /* BO refs are held by clean_job, i.e. until the L2 clean is done. */
|
|
+ ret = v3d_lookup_bos(dev, file_priv, clean_job,
|
|
+ args->bo_handles, args->bo_handle_count);
|
|
+ if (ret)
|
|
+ goto fail;
|
|
+
|
|
+ ret = v3d_lock_bo_reservations(clean_job, &acquire_ctx);
|
|
+ if (ret)
|
|
+ goto fail;
|
|
+
|
|
+ mutex_lock(&v3d->sched_lock);
|
|
+ ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
|
|
+ if (ret)
|
|
+ goto fail_unreserve;
|
|
+ /* The cache clean must not start before the CSD job has finished. */
|
|
+ ret = v3d_add_dep(clean_job, dma_fence_get(job->base.done_fence));
|
|
+ if (ret)
|
|
+ goto fail_unreserve;
|
|
+ ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
|
|
+ if (ret)
|
|
+ goto fail_unreserve;
|
|
+ mutex_unlock(&v3d->sched_lock);
|
|
+ /* out_sync is signalled by the clean job's fence, not the CSD job's. */
|
|
+ v3d_attach_fences_and_unlock_reservation(file_priv,
|
|
+ clean_job,
|
|
+ &acquire_ctx,
|
|
+ args->out_sync,
|
|
+ clean_job->done_fence);
|
|
+
|
|
+ v3d_job_put(&job->base);
|
|
+ v3d_job_put(clean_job);
|
|
+
|
|
+ return 0;
|
|
+
|
|
+fail_unreserve:
|
|
+ mutex_unlock(&v3d->sched_lock);
|
|
+ v3d_unlock_bo_reservations(clean_job->bo, clean_job->bo_count,
|
|
+ &acquire_ctx);
|
|
+fail:
|
|
+ v3d_job_put(&job->base);
|
|
+ v3d_job_put(clean_job);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
int
|
|
v3d_gem_init(struct drm_device *dev)
|
|
{
|
|
@@ -816,6 +961,7 @@ v3d_gem_init(struct drm_device *dev)
|
|
mutex_init(&v3d->bo_lock);
|
|
mutex_init(&v3d->reset_lock);
|
|
mutex_init(&v3d->sched_lock);
|
|
+ mutex_init(&v3d->cache_clean_lock);
|
|
|
|
/* Note: We don't allocate address 0. Various bits of HW
|
|
* treat 0 as special, such as the occlusion query counters
|
|
--- a/drivers/gpu/drm/v3d/v3d_irq.c
|
|
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
|
|
@@ -4,9 +4,9 @@
|
|
/**
|
|
* DOC: Interrupt management for the V3D engine
|
|
*
|
|
- * When we take a bin, render, or TFU done interrupt, we need to
|
|
- * signal the fence for that job so that the scheduler can queue up
|
|
- * the next one and unblock any waiters.
|
|
+ * When we take a bin, render, TFU done, or CSD done interrupt, we
|
|
+ * need to signal the fence for that job so that the scheduler can
|
|
+ * queue up the next one and unblock any waiters.
|
|
*
|
|
* When we take the binner out of memory interrupt, we need to
|
|
* allocate some new memory and pass it to the binner so that the
|
|
@@ -20,6 +20,7 @@
|
|
#define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \
|
|
V3D_INT_FLDONE | \
|
|
V3D_INT_FRDONE | \
|
|
+ V3D_INT_CSDDONE | \
|
|
V3D_INT_GMPV))
|
|
|
|
#define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \
|
|
@@ -108,6 +109,15 @@ v3d_irq(int irq, void *arg)
|
|
dma_fence_signal(&fence->base);
|
|
status = IRQ_HANDLED;
|
|
}
|
|
+
|
|
+ if (intsts & V3D_INT_CSDDONE) {
|
|
+ struct v3d_fence *fence =
|
|
+ to_v3d_fence(v3d->csd_job->base.irq_fence);
|
|
+
|
|
+ trace_v3d_csd_irq(&v3d->drm, fence->seqno);
|
|
+ dma_fence_signal(&fence->base);
|
|
+ status = IRQ_HANDLED;
|
|
+ }
|
|
|
|
/* We shouldn't be triggering these if we have GMP in
|
|
* always-allowed mode.
|
|
--- a/drivers/gpu/drm/v3d/v3d_regs.h
|
|
+++ b/drivers/gpu/drm/v3d/v3d_regs.h
|
|
@@ -238,8 +238,11 @@
|
|
#define V3D_CTL_L2TCACTL 0x00030
|
|
# define V3D_L2TCACTL_TMUWCF BIT(8)
|
|
# define V3D_L2TCACTL_L2T_NO_WM BIT(4)
|
|
+/* Invalidates cache lines. */
|
|
# define V3D_L2TCACTL_FLM_FLUSH 0
|
|
+/* Removes cachelines without writing dirty lines back. */
|
|
# define V3D_L2TCACTL_FLM_CLEAR 1
|
|
+/* Writes out dirty cachelines and marks them clean, but doesn't invalidate. */
|
|
# define V3D_L2TCACTL_FLM_CLEAN 2
|
|
# define V3D_L2TCACTL_FLM_MASK V3D_MASK(2, 1)
|
|
# define V3D_L2TCACTL_FLM_SHIFT 1
|
|
@@ -255,6 +258,8 @@
|
|
#define V3D_CTL_INT_MSK_CLR 0x00064
|
|
# define V3D_INT_QPU_MASK V3D_MASK(27, 16)
|
|
# define V3D_INT_QPU_SHIFT 16
|
|
+# define V3D_INT_CSDDONE BIT(7)
|
|
+# define V3D_INT_PCTR BIT(6)
|
|
# define V3D_INT_GMPV BIT(5)
|
|
# define V3D_INT_TRFB BIT(4)
|
|
# define V3D_INT_SPILLUSE BIT(3)
|
|
@@ -374,4 +379,72 @@
|
|
#define V3D_GMP_PRESERVE_LOAD 0x00818
|
|
#define V3D_GMP_VALID_LINES 0x00820
|
|
|
|
+#define V3D_CSD_STATUS 0x00900
|
|
+# define V3D_CSD_STATUS_NUM_COMPLETED_MASK V3D_MASK(11, 4)
|
|
+# define V3D_CSD_STATUS_NUM_COMPLETED_SHIFT 4
|
|
+# define V3D_CSD_STATUS_NUM_ACTIVE_MASK V3D_MASK(3, 2)
|
|
+# define V3D_CSD_STATUS_NUM_ACTIVE_SHIFT 2
|
|
+# define V3D_CSD_STATUS_HAVE_CURRENT_DISPATCH BIT(1)
|
|
+# define V3D_CSD_STATUS_HAVE_QUEUED_DISPATCH BIT(0)
|
|
+
|
|
+#define V3D_CSD_QUEUED_CFG0 0x00904
|
|
+# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_MASK V3D_MASK(31, 16)
|
|
+# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_SHIFT 16
|
|
+# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_MASK V3D_MASK(15, 0)
|
|
+# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_SHIFT 0
|
|
+
|
|
+#define V3D_CSD_QUEUED_CFG1 0x00908
|
|
+# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_MASK V3D_MASK(31, 16)
|
|
+# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_SHIFT 16
|
|
+# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_MASK V3D_MASK(15, 0)
|
|
+# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_SHIFT 0
|
|
+
|
|
+#define V3D_CSD_QUEUED_CFG2 0x0090c
|
|
+# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_MASK V3D_MASK(31, 16)
|
|
+# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_SHIFT 16
|
|
+# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_MASK V3D_MASK(15, 0)
|
|
+# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_SHIFT 0
|
|
+
|
|
+#define V3D_CSD_QUEUED_CFG3 0x00910
|
|
+# define V3D_CSD_QUEUED_CFG3_OVERLAP_WITH_PREV BIT(26)
|
|
+# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_MASK V3D_MASK(25, 20)
|
|
+# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_SHIFT 20
|
|
+# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_MASK V3D_MASK(19, 12)
|
|
+# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_SHIFT 12
|
|
+# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_MASK V3D_MASK(11, 8)
|
|
+# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_SHIFT 8
|
|
+# define V3D_CSD_QUEUED_CFG3_WG_SIZE_MASK V3D_MASK(7, 0)
|
|
+# define V3D_CSD_QUEUED_CFG3_WG_SIZE_SHIFT 0
|
|
+
|
|
+/* Number of batches, minus 1 */
|
|
+#define V3D_CSD_QUEUED_CFG4 0x00914
|
|
+
|
|
+/* Shader address, pnan, singleseg, threading, like a shader record. */
|
|
+#define V3D_CSD_QUEUED_CFG5 0x00918
|
|
+
|
|
+/* Uniforms address (4 byte aligned) */
|
|
+#define V3D_CSD_QUEUED_CFG6 0x0091c
|
|
+
|
|
+#define V3D_CSD_CURRENT_CFG0 0x00920
|
|
+#define V3D_CSD_CURRENT_CFG1 0x00924
|
|
+#define V3D_CSD_CURRENT_CFG2 0x00928
|
|
+#define V3D_CSD_CURRENT_CFG3 0x0092c
|
|
+#define V3D_CSD_CURRENT_CFG4 0x00930
|
|
+#define V3D_CSD_CURRENT_CFG5 0x00934
|
|
+#define V3D_CSD_CURRENT_CFG6 0x00938
|
|
+
|
|
+#define V3D_CSD_CURRENT_ID0 0x0093c
|
|
+# define V3D_CSD_CURRENT_ID0_WG_X_MASK V3D_MASK(31, 16)
|
|
+# define V3D_CSD_CURRENT_ID0_WG_X_SHIFT 16
|
|
+# define V3D_CSD_CURRENT_ID0_WG_IN_SG_MASK V3D_MASK(11, 8)
|
|
+# define V3D_CSD_CURRENT_ID0_WG_IN_SG_SHIFT 8
|
|
+# define V3D_CSD_CURRENT_ID0_L_IDX_MASK V3D_MASK(7, 0)
|
|
+# define V3D_CSD_CURRENT_ID0_L_IDX_SHIFT 0
|
|
+
|
|
+#define V3D_CSD_CURRENT_ID1 0x00940
|
|
+# define V3D_CSD_CURRENT_ID0_WG_Z_MASK V3D_MASK(31, 16)
|
|
+# define V3D_CSD_CURRENT_ID0_WG_Z_SHIFT 16
|
|
+# define V3D_CSD_CURRENT_ID0_WG_Y_MASK V3D_MASK(15, 0)
|
|
+# define V3D_CSD_CURRENT_ID0_WG_Y_SHIFT 0
|
|
+
|
|
#endif /* V3D_REGS_H */
|
|
--- a/drivers/gpu/drm/v3d/v3d_sched.c
|
|
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
|
|
@@ -48,6 +48,12 @@ to_tfu_job(struct drm_sched_job *sched_j
|
|
return container_of(sched_job, struct v3d_tfu_job, base.base);
|
|
}
|
|
|
|
+static struct v3d_csd_job *
|
|
+to_csd_job(struct drm_sched_job *sched_job)
|
|
+{
|
|
+ return container_of(sched_job, struct v3d_csd_job, base.base);
|
|
+}
|
|
+
|
|
static void
|
|
v3d_job_free(struct drm_sched_job *sched_job)
|
|
{
|
|
@@ -205,6 +211,48 @@ v3d_tfu_job_run(struct drm_sched_job *sc
|
|
return fence;
|
|
}
|
|
|
|
+static struct dma_fence *
|
|
+v3d_csd_job_run(struct drm_sched_job *sched_job)
|
|
+{
|
|
+ struct v3d_csd_job *job = to_csd_job(sched_job);
|
|
+ struct v3d_dev *v3d = job->base.v3d;
|
|
+ struct drm_device *dev = &v3d->drm;
|
|
+ struct dma_fence *fence;
|
|
+ int i;
|
|
+
|
|
+ v3d->csd_job = job;
|
|
+
|
|
+ v3d_invalidate_caches(v3d);
|
|
+
|
|
+ fence = v3d_fence_create(v3d, V3D_CSD);
|
|
+ if (IS_ERR(fence))
|
|
+ return NULL;
|
|
+
|
|
+ if (job->base.irq_fence)
|
|
+ dma_fence_put(job->base.irq_fence);
|
|
+ job->base.irq_fence = dma_fence_get(fence);
|
|
+
|
|
+ trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
|
|
+
|
|
+ for (i = 1; i <= 6; i++)
|
|
+ V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
|
|
+ /* CFG0 write kicks off the job. */
|
|
+ V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);
|
|
+
|
|
+ return fence;
|
|
+}
|
|
+
|
|
+static struct dma_fence *
|
|
+v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
|
|
+{
|
|
+ struct v3d_job *job = to_v3d_job(sched_job);
|
|
+ struct v3d_dev *v3d = job->v3d;
|
|
+
|
|
+ v3d_clean_caches(v3d);
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
static void
|
|
v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
|
|
{
|
|
@@ -277,13 +325,31 @@ v3d_render_job_timedout(struct drm_sched
|
|
}
|
|
|
|
static void
|
|
-v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
|
|
+v3d_generic_job_timedout(struct drm_sched_job *sched_job)
|
|
{
|
|
struct v3d_job *job = to_v3d_job(sched_job);
|
|
|
|
v3d_gpu_reset_for_timeout(job->v3d, sched_job);
|
|
}
|
|
|
|
+static void
|
|
+v3d_csd_job_timedout(struct drm_sched_job *sched_job)
|
|
+{
|
|
+ struct v3d_csd_job *job = to_csd_job(sched_job);
|
|
+ struct v3d_dev *v3d = job->base.v3d;
|
|
+ u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);
|
|
+
|
|
+ /* If we've made progress, skip reset and let the timer get
|
|
+ * rearmed.
|
|
+ */
|
|
+ if (job->timedout_batches != batches) {
|
|
+ job->timedout_batches = batches;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ v3d_gpu_reset_for_timeout(v3d, sched_job);
|
|
+}
|
|
+
|
|
static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
|
|
.dependency = v3d_job_dependency,
|
|
.run_job = v3d_bin_job_run,
|
|
@@ -301,10 +367,24 @@ static const struct drm_sched_backend_op
|
|
static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
|
|
.dependency = v3d_job_dependency,
|
|
.run_job = v3d_tfu_job_run,
|
|
- .timedout_job = v3d_tfu_job_timedout,
|
|
+ .timedout_job = v3d_generic_job_timedout,
|
|
.free_job = v3d_job_free,
|
|
};
|
|
|
|
+static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
|
|
+ .dependency = v3d_job_dependency,
|
|
+ .run_job = v3d_csd_job_run,
|
|
+ .timedout_job = v3d_csd_job_timedout,
|
|
+ .free_job = v3d_job_free
|
|
+};
|
|
+
|
|
+static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
|
|
+ .dependency = v3d_job_dependency,
|
|
+ .run_job = v3d_cache_clean_job_run,
|
|
+ .timedout_job = v3d_generic_job_timedout,
|
|
+ .free_job = v3d_job_free
|
|
+};
|
|
+
|
|
int
|
|
v3d_sched_init(struct v3d_dev *v3d)
|
|
{
|
|
@@ -331,7 +411,7 @@ v3d_sched_init(struct v3d_dev *v3d)
|
|
if (ret) {
|
|
dev_err(v3d->dev, "Failed to create render scheduler: %d.",
|
|
ret);
|
|
- drm_sched_fini(&v3d->queue[V3D_BIN].sched);
|
|
+ v3d_sched_fini(v3d);
|
|
return ret;
|
|
}
|
|
|
|
@@ -343,11 +423,36 @@ v3d_sched_init(struct v3d_dev *v3d)
|
|
if (ret) {
|
|
dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
|
|
ret);
|
|
- drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
|
|
- drm_sched_fini(&v3d->queue[V3D_BIN].sched);
|
|
+ v3d_sched_fini(v3d);
|
|
return ret;
|
|
}
|
|
|
|
+ if (v3d_has_csd(v3d)) {
|
|
+ ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
|
|
+ &v3d_csd_sched_ops,
|
|
+ hw_jobs_limit, job_hang_limit,
|
|
+ msecs_to_jiffies(hang_limit_ms),
|
|
+ "v3d_csd");
|
|
+ if (ret) {
|
|
+ dev_err(v3d->dev, "Failed to create CSD scheduler: %d.",
|
|
+ ret);
|
|
+ v3d_sched_fini(v3d);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
|
|
+ &v3d_cache_clean_sched_ops,
|
|
+ hw_jobs_limit, job_hang_limit,
|
|
+ msecs_to_jiffies(hang_limit_ms),
|
|
+ "v3d_cache_clean");
|
|
+ if (ret) {
|
|
+ dev_err(v3d->dev, "Failed to create CACHE_CLEAN scheduler: %d.",
|
|
+ ret);
|
|
+ v3d_sched_fini(v3d);
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
@@ -356,6 +461,8 @@ v3d_sched_fini(struct v3d_dev *v3d)
|
|
{
|
|
enum v3d_queue q;
|
|
|
|
- for (q = 0; q < V3D_MAX_QUEUES; q++)
|
|
- drm_sched_fini(&v3d->queue[q].sched);
|
|
+ for (q = 0; q < V3D_MAX_QUEUES; q++) {
|
|
+ if (v3d->queue[q].sched.ops)
|
|
+ drm_sched_fini(&v3d->queue[q].sched);
|
|
+ }
|
|
}
|
|
--- a/drivers/gpu/drm/v3d/v3d_trace.h
|
|
+++ b/drivers/gpu/drm/v3d/v3d_trace.h
|
|
@@ -124,6 +124,26 @@ TRACE_EVENT(v3d_tfu_irq,
|
|
__entry->seqno)
|
|
);
|
|
|
|
+TRACE_EVENT(v3d_csd_irq,
|
|
+ TP_PROTO(struct drm_device *dev,
|
|
+ uint64_t seqno),
|
|
+ TP_ARGS(dev, seqno),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u32, dev)
|
|
+ __field(u64, seqno)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = dev->primary->index;
|
|
+ __entry->seqno = seqno;
|
|
+ ),
|
|
+
|
|
+ TP_printk("dev=%u, seqno=%llu",
|
|
+ __entry->dev,
|
|
+ __entry->seqno)
|
|
+);
|
|
+
|
|
TRACE_EVENT(v3d_submit_tfu_ioctl,
|
|
TP_PROTO(struct drm_device *dev, u32 iia),
|
|
TP_ARGS(dev, iia),
|
|
@@ -163,6 +183,80 @@ TRACE_EVENT(v3d_submit_tfu,
|
|
__entry->seqno)
|
|
);
|
|
|
|
+TRACE_EVENT(v3d_submit_csd_ioctl,
|
|
+ TP_PROTO(struct drm_device *dev, u32 cfg5, u32 cfg6),
|
|
+ TP_ARGS(dev, cfg5, cfg6),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u32, dev)
|
|
+ __field(u32, cfg5)
|
|
+ __field(u32, cfg6)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = dev->primary->index;
|
|
+ __entry->cfg5 = cfg5;
|
|
+ __entry->cfg6 = cfg6;
|
|
+ ),
|
|
+
|
|
+ TP_printk("dev=%u, CFG5 0x%08x, CFG6 0x%08x",
|
|
+ __entry->dev,
|
|
+ __entry->cfg5,
|
|
+ __entry->cfg6)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(v3d_submit_csd,
|
|
+ TP_PROTO(struct drm_device *dev,
|
|
+ uint64_t seqno),
|
|
+ TP_ARGS(dev, seqno),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u32, dev)
|
|
+ __field(u64, seqno)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = dev->primary->index;
|
|
+ __entry->seqno = seqno;
|
|
+ ),
|
|
+
|
|
+ TP_printk("dev=%u, seqno=%llu",
|
|
+ __entry->dev,
|
|
+ __entry->seqno)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(v3d_cache_clean_begin,
|
|
+ TP_PROTO(struct drm_device *dev),
|
|
+ TP_ARGS(dev),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u32, dev)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = dev->primary->index;
|
|
+ ),
|
|
+
|
|
+ TP_printk("dev=%u",
|
|
+ __entry->dev)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(v3d_cache_clean_end,
|
|
+ TP_PROTO(struct drm_device *dev),
|
|
+ TP_ARGS(dev),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u32, dev)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = dev->primary->index;
|
|
+ ),
|
|
+
|
|
+ TP_printk("dev=%u",
|
|
+ __entry->dev)
|
|
+);
|
|
+
|
|
TRACE_EVENT(v3d_reset_begin,
|
|
TP_PROTO(struct drm_device *dev),
|
|
TP_ARGS(dev),
|
|
--- a/include/uapi/drm/v3d_drm.h
|
|
+++ b/include/uapi/drm/v3d_drm.h
|
|
@@ -37,6 +37,7 @@ extern "C" {
|
|
#define DRM_V3D_GET_PARAM 0x04
|
|
#define DRM_V3D_GET_BO_OFFSET 0x05
|
|
#define DRM_V3D_SUBMIT_TFU 0x06
|
|
+#define DRM_V3D_SUBMIT_CSD 0x07
|
|
|
|
#define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
|
|
#define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
|
|
@@ -45,6 +46,7 @@ extern "C" {
|
|
#define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
|
|
#define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
|
|
#define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
|
|
+#define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
|
|
|
|
/**
|
|
* struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
|
|
@@ -172,6 +174,7 @@ enum drm_v3d_param {
|
|
DRM_V3D_PARAM_V3D_CORE0_IDENT1,
|
|
DRM_V3D_PARAM_V3D_CORE0_IDENT2,
|
|
DRM_V3D_PARAM_SUPPORTS_TFU,
|
|
+ DRM_V3D_PARAM_SUPPORTS_CSD,
|
|
};
|
|
|
|
struct drm_v3d_get_param {
|
|
@@ -212,6 +215,31 @@ struct drm_v3d_submit_tfu {
|
|
__u32 out_sync;
|
|
};
|
|
|
|
+/* Submits a compute shader for dispatch. This job will block on any
|
|
+ * previous compute shaders submitted on this fd, and any other
|
|
+ * synchronization must be performed with in_sync/out_sync.
|
|
+ */
|
|
+struct drm_v3d_submit_csd {
|
|
+ __u32 cfg[7];
|
|
+ __u32 coef[4];
|
|
+
|
|
+ /* Pointer to a u32 array of the BOs that are referenced by the job.
|
|
+ */
|
|
+ __u64 bo_handles;
|
|
+
|
|
+ /* Number of BO handles passed in (size is that times 4). */
|
|
+ __u32 bo_handle_count;
|
|
+
|
|
+ /* sync object to block on before running the CSD job. Each
|
|
+ * CSD job will execute in the order submitted to its FD.
|
|
+ * Synchronization against rendering/TFU jobs or CSD from
|
|
+ * other fds requires using sync objects.
|
|
+ */
|
|
+ __u32 in_sync;
|
|
+ /* Sync object to signal when the CSD job is done. */
|
|
+ __u32 out_sync;
|
|
+};
|
|
+
|
|
#if defined(__cplusplus)
|
|
}
|
|
#endif
|