From 587e6c10a7ce89a5924fdbeff2ec524fbd6a124b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2019 17:16:25 +0100 Subject: [PATCH] iommu/arm-smmu-v3: Reduce contention during command-queue insertion The SMMU command queue is a bottleneck in large systems, thanks to the spin_lock which serialises accesses from all CPUs to the single queue supported by the hardware. Attempt to improve this situation by moving to a new algorithm for inserting commands into the queue, which is lock-free on the fast-path. Tested-by: Ganapatrao Kulkarni Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 715 ++++++++++++++++++++++++++++-------- 1 file changed, 552 insertions(+), 163 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 9ebb8b39a3b1..202b4b6fc70a 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -183,7 +183,7 @@ #define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1)) #define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift)) -#define Q_OVERFLOW_FLAG (1 << 31) +#define Q_OVERFLOW_FLAG (1U << 31) #define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG) #define Q_ENT(q, p) ((q)->base + \ Q_IDX(&((q)->llq), p) * \ @@ -307,6 +307,8 @@ #define CMDQ_ERR_CERROR_ABT_IDX 2 #define CMDQ_ERR_CERROR_ATC_INV_IDX 3 +#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG + #define CMDQ_0_OP GENMASK_ULL(7, 0) #define CMDQ_0_SSV (1UL << 11) @@ -369,9 +371,8 @@ #define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12) /* High-level queue structures */ -#define ARM_SMMU_POLL_TIMEOUT_US 100 -#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */ -#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10 +#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */ +#define ARM_SMMU_POLL_SPIN_COUNT 10 #define MSI_IOVA_BASE 0x8000000 #define MSI_IOVA_LENGTH 0x100000 @@ -473,15 +474,24 @@ struct arm_smmu_cmdq_ent { #define CMDQ_OP_CMD_SYNC 0x46 struct { - u32 msidata; u64 msiaddr; } sync; }; }; struct arm_smmu_ll_queue { - u32 prod; - u32 cons; + union { + u64 val; + struct { + u32 prod; + u32 cons; + }; + struct { + atomic_t prod; + atomic_t cons; + } atomic; + u8 __pad[SMP_CACHE_BYTES]; + } ____cacheline_aligned_in_smp; u32 max_n_shift; }; @@ -499,9 +509,18 @@ struct arm_smmu_queue { u32 __iomem *cons_reg; }; +struct arm_smmu_queue_poll { + ktime_t timeout; + unsigned int delay; + unsigned int spin_cnt; + bool wfe; +}; + struct arm_smmu_cmdq { struct arm_smmu_queue q; - spinlock_t lock; + atomic_long_t *valid_map; + atomic_t owner_prod; + atomic_t lock; }; struct arm_smmu_evtq { @@ -581,8 +600,6 @@ struct arm_smmu_device { int gerr_irq; int combined_irq; - u32 sync_nr; - u8 prev_cmd_opcode; unsigned long ias; /* IPA */ unsigned long oas; /* PA */ @@ -601,12 +618,6 @@ struct arm_smmu_device { struct arm_smmu_strtab_cfg strtab_cfg; - /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */ - union { - u32 sync_count; - u64 padding; - }; - /* IOMMU core code handle */ struct iommu_device iommu; }; @@ -690,6 +701,21 @@ static void parse_driver_options(struct arm_smmu_device *smmu) } /* Low-level queue manipulation functions */ +static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n) +{ + u32 space, prod, cons; + + prod = Q_IDX(q, q->prod); + cons = Q_IDX(q, q->cons); + + if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) + space = (1 << q->max_n_shift) - (prod - cons); + else + space = cons - prod; + + return space >= n; +} + static bool queue_full(struct arm_smmu_ll_queue *q) { return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && @@ -702,9 +728,12 @@ static bool queue_empty(struct arm_smmu_ll_queue *q) Q_WRP(q, q->prod) == Q_WRP(q, q->cons); } -static void queue_sync_cons_in(struct arm_smmu_queue *q) +static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod) { - q->llq.cons = readl_relaxed(q->cons_reg); + return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) > Q_IDX(q, prod))) || + ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) <= Q_IDX(q, prod))); } static void queue_sync_cons_out(struct arm_smmu_queue *q) @@ -735,46 +764,34 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q) return ret; } -static void queue_sync_prod_out(struct arm_smmu_queue *q) +static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n) { - writel(q->llq.prod, q->prod_reg); + u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n; + return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); } -static void queue_inc_prod(struct arm_smmu_ll_queue *q) +static void queue_poll_init(struct arm_smmu_device *smmu, + struct arm_smmu_queue_poll *qp) { - u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1; - q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); + qp->delay = 1; + qp->spin_cnt = 0; + qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); + qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); } -/* - * Wait for the SMMU to consume items. If sync is true, wait until the queue - * is empty. Otherwise, wait until there is at least one free slot. - */ -static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe) +static int queue_poll(struct arm_smmu_queue_poll *qp) { - ktime_t timeout; - unsigned int delay = 1, spin_cnt = 0; + if (ktime_compare(ktime_get(), qp->timeout) > 0) + return -ETIMEDOUT; - /* Wait longer if it's a CMD_SYNC */ - timeout = ktime_add_us(ktime_get(), sync ? - ARM_SMMU_CMDQ_SYNC_TIMEOUT_US : - ARM_SMMU_POLL_TIMEOUT_US); - - while (queue_sync_cons_in(q), - (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) { - if (ktime_compare(ktime_get(), timeout) > 0) - return -ETIMEDOUT; - - if (wfe) { - wfe(); - } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) { - cpu_relax(); - continue; - } else { - udelay(delay); - delay *= 2; - spin_cnt = 0; - } + if (qp->wfe) { + wfe(); + } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) { + cpu_relax(); + } else { + udelay(qp->delay); + qp->delay *= 2; + qp->spin_cnt = 0; } return 0; @@ -788,17 +805,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords) *dst++ = cpu_to_le64(*src++); } -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent) -{ - if (queue_full(&q->llq)) - return -ENOSPC; - - queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords); - queue_inc_prod(&q->llq); - queue_sync_prod_out(q); - return 0; -} - static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) { int i; @@ -881,20 +887,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp); break; case CMDQ_OP_CMD_SYNC: - if (ent->sync.msiaddr) + if (ent->sync.msiaddr) { cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ); - else + cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; + } else { cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV); + } cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH); cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB); - /* - * Commands are written little-endian, but we want the SMMU to - * receive MSIData, and thus write it back to memory, in CPU - * byte order, so big-endian needs an extra byteswap here. - */ - cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA, - cpu_to_le32(ent->sync.msidata)); - cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; break; default: return -ENOENT; @@ -903,6 +903,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } +static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, + u32 prod) +{ + struct arm_smmu_queue *q = &smmu->cmdq.q; + struct arm_smmu_cmdq_ent ent = { + .opcode = CMDQ_OP_CMD_SYNC, + }; + + /* + * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI + * payload, so the write will zero the entire command on that platform. + */ + if (smmu->features & ARM_SMMU_FEAT_MSI && + smmu->features & ARM_SMMU_FEAT_COHERENCY) { + ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) * + q->ent_dwords * 8; + } + + arm_smmu_cmdq_build_cmd(cmd, &ent); +} + static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) { static const char *cerror_str[] = { @@ -961,109 +982,440 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) +/* + * Command queue locking. + * This is a form of bastardised rwlock with the following major changes: + * + * - The only LOCK routines are exclusive_trylock() and shared_lock(). + * Neither have barrier semantics, and instead provide only a control + * dependency. + * + * - The UNLOCK routines are supplemented with shared_tryunlock(), which + * fails if the caller appears to be the last lock holder (yes, this is + * racy). All successful UNLOCK routines have RELEASE semantics. + */ +static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq) { - struct arm_smmu_queue *q = &smmu->cmdq.q; - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); + int val; - smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]); + /* + * We can try to avoid the cmpxchg() loop by simply incrementing the + * lock counter. When held in exclusive state, the lock counter is set + * to INT_MIN so these increments won't hurt as the value will remain + * negative. + */ + if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0) + return; - while (queue_insert_raw(q, cmd) == -ENOSPC) { - if (queue_poll_cons(q, false, wfe)) - dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); + do { + val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0); + } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val); +} + +static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq) +{ + (void)atomic_dec_return_release(&cmdq->lock); +} + +static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq) +{ + if (atomic_read(&cmdq->lock) == 1) + return false; + + arm_smmu_cmdq_shared_unlock(cmdq); + return true; +} + +#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \ +({ \ + bool __ret; \ + local_irq_save(flags); \ + __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \ + if (!__ret) \ + local_irq_restore(flags); \ + __ret; \ +}) + +#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \ +({ \ + atomic_set_release(&cmdq->lock, 0); \ + local_irq_restore(flags); \ +}) + + +/* + * Command queue insertion. + * This is made fiddly by our attempts to achieve some sort of scalability + * since there is one queue shared amongst all of the CPUs in the system. If + * you like mixed-size concurrency, dependency ordering and relaxed atomics, + * then you'll *love* this monstrosity. + * + * The basic idea is to split the queue up into ranges of commands that are + * owned by a given CPU; the owner may not have written all of the commands + * itself, but is responsible for advancing the hardware prod pointer when + * the time comes. The algorithm is roughly: + * + * 1. Allocate some space in the queue. At this point we also discover + * whether the head of the queue is currently owned by another CPU, + * or whether we are the owner. + * + * 2. Write our commands into our allocated slots in the queue. + * + * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map. + * + * 4. If we are an owner: + * a. Wait for the previous owner to finish. + * b. Mark the queue head as unowned, which tells us the range + * that we are responsible for publishing. + * c. Wait for all commands in our owned range to become valid. + * d. Advance the hardware prod pointer. + * e. Tell the next owner we've finished. + * + * 5. If we are inserting a CMD_SYNC (we may or may not have been an + * owner), then we need to stick around until it has completed: + * a. If we have MSIs, the SMMU can write back into the CMD_SYNC + * to clear the first 4 bytes. + * b. Otherwise, we spin waiting for the hardware cons pointer to + * advance past our command. + * + * The devil is in the details, particularly the use of locking for handling + * SYNC completion and freeing up space in the queue before we think that it is + * full. + */ +static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod, bool set) +{ + u32 swidx, sbidx, ewidx, ebidx; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + .prod = sprod, + }; + + ewidx = BIT_WORD(Q_IDX(&llq, eprod)); + ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG; + + while (llq.prod != eprod) { + unsigned long mask; + atomic_long_t *ptr; + u32 limit = BITS_PER_LONG; + + swidx = BIT_WORD(Q_IDX(&llq, llq.prod)); + sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG; + + ptr = &cmdq->valid_map[swidx]; + + if ((swidx == ewidx) && (sbidx < ebidx)) + limit = ebidx; + + mask = GENMASK(limit - 1, sbidx); + + /* + * The valid bit is the inverse of the wrap bit. This means + * that a zero-initialised queue is invalid and, after marking + * all entries as valid, they become invalid again when we + * wrap. + */ + if (set) { + atomic_long_xor(mask, ptr); + } else { /* Poll */ + unsigned long valid; + + valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask; + atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid); + } + + llq.prod = queue_inc_prod_n(&llq, limit - sbidx); } } -static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_ent *ent) +/* Mark all entries in the range [sprod, eprod) as valid */ +static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod) +{ + __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true); +} + +/* Wait for all entries in the range [sprod, eprod) to become valid */ +static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod) +{ + __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false); +} + +/* Wait for the command queue to become non-full */ +static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + unsigned long flags; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + int ret = 0; + + /* + * Try to update our copy of cons by grabbing exclusive cmdq access. If + * that fails, spin until somebody else updates it for us. + */ + if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) { + WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg)); + arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags); + llq->val = READ_ONCE(cmdq->q.llq.val); + return 0; + } + + queue_poll_init(smmu, &qp); + do { + llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + if (!queue_full(llq)) + break; + + ret = queue_poll(&qp); + } while (!ret); + + return ret; +} + +/* + * Wait until the SMMU signals a CMD_SYNC completion MSI. + * Must be called with the cmdq lock held in some capacity. + */ +static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + int ret = 0; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); + + queue_poll_init(smmu, &qp); + + /* + * The MSI won't generate an event, since it's being written back + * into the command queue. + */ + qp.wfe = false; + smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp))); + llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1); + return ret; +} + +/* + * Wait until the SMMU cons index passes llq->prod. + * Must be called with the cmdq lock held in some capacity. + */ +static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u32 prod = llq->prod; + int ret = 0; + + queue_poll_init(smmu, &qp); + llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + do { + if (queue_consumed(llq, prod)) + break; + + ret = queue_poll(&qp); + + /* + * This needs to be a readl() so that our subsequent call + * to arm_smmu_cmdq_shared_tryunlock() can fail accurately. + * + * Specifically, we need to ensure that we observe all + * shared_lock()s by other CMD_SYNCs that share our owner, + * so that a failing call to tryunlock() means that we're + * the last one out and therefore we can safely advance + * cmdq->q.llq.cons. Roughly speaking: + * + * CPU 0 CPU1 CPU2 (us) + * + * if (sync) + * shared_lock(); + * + * dma_wmb(); + * set_valid_map(); + * + * if (owner) { + * poll_valid_map(); + * + * writel(prod_reg); + * + * readl(cons_reg); + * tryunlock(); + * + * Requires us to see CPU 0's shared_lock() acquisition. + */ + llq->cons = readl(cmdq->q.cons_reg); + } while (!ret); + + return ret; +} + +static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + if (smmu->features & ARM_SMMU_FEAT_MSI && + smmu->features & ARM_SMMU_FEAT_COHERENCY) + return __arm_smmu_cmdq_poll_until_msi(smmu, llq); + + return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); +} + +static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, + u32 prod, int n) +{ + int i; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + .prod = prod, + }; + + for (i = 0; i < n; ++i) { + u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS]; + + prod = queue_inc_prod_n(&llq, i); + queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS); + } +} + +static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + u64 *cmds, int n, bool sync) +{ + u64 cmd_sync[CMDQ_ENT_DWORDS]; + u32 prod; + unsigned long flags; + bool owner; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + }, head = llq; + int ret = 0; + + /* 1. Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(cmdq->q.llq.val); + do { + u64 old; + + while (!queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = queue_inc_prod_n(&llq, n + sync) | + CMDQ_PROD_OWNED_FLAG; + + old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG); + head.prod &= ~CMDQ_PROD_OWNED_FLAG; + llq.prod &= ~CMDQ_PROD_OWNED_FLAG; + + /* + * 2. Write our commands into the queue + * Dependency ordering from the cmpxchg() loop above. + */ + arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); + if (sync) { + prod = queue_inc_prod_n(&llq, n); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); + queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); + + /* + * In order to determine completion of our CMD_SYNC, we must + * ensure that the queue can't wrap twice without us noticing. + * We achieve that by taking the cmdq lock as shared before + * marking our slot as valid. + */ + arm_smmu_cmdq_shared_lock(cmdq); + } + + /* 3. Mark our slots as valid, ensuring commands are visible first */ + dma_wmb(); + arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod); + + /* 4. If we are the owner, take control of the SMMU hardware */ + if (owner) { + /* a. Wait for previous owner to finish */ + atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod); + + /* b. Stop gathering work by clearing the owned flag */ + prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG, + &cmdq->q.llq.atomic.prod); + prod &= ~CMDQ_PROD_OWNED_FLAG; + + /* + * c. Wait for any gathered work to be written to the queue. + * Note that we read our own entries so that we have the control + * dependency required by (d). + */ + arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod); + + /* + * d. Advance the hardware prod pointer + * Control dependency ordering from the entries becoming valid. + */ + writel_relaxed(prod, cmdq->q.prod_reg); + + /* + * e. Tell the next owner we're done + * Make sure we've updated the hardware first, so that we don't + * race to update prod and potentially move it backwards. + */ + atomic_set_release(&cmdq->owner_prod, prod); + } + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = queue_inc_prod_n(&llq, n); + ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + if (ret) { + dev_err_ratelimited(smmu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, + readl_relaxed(cmdq->q.prod_reg), + readl_relaxed(cmdq->q.cons_reg)); + } + + /* + * Try to unlock the cmq lock. This will fail if we're the last + * reader, in which case we can safely update cmdq->q.llq.cons + */ + if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) { + WRITE_ONCE(cmdq->q.llq.cons, llq.cons); + arm_smmu_cmdq_shared_unlock(cmdq); + } + } + + local_irq_restore(flags); + return ret; +} + +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; if (arm_smmu_cmdq_build_cmd(cmd, ent)) { dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", ent->opcode); - return; + return -EINVAL; } - spin_lock_irqsave(&smmu->cmdq.lock, flags); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); -} - -/* - * The difference between val and sync_idx is bounded by the maximum size of - * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic. - */ -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx) -{ - ktime_t timeout; - u32 val; - - timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US); - val = smp_cond_load_acquire(&smmu->sync_count, - (int)(VAL - sync_idx) >= 0 || - !ktime_before(ktime_get(), timeout)); - - return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; -} - -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu) -{ - u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; - struct arm_smmu_cmdq_ent ent = { - .opcode = CMDQ_OP_CMD_SYNC, - .sync = { - .msiaddr = virt_to_phys(&smmu->sync_count), - }, - }; - - spin_lock_irqsave(&smmu->cmdq.lock, flags); - - /* Piggy-back on the previous command if it's a SYNC */ - if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) { - ent.sync.msidata = smmu->sync_nr; - } else { - ent.sync.msidata = ++smmu->sync_nr; - arm_smmu_cmdq_build_cmd(cmd, &ent); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - } - - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); - - return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); -} - -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) -{ - u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); - struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; - int ret; - - arm_smmu_cmdq_build_cmd(cmd, &ent); - - spin_lock_irqsave(&smmu->cmdq.lock, flags); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); - - return ret; + return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); } static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) { - int ret; - bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) && - (smmu->features & ARM_SMMU_FEAT_COHERENCY); - - ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu) - : __arm_smmu_cmdq_issue_sync(smmu); - if (ret) - dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); - return ret; + return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); } /* Context descriptor manipulation functions */ @@ -1580,9 +1932,9 @@ static void arm_smmu_tlb_inv_context(void *cookie) /* * NOTE: when io-pgtable is in non-strict mode, we may get here with * PTEs previously cleared by unmaps on the current CPU not yet visible - * to the SMMU. We are relying on the DSB implicit in - * queue_sync_prod_out() to guarantee those are observed before the - * TLBI. Do be careful, 007. + * to the SMMU. We are relying on the dma_wmb() implicit during cmd + * insertion to guarantee those are observed before the TLBI. Do be + * careful, 007. */ arm_smmu_cmdq_issue_cmd(smmu, &cmd); arm_smmu_cmdq_issue_sync(smmu); @@ -2359,18 +2711,49 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, return 0; } +static void arm_smmu_cmdq_free_bitmap(void *data) +{ + unsigned long *bitmap = data; + bitmap_free(bitmap); +} + +static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) +{ + int ret = 0; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + unsigned int nents = 1 << cmdq->q.llq.max_n_shift; + atomic_long_t *bitmap; + + atomic_set(&cmdq->owner_prod, 0); + atomic_set(&cmdq->lock, 0); + + bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL); + if (!bitmap) { + dev_err(smmu->dev, "failed to allocate cmdq bitmap\n"); + ret = -ENOMEM; + } else { + cmdq->valid_map = bitmap; + devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap); + } + + return ret; +} + static int arm_smmu_init_queues(struct arm_smmu_device *smmu) { int ret; /* cmdq */ - spin_lock_init(&smmu->cmdq.lock); ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD, ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS, "cmdq"); if (ret) return ret; + ret = arm_smmu_cmdq_init(smmu); + if (ret) + return ret; + /* evtq */ ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD, ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS, @@ -2951,9 +3334,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) /* Queue sizes, capped to ensure natural alignment */ smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, reg)); - if (!smmu->cmdq.q.llq.max_n_shift) { - /* Odd alignment restrictions on the base, so ignore for now */ - dev_err(smmu->dev, "unit-length command queue not supported\n"); + if (smmu->cmdq.q.llq.max_n_shift < ilog2(BITS_PER_LONG)) { + /* + * The cmdq valid_map relies on the total number of entries + * being a multiple of BITS_PER_LONG. There's also no way + * we can handle the weird alignment restrictions on the + * base pointer for a unit-length queue. + */ + dev_err(smmu->dev, "command queue size < %d entries not supported\n", + BITS_PER_LONG); return -ENXIO; }