This tag contains the following changes for kernel 5.10-rc1:

- Stop using the DRM's dma-fence module and instead use kernel completions.
 - Support PCIe AER
 - Use dma_mmap_coherent for memory allocated using dma_alloc_coherent
 - Use smallest possible alignment when allocating virtual addresses in our
   MMU driver.
 - Refactor MMU driver code to be device-oriented
 - Allow user to check CS status without any sleep
 - Add an option to map a Command Buffer to the Device's MMU
 - Expose sync manager resource allocation to user through INFO IOCTL
 - Convert code to use standard BIT(), GENMASK() and FIELD_PREP()
 - Many small fixes (casting, better error messages, remove unused
   defines, h/w configuration fixes, etc.)
 -----BEGIN PGP SIGNATURE-----
 
 iQFKBAABCgA0FiEE7TEboABC71LctBLFZR1NuKta54AFAl9qIHgWHG9kZWQuZ2Fi
 YmF5QGdtYWlsLmNvbQAKCRBlHU24q1rngKKmB/9EzD+xOFKFM7NOWdbkmJQ8di7P
 eLHlqSh8EpIKyvVtiNp5oMMUwBM7bBK4gsvHqPK4cQNUN6wvqvDYt9lRhjgSua1A
 8heVtWpot0+d8PVT64PDWYxRfkI4UiOhQDpyj2vhrBkepW9cQlxs/DTlJHfDAQRd
 ihY3H94tX5DO5/wy7W9XC5BChvgMMoj9KIP5+wYdReaPbgQyIx9x7L8GbqE1aS9C
 daHmFGXxSfJffxDGkIot3XczrmoBIx+9qtL7EZo2HkC6s1IyBnX1KgxvAJR5urt8
 FFd0ma3Md+8PLkEeNX3VJrDAnQPskCvmrU2B61PftnI0EC3eW7mRufM0+kiM
 =zQ26
 -----END PGP SIGNATURE-----

Merge tag 'misc-habanalabs-next-2020-09-22' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.10-rc1:

- Stop using the DRM's dma-fence module and instead use kernel completions.
- Support PCIe AER
- Use dma_mmap_coherent for memory allocated using dma_alloc_coherent
- Use smallest possible alignment when allocating virtual addresses in our
  MMU driver.
- Refactor MMU driver code to be device-oriented
- Allow user to check CS status without any sleep
- Add an option to map a Command Buffer to the Device's MMU
- Expose sync manager resource allocation to user through INFO IOCTL
- Convert code to use standard BIT(), GENMASK() and FIELD_PREP()
- Many small fixes (casting, better error messages, remove unused
  defines, h/w configuration fixes, etc.)

* tag 'misc-habanalabs-next-2020-09-22' of git://people.freedesktop.org/~gabbayo/linux: (46 commits)
  habanalabs: update scratchpad register map
  habanalabs: add indication of security-enabled F/W
  habanalabs/gaudi: fix DMA completions max outstanding to 15
  habanalabs/gaudi: remove axi drain support
  habanalabs: update firmware interface file
  habanalabs: Add an option to map CB to device MMU
  habanalabs: Save context in a command buffer object
  habanalabs: no need for DMA_SHARED_BUFFER
  habanalabs: allow to wait on CS without sleep
  habanalabs/gaudi: increase timeout for boot fit load
  habanalabs: add debugfs support for MMU with 6 HOPs
  habanalabs: add num_hops to hl_mmu_properties
  habanalabs: refactor MMU as device-oriented
  habanalabs: rename mmu.c to mmu_v1.c
  habanalabs: use smallest possible alignment for virtual addresses
  habanalabs: check flag before reset because of f/w event
  habanalabs: increase PQ COMP_OFFSET by one nibble
  habanalabs: Fix alignment issue in cpucp_info structure
  habanalabs: remove unused define
  habanalabs: remove unused ASIC function pointer
  ...
This commit is contained in:
Greg Kroah-Hartman 2020-09-22 18:38:09 +02:00
commit 9e07279310
33 changed files with 8652 additions and 7694 deletions

View File

@ -2,13 +2,17 @@ What: /sys/class/habanalabs/hl<n>/armcp_kernel_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
Description: Version of the Linux kernel running on the device's CPU.
Will be DEPRECATED in Linux kernel version 5.10, and be
replaced with cpucp_kernel_ver
What: /sys/class/habanalabs/hl<n>/armcp_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
Will be DEPRECATED in Linux kernel version 5.10, and be
replaced with cpucp_ver
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
Date: Jun 2019
@ -33,6 +37,18 @@ KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Version of the Device's CPLD F/W
What: /sys/class/habanalabs/hl<n>/cpucp_kernel_ver
Date: Oct 2020
KernelVersion: 5.10
Contact: oded.gabbay@gmail.com
Description: Version of the Linux kernel running on the device's CPU
What: /sys/class/habanalabs/hl<n>/cpucp_ver
Date: Oct 2020
KernelVersion: 5.10
Contact: oded.gabbay@gmail.com
Description: Version of the application running on the device's CPU
What: /sys/class/habanalabs/hl<n>/device_type
Date: Jan 2019
KernelVersion: 5.1

View File

@ -7,7 +7,6 @@ config HABANA_AI
tristate "HabanaAI accelerators (habanalabs)"
depends on PCI && HAS_IOMEM
select FRAME_VECTOR
select DMA_SHARED_BUFFER
select GENERIC_ALLOCATOR
select HWMON
help

View File

@ -3,5 +3,5 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
common/asid.o common/habanalabs_ioctl.o \
common/command_buffer.o common/hw_queue.o common/irq.o \
common/sysfs.o common/hwmon.o common/memory.o \
common/command_submission.o common/mmu.o common/firmware_if.o \
common/pci.o
common/command_submission.o common/mmu.o common/mmu_v1.o \
common/firmware_if.o common/pci.o

View File

@ -13,6 +13,131 @@
#include <linux/uaccess.h>
#include <linux/genalloc.h>
static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_vm_va_block *va_block, *tmp;
dma_addr_t bus_addr;
u64 virt_addr;
u32 page_size = prop->pmmu.page_size;
s32 offset;
int rc;
if (!hdev->supports_cb_mapping) {
dev_err_ratelimited(hdev->dev,
"Cannot map CB because no VA range is allocated for CB mapping\n");
return -EINVAL;
}
if (!hdev->mmu_enable) {
dev_err_ratelimited(hdev->dev,
"Cannot map CB because MMU is disabled\n");
return -EINVAL;
}
INIT_LIST_HEAD(&cb->va_block_list);
for (bus_addr = cb->bus_address;
bus_addr < cb->bus_address + cb->size;
bus_addr += page_size) {
virt_addr = (u64) gen_pool_alloc(ctx->cb_va_pool, page_size);
if (!virt_addr) {
dev_err(hdev->dev,
"Failed to allocate device virtual address for CB\n");
rc = -ENOMEM;
goto err_va_pool_free;
}
va_block = kzalloc(sizeof(*va_block), GFP_KERNEL);
if (!va_block) {
rc = -ENOMEM;
gen_pool_free(ctx->cb_va_pool, virt_addr, page_size);
goto err_va_pool_free;
}
va_block->start = virt_addr;
va_block->end = virt_addr + page_size;
va_block->size = page_size;
list_add_tail(&va_block->node, &cb->va_block_list);
}
mutex_lock(&ctx->mmu_lock);
bus_addr = cb->bus_address;
offset = 0;
list_for_each_entry(va_block, &cb->va_block_list, node) {
rc = hl_mmu_map(ctx, va_block->start, bus_addr, va_block->size,
list_is_last(&va_block->node,
&cb->va_block_list));
if (rc) {
dev_err(hdev->dev, "Failed to map VA %#llx to CB\n",
va_block->start);
goto err_va_umap;
}
bus_addr += va_block->size;
offset += va_block->size;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
cb->is_mmu_mapped = true;
return 0;
err_va_umap:
list_for_each_entry(va_block, &cb->va_block_list, node) {
if (offset <= 0)
break;
hl_mmu_unmap(ctx, va_block->start, va_block->size,
offset <= va_block->size);
offset -= va_block->size;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
err_va_pool_free:
list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
list_del(&va_block->node);
kfree(va_block);
}
return rc;
}
static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb)
{
struct hl_device *hdev = ctx->hdev;
struct hl_vm_va_block *va_block, *tmp;
mutex_lock(&ctx->mmu_lock);
list_for_each_entry(va_block, &cb->va_block_list, node)
if (hl_mmu_unmap(ctx, va_block->start, va_block->size,
list_is_last(&va_block->node,
&cb->va_block_list)))
dev_warn_ratelimited(hdev->dev,
"Failed to unmap CB's va 0x%llx\n",
va_block->start);
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
mutex_unlock(&ctx->mmu_lock);
list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
list_del(&va_block->node);
kfree(va_block);
}
}
static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
{
if (cb->is_internal)
@ -47,6 +172,11 @@ static void cb_release(struct kref *ref)
hl_debugfs_remove_cb(cb);
if (cb->is_mmu_mapped)
cb_unmap_mem(cb->ctx, cb);
hl_ctx_put(cb->ctx);
cb_do_release(hdev, cb);
}
@ -107,11 +237,12 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
}
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
bool map_cb, u64 *handle)
{
struct hl_cb *cb;
bool alloc_new_cb = true;
int rc;
int rc, ctx_id = ctx->asid;
/*
* Can't use generic function to check this because of special case
@ -163,7 +294,21 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
}
cb->hdev = hdev;
cb->ctx_id = ctx_id;
cb->ctx = ctx;
hl_ctx_get(hdev, cb->ctx);
if (map_cb) {
if (ctx_id == HL_KERNEL_ASID_ID) {
dev_err(hdev->dev,
"CB mapping is not supported for kernel context\n");
rc = -EINVAL;
goto release_cb;
}
rc = cb_map_mem(ctx, cb);
if (rc)
goto release_cb;
}
spin_lock(&mgr->cb_lock);
rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
@ -171,10 +316,10 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
if (rc < 0) {
dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n");
goto release_cb;
goto unmap_mem;
}
cb->id = rc;
cb->id = (u64) rc;
kref_init(&cb->refcount);
spin_lock_init(&cb->lock);
@ -183,14 +328,18 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
* idr is 32-bit so we can safely OR it with a mask that is above
* 32 bit
*/
*handle = cb->id | HL_MMAP_CB_MASK;
*handle = cb->id | HL_MMAP_TYPE_CB;
*handle <<= PAGE_SHIFT;
hl_debugfs_add_cb(cb);
return 0;
unmap_mem:
if (cb->is_mmu_mapped)
cb_unmap_mem(cb->ctx, cb);
release_cb:
hl_ctx_put(cb->ctx);
cb_do_release(hdev, cb);
out_err:
*handle = 0;
@ -250,9 +399,10 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
args->in.cb_size, HL_MAX_CB_SIZE);
rc = -EINVAL;
} else {
rc = hl_cb_create(hdev, &hpriv->cb_mgr,
args->in.cb_size, &handle,
hpriv->ctx->asid, false);
rc = hl_cb_create(hdev, &hpriv->cb_mgr, hpriv->ctx,
args->in.cb_size, false,
!!(args->in.flags & HL_CB_FLAGS_MAP),
&handle);
}
memset(args, 0, sizeof(*args));
@ -300,11 +450,14 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_cb *cb;
phys_addr_t address;
u32 handle, user_cb_size;
int rc;
/* We use the page offset to hold the idr and thus we need to clear
* it before doing the mmap itself
*/
handle = vma->vm_pgoff;
vma->vm_pgoff = 0;
/* reference was taken here */
cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle);
@ -356,12 +509,8 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
vma->vm_private_data = cb;
/* Calculate address for CB */
address = virt_to_phys((void *) (uintptr_t) cb->kernel_address);
rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
address, cb->size);
rc = hdev->asic_funcs->cb_mmap(hdev, vma, (void *) cb->kernel_address,
cb->bus_address, cb->size);
if (rc) {
spin_lock(&cb->lock);
cb->mmap = false;
@ -425,7 +574,7 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
if (kref_put(&cb->refcount, cb_release) != 1)
dev_err(hdev->dev,
"CB %d for CTX ID %d is still alive\n",
id, cb->ctx_id);
id, cb->ctx->asid);
}
idr_destroy(&mgr->cb_handles);
@ -438,8 +587,8 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
struct hl_cb *cb;
int rc;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
HL_KERNEL_ASID_ID, internal_cb);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx, cb_size,
internal_cb, false, &cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate CB for the kernel driver %d\n", rc);
@ -495,3 +644,45 @@ int hl_cb_pool_fini(struct hl_device *hdev)
return 0;
}
int hl_cb_va_pool_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
if (!hdev->supports_cb_mapping)
return 0;
ctx->cb_va_pool = gen_pool_create(__ffs(prop->pmmu.page_size), -1);
if (!ctx->cb_va_pool) {
dev_err(hdev->dev,
"Failed to create VA gen pool for CB mapping\n");
return -ENOMEM;
}
rc = gen_pool_add(ctx->cb_va_pool, prop->cb_va_start_addr,
prop->cb_va_end_addr - prop->cb_va_start_addr, -1);
if (rc) {
dev_err(hdev->dev,
"Failed to add memory to VA gen pool for CB mapping\n");
goto err_pool_destroy;
}
return 0;
err_pool_destroy:
gen_pool_destroy(ctx->cb_va_pool);
return rc;
}
void hl_cb_va_pool_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (!hdev->supports_cb_mapping)
return;
gen_pool_destroy(ctx->cb_va_pool);
}

View File

@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref)
hw_sob->q_idx, hw_sob->sob_id);
}
static const char *hl_fence_get_driver_name(struct dma_fence *fence)
{
return "HabanaLabs";
}
static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
{
struct hl_cs_compl *hl_cs_compl =
container_of(fence, struct hl_cs_compl, base_fence);
return dev_name(hl_cs_compl->hdev->dev);
}
static bool hl_fence_enable_signaling(struct dma_fence *fence)
{
return true;
}
static void hl_fence_release(struct dma_fence *fence)
static void hl_fence_release(struct kref *kref)
{
struct hl_fence *fence =
container_of(kref, struct hl_fence, refcount);
struct hl_cs_compl *hl_cs_cmpl =
container_of(fence, struct hl_cs_compl, base_fence);
struct hl_device *hdev = hl_cs_cmpl->hdev;
@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence)
}
free:
kfree_rcu(hl_cs_cmpl, base_fence.rcu);
kfree(hl_cs_cmpl);
}
static const struct dma_fence_ops hl_fence_ops = {
.get_driver_name = hl_fence_get_driver_name,
.get_timeline_name = hl_fence_get_timeline_name,
.enable_signaling = hl_fence_enable_signaling,
.release = hl_fence_release
};
void hl_fence_put(struct hl_fence *fence)
{
if (fence)
kref_put(&fence->refcount, hl_fence_release);
}
void hl_fence_get(struct hl_fence *fence)
{
if (fence)
kref_get(&fence->refcount);
}
static void hl_fence_init(struct hl_fence *fence)
{
kref_init(&fence->refcount);
fence->error = 0;
init_completion(&fence->completion);
}
static void cs_get(struct hl_cs *cs)
{
@ -256,6 +252,8 @@ static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
ctx->cs_counters.parsing_drop_cnt;
hdev->aggregated_cs_counters.queue_full_drop_cnt +=
ctx->cs_counters.queue_full_drop_cnt;
hdev->aggregated_cs_counters.max_cs_in_flight_drop_cnt +=
ctx->cs_counters.max_cs_in_flight_drop_cnt;
}
static void cs_do_release(struct kref *ref)
@ -336,7 +334,7 @@ static void cs_do_release(struct kref *ref)
* In case the wait for signal CS was submitted, the put occurs
* in init_signal_wait_cs() right before hanging on the PQ.
*/
dma_fence_put(cs->signal_fence);
hl_fence_put(cs->signal_fence);
}
/*
@ -348,19 +346,18 @@ static void cs_do_release(struct kref *ref)
hl_ctx_put(cs->ctx);
/* We need to mark an error for not submitted because in that case
* the dma fence release flow is different. Mainly, we don't need
* the hl fence release flow is different. Mainly, we don't need
* to handle hw_sob for signal/wait
*/
if (cs->timedout)
dma_fence_set_error(cs->fence, -ETIMEDOUT);
cs->fence->error = -ETIMEDOUT;
else if (cs->aborted)
dma_fence_set_error(cs->fence, -EIO);
cs->fence->error = -EIO;
else if (!cs->submitted)
dma_fence_set_error(cs->fence, -EBUSY);
dma_fence_signal(cs->fence);
dma_fence_put(cs->fence);
cs->fence->error = -EBUSY;
complete_all(&cs->fence->completion);
hl_fence_put(cs->fence);
cs_counters_aggregate(hdev, cs->ctx);
kfree(cs->jobs_in_queue_cnt);
@ -401,7 +398,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
enum hl_cs_type cs_type, struct hl_cs **cs_new)
{
struct hl_cs_compl *cs_cmpl;
struct dma_fence *other = NULL;
struct hl_fence *other = NULL;
struct hl_cs *cs;
int rc;
@ -434,9 +431,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs_cmpl->cs_seq = ctx->cs_sequence;
other = ctx->cs_pending[cs_cmpl->cs_seq &
(hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
dev_dbg(hdev->dev,
if (other && !completion_done(&other->completion)) {
dev_dbg_ratelimited(hdev->dev,
"Rejecting CS because of too many in-flights CS\n");
ctx->cs_counters.max_cs_in_flight_drop_cnt++;
rc = -EAGAIN;
goto free_fence;
}
@ -448,8 +447,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
goto free_fence;
}
dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
ctx->asid, ctx->cs_sequence);
/* init hl_fence */
hl_fence_init(&cs_cmpl->base_fence);
cs->sequence = cs_cmpl->cs_seq;
@ -458,9 +457,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
&cs_cmpl->base_fence;
ctx->cs_sequence++;
dma_fence_get(&cs_cmpl->base_fence);
hl_fence_get(&cs_cmpl->base_fence);
dma_fence_put(other);
hl_fence_put(other);
spin_unlock(&ctx->cs_lock);
@ -690,8 +689,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
rc = -ENOMEM;
if (is_kernel_allocated_cb)
goto release_cb;
else
goto free_cs_object;
goto free_cs_object;
}
job->id = i + 1;
@ -773,7 +772,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
struct hl_ctx *ctx = hpriv->ctx;
struct hl_cs_chunk *cs_chunk_array, *chunk;
struct hw_queue_properties *hw_queue_prop;
struct dma_fence *sig_fence = NULL;
struct hl_fence *sig_fence = NULL;
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
@ -883,14 +882,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
dev_err(hdev->dev,
"CS seq 0x%llx is not of a signal CS\n",
signal_seq);
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
rc = -EINVAL;
goto free_signal_seq_array;
}
if (dma_fence_is_signaled(sig_fence)) {
if (completion_done(&sig_fence->completion)) {
/* signal CS already finished */
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
rc = 0;
goto free_signal_seq_array;
}
@ -902,7 +901,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
rc = allocate_cs(hdev, ctx, cs_type, &cs);
if (rc) {
if (cs_type == CS_TYPE_WAIT)
dma_fence_put(sig_fence);
hl_fence_put(sig_fence);
hl_ctx_put(ctx);
goto free_signal_seq_array;
}
@ -1162,7 +1161,7 @@ out:
static long _hl_cs_wait_ioctl(struct hl_device *hdev,
struct hl_ctx *ctx, u64 timeout_us, u64 seq)
{
struct dma_fence *fence;
struct hl_fence *fence;
unsigned long timeout;
long rc;
@ -1181,12 +1180,18 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
"Can't wait on CS %llu because current CS is at seq %llu\n",
seq, ctx->cs_sequence);
} else if (fence) {
rc = dma_fence_wait_timeout(fence, true, timeout);
if (!timeout_us)
rc = completion_done(&fence->completion);
else
rc = wait_for_completion_interruptible_timeout(
&fence->completion, timeout);
if (fence->error == -ETIMEDOUT)
rc = -ETIMEDOUT;
else if (fence->error == -EIO)
rc = -EIO;
dma_fence_put(fence);
hl_fence_put(fence);
} else {
dev_dbg(hdev->dev,
"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",

View File

@ -23,7 +23,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
*/
for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]);
hl_fence_put(ctx->cs_pending[i]);
kfree(ctx->cs_pending);
@ -37,6 +37,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
hl_device_set_debug_mode(hdev, false);
hl_cb_va_pool_fini(ctx);
hl_vm_ctx_fini(ctx);
hl_asid_free(hdev, ctx->asid);
} else {
@ -128,7 +129,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
sizeof(struct dma_fence *),
sizeof(struct hl_fence *),
GFP_KERNEL);
if (!ctx->cs_pending)
return -ENOMEM;
@ -155,15 +156,24 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
goto err_asid_free;
}
rc = hl_cb_va_pool_init(ctx);
if (rc) {
dev_err(hdev->dev,
"Failed to init VA pool for mapped CB\n");
goto err_vm_ctx_fini;
}
rc = hdev->asic_funcs->ctx_init(ctx);
if (rc) {
dev_err(hdev->dev, "ctx_init failed\n");
goto err_vm_ctx_fini;
goto err_cb_va_pool_fini;
}
}
return 0;
err_cb_va_pool_fini:
hl_cb_va_pool_fini(ctx);
err_vm_ctx_fini:
hl_vm_ctx_fini(ctx);
err_asid_free:
@ -184,10 +194,10 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release);
}
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence;
struct hl_fence *fence;
spin_lock(&ctx->cs_lock);
@ -201,8 +211,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return NULL;
}
fence = dma_fence_get(
ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
hl_fence_get(fence);
spin_unlock(&ctx->cs_lock);
return fence;

View File

@ -21,7 +21,7 @@ static struct dentry *hl_debug_root;
static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
u8 i2c_reg, long *val)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
@ -29,8 +29,8 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_RD <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.i2c_bus = i2c_bus;
pkt.i2c_addr = i2c_addr;
pkt.i2c_reg = i2c_reg;
@ -47,7 +47,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
u8 i2c_reg, u32 val)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
@ -55,8 +55,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_WR <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.i2c_bus = i2c_bus;
pkt.i2c_addr = i2c_addr;
pkt.i2c_reg = i2c_reg;
@ -73,7 +73,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
if (hl_device_disabled_or_in_reset(hdev))
@ -81,8 +81,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_LED_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_LED_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.led_index = cpu_to_le32(led);
pkt.value = cpu_to_le64(state);
@ -110,8 +110,8 @@ static int command_buffers_show(struct seq_file *s, void *data)
seq_puts(s, "---------------------------------------------------------------\n");
}
seq_printf(s,
" %03d %d 0x%08x %d %d %d\n",
cb->id, cb->ctx_id, cb->size,
" %03llu %d 0x%08x %d %d %d\n",
cb->id, cb->ctx->asid, cb->size,
kref_read(&cb->refcount),
cb->mmap, cb->cs_cnt);
}
@ -354,6 +354,14 @@ static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
mmu_specs->hop4_shift);
}
static inline u64 get_hop5_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_specs,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop5_mask,
mmu_specs->hop5_shift);
}
static inline u64 get_next_hop_addr(u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
@ -377,6 +385,7 @@ static int mmu_show(struct seq_file *s, void *data)
hop2_addr = 0, hop2_pte_addr = 0, hop2_pte = 0,
hop3_addr = 0, hop3_pte_addr = 0, hop3_pte = 0,
hop4_addr = 0, hop4_pte_addr = 0, hop4_pte = 0,
hop5_addr = 0, hop5_pte_addr = 0, hop5_pte = 0,
virt_addr = dev_entry->mmu_addr;
if (!hdev->mmu_enable)
@ -428,20 +437,49 @@ static int mmu_show(struct seq_file *s, void *data)
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
if (!(hop3_pte & LAST_MASK)) {
if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
if (!(hop3_pte & LAST_MASK)) {
hop4_addr = get_next_hop_addr(hop3_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
hop4_addr, virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev,
hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop3_pte & PAGE_PRESENT_MASK))
goto not_mapped;
}
} else {
hop4_addr = get_next_hop_addr(hop3_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop3_pte & PAGE_PRESENT_MASK))
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
hop4_addr, virt_addr);
hop4_pte = hdev->asic_funcs->read_pte(hdev,
hop4_pte_addr);
if (!(hop4_pte & LAST_MASK)) {
hop5_addr = get_next_hop_addr(hop4_pte);
if (hop5_addr == ULLONG_MAX)
goto not_mapped;
hop5_pte_addr = get_hop5_pte_addr(ctx, mmu_prop,
hop5_addr, virt_addr);
hop5_pte = hdev->asic_funcs->read_pte(hdev,
hop5_pte_addr);
if (!(hop5_pte & PAGE_PRESENT_MASK))
goto not_mapped;
} else {
if (!(hop4_pte & PAGE_PRESENT_MASK))
goto not_mapped;
}
}
seq_printf(s, "asid: %u, virt_addr: 0x%llx\n",
@ -463,10 +501,22 @@ static int mmu_show(struct seq_file *s, void *data)
seq_printf(s, "hop3_pte_addr: 0x%llx\n", hop3_pte_addr);
seq_printf(s, "hop3_pte: 0x%llx\n", hop3_pte);
if (!(hop3_pte & LAST_MASK)) {
if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
if (!(hop3_pte & LAST_MASK)) {
seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
}
} else {
seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
if (!(hop4_pte & LAST_MASK)) {
seq_printf(s, "hop5_addr: 0x%llx\n", hop5_addr);
seq_printf(s, "hop5_pte_addr: 0x%llx\n", hop5_pte_addr);
seq_printf(s, "hop5_pte: 0x%llx\n", hop5_pte);
}
}
goto out;

View File

@ -123,9 +123,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct hl_fpriv *hpriv = filp->private_data;
unsigned long vm_pgoff;
if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
vma->vm_pgoff ^= HL_MMAP_CB_MASK;
vm_pgoff = vma->vm_pgoff;
vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
case HL_MMAP_TYPE_CB:
return hl_cb_mmap(hpriv, vma);
}
@ -286,7 +290,7 @@ static int device_early_init(struct hl_device *hdev)
}
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
snprintf(workq_name, 32, "hl-free-jobs-%u", i);
snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
if (hdev->cq_wq[i] == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@ -317,6 +321,10 @@ static int device_early_init(struct hl_device *hdev)
goto free_chip_info;
}
rc = hl_mmu_if_set_funcs(hdev);
if (rc)
goto free_idle_busy_ts_arr;
hl_cb_mgr_init(&hdev->kernel_cb_mgr);
mutex_init(&hdev->send_cpu_message_lock);
@ -330,6 +338,8 @@ static int device_early_init(struct hl_device *hdev)
return 0;
free_idle_busy_ts_arr:
kfree(hdev->idle_busy_ts_arr);
free_chip_info:
kfree(hdev->hl_chip_info);
free_eq_wq:
@ -871,7 +881,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
* so this message won't be sent
*/
if (hl_fw_send_pci_access_msg(hdev,
ARMCP_PACKET_DISABLE_PCI_ACCESS))
CPUCP_PACKET_DISABLE_PCI_ACCESS))
dev_warn(hdev->dev,
"Failed to disable PCI access by F/W\n");
}

View File

@ -68,9 +68,9 @@ out:
int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
{
struct armcp_packet pkt = {};
struct cpucp_packet pkt = {};
pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);
return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
sizeof(pkt), 0, NULL);
@ -79,7 +79,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
u16 len, u32 timeout, long *result)
{
struct armcp_packet *pkt;
struct cpucp_packet *pkt;
dma_addr_t pkt_dma_addr;
u32 tmp;
int rc = 0;
@ -111,7 +111,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
}
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == ARMCP_PACKET_FENCE_VAL), 1000,
(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
timeout, true);
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@ -124,12 +124,12 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
tmp = le32_to_cpu(pkt->ctl);
rc = (tmp & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
if (rc) {
dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
rc,
(tmp & ARMCP_PKT_CTL_OPCODE_MASK)
>> ARMCP_PKT_CTL_OPCODE_SHIFT);
(tmp & CPUCP_PKT_CTL_OPCODE_MASK)
>> CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = -EIO;
} else if (result) {
*result = (long) le64_to_cpu(pkt->result);
@ -145,14 +145,14 @@ out:
int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(event_type);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -167,15 +167,15 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
size_t irq_arr_size)
{
struct armcp_unmask_irq_arr_packet *pkt;
struct cpucp_unmask_irq_arr_packet *pkt;
size_t total_pkt_size;
long result;
int rc;
total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
irq_arr_size;
/* data should be aligned to 8 bytes in order to ArmCP to copy it */
/* data should be aligned to 8 bytes in order to CPU-CP to copy it */
total_pkt_size = (total_pkt_size + 0x7) & ~0x7;
/* total_pkt_size is casted to u16 later on */
@ -191,8 +191,8 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0]));
memcpy(&pkt->irqs, irq_arr, irq_arr_size);
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
total_pkt_size, 0, &result);
@ -207,19 +207,19 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
int hl_fw_test_cpu_queue(struct hl_device *hdev)
{
struct armcp_packet test_pkt = {};
struct cpucp_packet test_pkt = {};
long result;
int rc;
test_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
test_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
sizeof(test_pkt), 0, &result);
if (!rc) {
if (result != ARMCP_PACKET_FENCE_VAL)
if (result != CPUCP_PACKET_FENCE_VAL)
dev_err(hdev->dev,
"CPU queue test failed (0x%08lX)\n", result);
} else {
@ -251,61 +251,61 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
int hl_fw_send_heartbeat(struct hl_device *hdev)
{
struct armcp_packet hb_pkt = {};
struct cpucp_packet hb_pkt = {};
long result;
int rc;
hb_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
sizeof(hb_pkt), 0, &result);
if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
rc = -EIO;
return rc;
}
int hl_fw_armcp_info_get(struct hl_device *hdev)
int hl_fw_cpucp_info_get(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct armcp_packet pkt = {};
void *armcp_info_cpu_addr;
dma_addr_t armcp_info_dma_addr;
struct cpucp_packet pkt = {};
void *cpucp_info_cpu_addr;
dma_addr_t cpucp_info_dma_addr;
long result;
int rc;
armcp_info_cpu_addr =
cpucp_info_cpu_addr =
hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
sizeof(struct armcp_info),
&armcp_info_dma_addr);
if (!armcp_info_cpu_addr) {
sizeof(struct cpucp_info),
&cpucp_info_dma_addr);
if (!cpucp_info_cpu_addr) {
dev_err(hdev->dev,
"Failed to allocate DMA memory for ArmCP info packet\n");
"Failed to allocate DMA memory for CPU-CP info packet\n");
return -ENOMEM;
}
memset(armcp_info_cpu_addr, 0, sizeof(struct armcp_info));
memset(cpucp_info_cpu_addr, 0, sizeof(struct cpucp_info));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(armcp_info_dma_addr);
pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));
pkt.ctl = cpu_to_le32(CPUCP_PACKET_INFO_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(cpucp_info_dma_addr);
pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_info));
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle ArmCP info pkt, error %d\n", rc);
"Failed to handle CPU-CP info pkt, error %d\n", rc);
goto out;
}
memcpy(&prop->armcp_info, armcp_info_cpu_addr,
sizeof(prop->armcp_info));
memcpy(&prop->cpucp_info, cpucp_info_cpu_addr,
sizeof(prop->cpucp_info));
rc = hl_build_hwmon_channel_info(hdev, prop->armcp_info.sensors);
rc = hl_build_hwmon_channel_info(hdev, prop->cpucp_info.sensors);
if (rc) {
dev_err(hdev->dev,
"Failed to build hwmon channel info, error %d\n", rc);
@ -315,14 +315,14 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
out:
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
sizeof(struct armcp_info), armcp_info_cpu_addr);
sizeof(struct cpucp_info), cpucp_info_cpu_addr);
return rc;
}
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
{
struct armcp_packet pkt = {};
struct cpucp_packet pkt = {};
void *eeprom_info_cpu_addr;
dma_addr_t eeprom_info_dma_addr;
long result;
@ -333,23 +333,24 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
max_size, &eeprom_info_dma_addr);
if (!eeprom_info_cpu_addr) {
dev_err(hdev->dev,
"Failed to allocate DMA memory for ArmCP EEPROM packet\n");
"Failed to allocate DMA memory for CPU-CP EEPROM packet\n");
return -ENOMEM;
}
memset(eeprom_info_cpu_addr, 0, max_size);
pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_EEPROM_DATA_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
pkt.data_max_size = cpu_to_le32(max_size);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_ARMCP_EEPROM_TIMEOUT_USEC, &result);
HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle ArmCP EEPROM packet, error %d\n", rc);
"Failed to handle CPU-CP EEPROM packet, error %d\n",
rc);
goto out;
}
@ -363,6 +364,77 @@ out:
return rc;
}
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters)
{
struct cpucp_packet pkt = {};
long result;
int rc;
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
/* Fetch PCI rx counter */
pkt.index = cpu_to_le32(cpucp_pcie_throughput_rx);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->rx_throughput = result;
/* Fetch PCI tx counter */
pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->tx_throughput = result;
/* Fetch PCI replay counter */
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->replay_cnt = (u32) result;
return rc;
}
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
{
struct cpucp_packet pkt = {};
long result;
int rc;
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TOTAL_ENERGY_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
"Failed to handle CpuCP total energy pkt, error %d\n",
rc);
return rc;
}
*total_energy = result;
return rc;
}
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
{
u32 err_val;
@ -402,8 +474,11 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
"Device boot error - NIC F/W initialization failed\n");
}
static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
{
/* Some of the status codes below are deprecated in newer f/w
* versions but we keep them here for backward compatibility
*/
switch (status) {
case CPU_BOOT_STATUS_NA:
dev_err(hdev->dev,
@ -449,6 +524,48 @@ static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
}
}
int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 timeout)
{
u32 status;
int rc;
if (!hdev->cpu_enable)
return 0;
/* Need to check two possible scenarios:
*
* CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT - for newer firmwares where
* the preboot is waiting for the boot fit
*
* All other status values - for older firmwares where the uboot was
* loaded from the FLASH
*/
rc = hl_poll_timeout(
hdev,
cpu_boot_status_reg,
status,
(status == CPU_BOOT_STATUS_IN_UBOOT) ||
(status == CPU_BOOT_STATUS_DRAM_RDY) ||
(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
(status == CPU_BOOT_STATUS_SRAM_AVAIL) ||
(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
10000,
timeout);
if (rc) {
dev_err(hdev->dev, "Failed to read preboot version\n");
detect_cpu_boot_status(hdev, status);
fw_read_errors(hdev, boot_err0_reg);
return -EIO;
}
hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
return 0;
}
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
u32 boot_err0_reg, bool skip_bmc,
@ -514,15 +631,11 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
10000,
cpu_timeout);
/* Read U-Boot, preboot versions now in case we will later fail */
/* Read U-Boot version now in case we will later fail */
hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT);
hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
/* Some of the status codes below are deprecated in newer f/w
* versions but we keep them here for backward compatibility
*/
if (rc) {
hl_detect_cpu_boot_status(hdev, status);
detect_cpu_boot_status(hdev, status);
rc = -EIO;
goto out;
}

View File

@ -8,21 +8,33 @@
#ifndef HABANALABSP_H_
#define HABANALABSP_H_
#include "../include/common/armcp_if.h"
#include "../include/common/cpucp_if.h"
#include "../include/common/qman_if.h"
#include <uapi/misc/habanalabs.h>
#include <linux/cdev.h>
#include <linux/iopoll.h>
#include <linux/irqreturn.h>
#include <linux/dma-fence.h>
#include <linux/dma-direction.h>
#include <linux/scatterlist.h>
#include <linux/hashtable.h>
#include <linux/bitfield.h>
#define HL_NAME "habanalabs"
#define HL_MMAP_CB_MASK (0x8000000000000000ull >> PAGE_SHIFT)
/* Use upper bits of mmap offset to store habana driver specific information.
* bits[63:62] - Encode mmap type
* bits[45:0] - mmap offset value
*
* NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
* defines are w.r.t to PAGE_SIZE
*/
#define HL_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT)
#define HL_MMAP_TYPE_MASK (0x3ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_TYPE_CB (0x2ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFull >> PAGE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK)
#define HL_PENDING_RESET_PER_SEC 30
@ -34,8 +46,8 @@
#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */
#define HL_ARMCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
#define HL_ARMCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
#define HL_CPUCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
#define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
#define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */
@ -66,6 +78,8 @@
#define HL_PCI_NUM_BARS 6
#define HL_MAX_DCORES 4
/**
* struct pgt_info - MMU hop page info.
* @node: hash linked-list node for the pgts shadow hash of pgts.
@ -222,12 +236,15 @@ enum hl_device_hw_state {
* @hop2_shift: shift of hop 2 mask.
* @hop3_shift: shift of hop 3 mask.
* @hop4_shift: shift of hop 4 mask.
* @hop5_shift: shift of hop 5 mask.
* @hop0_mask: mask to get the PTE address in hop 0.
* @hop1_mask: mask to get the PTE address in hop 1.
* @hop2_mask: mask to get the PTE address in hop 2.
* @hop3_mask: mask to get the PTE address in hop 3.
* @hop4_mask: mask to get the PTE address in hop 4.
* @hop5_mask: mask to get the PTE address in hop 5.
* @page_size: default page size used to allocate memory.
* @num_hops: The amount of hops supported by the translation table.
*/
struct hl_mmu_properties {
u64 start_addr;
@ -237,18 +254,21 @@ struct hl_mmu_properties {
u64 hop2_shift;
u64 hop3_shift;
u64 hop4_shift;
u64 hop5_shift;
u64 hop0_mask;
u64 hop1_mask;
u64 hop2_mask;
u64 hop3_mask;
u64 hop4_mask;
u64 hop5_mask;
u32 page_size;
u32 num_hops;
};
/**
* struct asic_fixed_properties - ASIC specific immutable properties.
* @hw_queues_props: H/W queues properties.
* @armcp_info: received various information from ArmCP regarding the H/W, e.g.
* @cpucp_info: received various information from CPU-CP regarding the H/W, e.g.
* available sensors.
* @uboot_ver: F/W U-boot version.
* @preboot_ver: F/W Preboot version.
@ -271,6 +291,10 @@ struct hl_mmu_properties {
* @pcie_aux_dbi_reg_addr: Address of the PCIE_AUX DBI register.
* @mmu_pgt_addr: base physical address in DRAM of MMU page tables.
* @mmu_dram_default_page_addr: DRAM default page physical address.
* @cb_va_start_addr: virtual start address of command buffers which are mapped
* to the device's MMU.
* @cb_va_end_addr: virtual end address of command buffers which are mapped to
* the device's MMU.
* @mmu_pgt_size: MMU page tables total size.
* @mmu_pte_size: PTE size in MMU page tables.
* @mmu_hop_table_size: MMU hop table size.
@ -292,12 +316,16 @@ struct hl_mmu_properties {
* @max_queues: maximum amount of queues in the system
* @sync_stream_first_sob: first sync object available for sync stream use
* @sync_stream_first_mon: first monitor available for sync stream use
* @first_available_user_sob: first sob available for the user
* @first_available_user_mon: first monitor available for the user
* @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
* @fw_security_disabled: true if security measures are disabled in firmware,
* false otherwise
*/
struct asic_fixed_properties {
struct hw_queue_properties *hw_queues_props;
struct armcp_info armcp_info;
struct cpucp_info cpucp_info;
char uboot_ver[VERSION_MAX_LEN];
char preboot_ver[VERSION_MAX_LEN];
struct hl_mmu_properties dmmu;
@ -317,6 +345,8 @@ struct asic_fixed_properties {
u64 pcie_aux_dbi_reg_addr;
u64 mmu_pgt_addr;
u64 mmu_dram_default_page_addr;
u64 cb_va_start_addr;
u64 cb_va_end_addr;
u32 mmu_pgt_size;
u32 mmu_pte_size;
u32 mmu_hop_table_size;
@ -338,13 +368,29 @@ struct asic_fixed_properties {
u32 max_queues;
u16 sync_stream_first_sob;
u16 sync_stream_first_mon;
u16 first_available_user_sob[HL_MAX_DCORES];
u16 first_available_user_mon[HL_MAX_DCORES];
u8 tpc_enabled_mask;
u8 completion_queues_count;
u8 fw_security_disabled;
};
/**
* struct hl_fence - software synchronization primitive
* @completion: fence is implemented using completion
* @refcount: refcount for this fence
* @error: mark this fence with error
*
*/
struct hl_fence {
struct completion completion;
struct kref refcount;
int error;
};
/**
* struct hl_cs_compl - command submission completion object.
* @base_fence: kernel fence object.
* @base_fence: hl fence object.
* @lock: spinlock to protect fence.
* @hdev: habanalabs device structure.
* @hw_sob: the H/W SOB used in this signal/wait CS.
@ -353,7 +399,7 @@ struct asic_fixed_properties {
* @sob_val: the SOB value that is used in this signal/wait CS.
*/
struct hl_cs_compl {
struct dma_fence base_fence;
struct hl_fence base_fence;
spinlock_t lock;
struct hl_device *hdev;
struct hl_hw_sob *hw_sob;
@ -380,36 +426,41 @@ struct hl_cb_mgr {
* struct hl_cb - describes a Command Buffer.
* @refcount: reference counter for usage of the CB.
* @hdev: pointer to device this CB belongs to.
* @ctx: pointer to the CB owner's context.
* @lock: spinlock to protect mmap/cs flows.
* @debugfs_list: node in debugfs list of command buffers.
* @pool_list: node in pool list of command buffers.
* @va_block_list: list of virtual addresses blocks of the CB if it is mapped to
* the device's MMU.
* @id: the CB's ID.
* @kernel_address: Holds the CB's kernel virtual address.
* @bus_address: Holds the CB's DMA address.
* @mmap_size: Holds the CB's size that was mmaped.
* @size: holds the CB's size.
* @id: the CB's ID.
* @cs_cnt: holds number of CS that this CB participates in.
* @ctx_id: holds the ID of the owner's context.
* @mmap: true if the CB is currently mmaped to user.
* @is_pool: true if CB was acquired from the pool, false otherwise.
* @is_internal: internaly allocated
* @is_mmu_mapped: true if the CB is mapped to the device's MMU.
*/
struct hl_cb {
struct kref refcount;
struct hl_device *hdev;
struct hl_ctx *ctx;
spinlock_t lock;
struct list_head debugfs_list;
struct list_head pool_list;
struct list_head va_block_list;
u64 id;
u64 kernel_address;
dma_addr_t bus_address;
u32 mmap_size;
u32 size;
u32 id;
u32 cs_cnt;
u32 ctx_id;
u8 mmap;
u8 is_pool;
u8 is_internal;
u8 is_mmu_mapped;
};
@ -435,7 +486,7 @@ struct hl_cs_job;
#define HL_EQ_LENGTH 64
#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
/* Host <-> ArmCP shared memory size */
/* Host <-> CPU-CP shared memory size */
#define HL_CPU_ACCESSIBLE_MEM_SIZE SZ_2M
/**
@ -617,7 +668,7 @@ enum div_select_defs {
* @debugfs_read32: debug interface for reading u32 from DRAM/SRAM.
* @debugfs_write32: debug interface for writing u32 to DRAM/SRAM.
* @add_device_attr: add ASIC specific device attributes.
* @handle_eqe: handle event queue entry (IRQ) from ArmCP.
* @handle_eqe: handle event queue entry (IRQ) from CPU-CP.
* @set_pll_profile: change PLL profile (manual/automatic).
* @get_events_stat: retrieve event queue entries histogram.
* @read_pte: read MMU page table entry from DRAM.
@ -626,7 +677,7 @@ enum div_select_defs {
* (L1 only) or hard (L0 & L1) flush.
* @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
* ASID-VA-size mask.
* @send_heartbeat: send is-alive packet to ArmCP and verify response.
* @send_heartbeat: send is-alive packet to CPU-CP and verify response.
* @set_clock_gating: enable/disable clock gating per engine according to
* clock gating mask in hdev
* @disable_clock_gating: disable clock gating completely
@ -644,8 +695,6 @@ enum div_select_defs {
* ASIC
* @get_hw_state: retrieve the H/W state
* @pci_bars_map: Map PCI BARs.
* @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
* old address the bar pointed to or U64_MAX for failure
* @init_iatu: Initialize the iATU unit inside the PCI controller.
* @rreg: Read a register. Needed for simulator support.
* @wreg: Write a register. Needed for simulator support.
@ -679,7 +728,7 @@ struct hl_asic_funcs {
int (*suspend)(struct hl_device *hdev);
int (*resume)(struct hl_device *hdev);
int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size);
void *cpu_addr, dma_addr_t dma_addr, size_t size);
void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
void (*pqe_write)(struct hl_device *hdev, __le64 *pqe,
struct hl_bd *bd);
@ -736,7 +785,7 @@ struct hl_asic_funcs {
void (*set_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
int (*debug_coresight)(struct hl_device *hdev, void *data);
bool (*is_device_idle)(struct hl_device *hdev, u32 *mask,
bool (*is_device_idle)(struct hl_device *hdev, u64 *mask,
struct seq_file *s);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
@ -748,7 +797,6 @@ struct hl_asic_funcs {
u16 len, u32 timeout, long *result);
enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev);
int (*pci_bars_map)(struct hl_device *hdev);
u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
int (*init_iatu)(struct hl_device *hdev);
u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
@ -800,7 +848,7 @@ struct hl_va_range {
* @hdev: pointer to the device structure.
* @refcount: reference counter for the context. Context is released only when
* this hits 0l. It is incremented on CS and CS_WAIT.
* @cs_pending: array of DMA fence objects representing pending CS.
* @cs_pending: array of hl fence objects representing pending CS.
* @host_va_range: holds available virtual addresses for host mappings.
* @host_huge_va_range: holds available virtual addresses for host mappings
* with huge pages.
@ -809,6 +857,8 @@ struct hl_va_range {
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
* MMU hash or walking the PGT requires talking this lock.
* @debugfs_list: node in debugfs list of contexts.
* @cb_va_pool: device VA pool for command buffers which are mapped to the
* device's MMU.
* @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
* to user so user could inquire about CS. It is used as
* index to cs_pending array.
@ -832,7 +882,7 @@ struct hl_ctx {
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
struct dma_fence **cs_pending;
struct hl_fence **cs_pending;
struct hl_va_range *host_va_range;
struct hl_va_range *host_huge_va_range;
struct hl_va_range *dram_va_range;
@ -840,6 +890,7 @@ struct hl_ctx {
struct mutex mmu_lock;
struct list_head debugfs_list;
struct hl_cs_counters cs_counters;
struct gen_pool *cb_va_pool;
u64 cs_sequence;
u64 *dram_default_hops;
spinlock_t cs_lock;
@ -919,8 +970,8 @@ struct hl_cs {
struct list_head job_list;
spinlock_t job_lock;
struct kref refcount;
struct dma_fence *fence;
struct dma_fence *signal_fence;
struct hl_fence *fence;
struct hl_fence *signal_fence;
struct work_struct finish_work;
struct delayed_work work_tdr;
struct list_head mirror_node;
@ -1395,6 +1446,44 @@ struct hl_device_idle_busy_ts {
ktime_t busy_to_idle_ts;
};
/**
* struct hl_mmu_priv - used for holding per-device mmu internal information.
* @mmu_pgt_pool: pool of page tables used by MMU for allocating hops.
* @mmu_shadow_hop0: shadow array of hop0 tables.
*/
struct hl_mmu_priv {
struct gen_pool *mmu_pgt_pool;
void *mmu_shadow_hop0;
};
/**
* struct hl_mmu_funcs - Device related MMU functions.
* @init: initialize the MMU module.
* @fini: release the MMU module.
* @ctx_init: Initialize a context for using the MMU module.
* @ctx_fini: disable a ctx from using the mmu module.
* @map: maps a virtual address to physical address for a context.
* @unmap: unmap a virtual address of a context.
* @flush: flush all writes from all cores to reach device MMU.
* @swap_out: marks all mapping of the given context as swapped out.
* @swap_in: marks all mapping of the given context as swapped in.
*/
struct hl_mmu_funcs {
int (*init)(struct hl_device *hdev);
void (*fini)(struct hl_device *hdev);
int (*ctx_init)(struct hl_ctx *ctx);
void (*ctx_fini)(struct hl_ctx *ctx);
int (*map)(struct hl_ctx *ctx,
u64 virt_addr, u64 phys_addr, u32 page_size,
bool is_dram_addr);
int (*unmap)(struct hl_ctx *ctx,
u64 virt_addr, bool is_dram_addr);
void (*flush)(struct hl_ctx *ctx);
void (*swap_out)(struct hl_ctx *ctx);
void (*swap_in)(struct hl_ctx *ctx);
};
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
@ -1407,8 +1496,8 @@ struct hl_device_idle_busy_ts {
* @dev: related kernel basic device structure.
* @dev_ctrl: related kernel device structure for the control device
* @work_freq: delayed work to lower device frequency if possible.
* @work_heartbeat: delayed work for ArmCP is-alive check.
* @asic_name: ASIC specific nmae.
* @work_heartbeat: delayed work for CPU-CP is-alive check.
* @asic_name: ASIC specific name.
* @asic_type: ASIC specific type.
* @completion_queue: array of hl_cq.
* @cq_wq: work queues of completion queues for executing work in process
@ -1419,22 +1508,20 @@ struct hl_device_idle_busy_ts {
* @hw_queues_mirror_list: CS mirror list for TDR.
* @hw_queues_mirror_lock: protects hw_queues_mirror_list.
* @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
* @event_queue: event queue for IRQ from ArmCP.
* @event_queue: event queue for IRQ from CPU-CP.
* @dma_pool: DMA pool for small allocations.
* @cpu_accessible_dma_mem: Host <-> ArmCP shared memory CPU address.
* @cpu_accessible_dma_address: Host <-> ArmCP shared memory DMA address.
* @cpu_accessible_dma_pool: Host <-> ArmCP shared memory pool.
* @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
* @cpu_accessible_dma_address: Host <-> CPU-CP shared memory DMA address.
* @cpu_accessible_dma_pool: Host <-> CPU-CP shared memory pool.
* @asid_bitmap: holds used/available ASIDs.
* @asid_mutex: protects asid_bitmap.
* @send_cpu_message_lock: enforces only one message in Host <-> ArmCP queue.
* @send_cpu_message_lock: enforces only one message in Host <-> CPU-CP queue.
* @debug_lock: protects critical section of setting debug mode for device
* @asic_prop: ASIC specific immutable properties.
* @asic_funcs: ASIC specific functions.
* @asic_specific: ASIC specific information to use only from ASIC files.
* @mmu_pgt_pool: pool of available MMU hops.
* @vm: virtual memory manager for MMU.
* @mmu_cache_lock: protects MMU cache invalidation as it can serve one context.
* @mmu_shadow_hop0: shadow mapping of the MMU hop 0 zone.
* @hwmon_dev: H/W monitor device.
* @pm_mng_profile: current power management profile.
* @hl_chip_info: ASIC's sensors information.
@ -1452,6 +1539,8 @@ struct hl_device_idle_busy_ts {
* @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy
* and vice-versa
* @aggregated_cs_counters: aggregated cs counters among all contexts
* @mmu_priv: device-specific MMU data.
* @mmu_func: device-related MMU functions.
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
@ -1471,6 +1560,7 @@ struct hl_device_idle_busy_ts {
* @soft_reset_cnt: number of soft reset since the driver was loaded.
* @hard_reset_cnt: number of hard reset since the driver was loaded.
* @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
* @clk_throttling_reason: bitmask represents the current clk throttling reasons
* @id: device minor.
* @id_control: minor of the control device
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@ -1479,7 +1569,7 @@ struct hl_device_idle_busy_ts {
* @late_init_done: is late init stage was done during initialization.
* @hwmon_initialized: is H/W monitor sensors was initialized.
* @hard_reset_pending: is there a hard reset work pending.
* @heartbeat: is heartbeat sanity check towards ArmCP enabled.
* @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
* @reset_on_lockup: true if a reset should be done in case of stuck CS, false
* otherwise.
* @dram_supports_virtual_memory: is MMU enabled towards DRAM.
@ -1501,6 +1591,7 @@ struct hl_device_idle_busy_ts {
* @sync_stream_queue_idx: helper index for sync stream queues initialization.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
* @supports_cb_mapping: is mapping a CB to the device's MMU supported.
*/
struct hl_device {
struct pci_dev *pdev;
@ -1513,7 +1604,7 @@ struct hl_device {
struct device *dev_ctrl;
struct delayed_work work_freq;
struct delayed_work work_heartbeat;
char asic_name[16];
char asic_name[32];
enum hl_asic_type asic_type;
struct hl_cq *completion_queue;
struct workqueue_struct **cq_wq;
@ -1535,10 +1626,8 @@ struct hl_device {
struct asic_fixed_properties asic_prop;
const struct hl_asic_funcs *asic_funcs;
void *asic_specific;
struct gen_pool *mmu_pgt_pool;
struct hl_vm vm;
struct mutex mmu_cache_lock;
void *mmu_shadow_hop0;
struct device *hwmon_dev;
enum hl_pm_mng_profile pm_mng_profile;
struct hwmon_chip_info *hl_chip_info;
@ -1562,19 +1651,23 @@ struct hl_device {
struct hl_cs_counters aggregated_cs_counters;
struct hl_mmu_priv mmu_priv;
struct hl_mmu_funcs mmu_func;
atomic64_t dram_used_mem;
u64 timeout_jiffies;
u64 max_power;
u64 clock_gating_mask;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum armcp_card_types card_type;
enum cpucp_card_types card_type;
int cs_active_cnt;
u32 major;
u32 high_pll;
u32 soft_reset_cnt;
u32 hard_reset_cnt;
u32 idle_busy_ts_idx;
u32 clk_throttling_reason;
u16 id;
u16 id_control;
u16 cpu_pci_msb_addr;
@ -1598,6 +1691,7 @@ struct hl_device {
u8 sync_stream_queue_idx;
u8 supports_coresight;
u8 supports_soft_reset;
u8 supports_cb_mapping;
/* Parameters for bring-up */
u8 mmu_enable;
@ -1739,7 +1833,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
void hl_ctx_do_release(struct kref *ref);
void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx);
int hl_ctx_put(struct hl_ctx *ctx);
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
@ -1755,7 +1849,7 @@ int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms);
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct armcp_sensor *sensors_arr);
struct cpucp_sensor *sensors_arr);
int hl_sysfs_init(struct hl_device *hdev);
void hl_sysfs_fini(struct hl_device *hdev);
@ -1763,8 +1857,9 @@ void hl_sysfs_fini(struct hl_device *hdev);
int hl_hwmon_init(struct hl_device *hdev);
void hl_hwmon_fini(struct hl_device *hdev);
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size,
u64 *handle, int ctx_id, bool internal_cb);
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
bool map_cb, u64 *handle);
int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
@ -1776,11 +1871,15 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
bool internal_cb);
int hl_cb_pool_init(struct hl_device *hdev);
int hl_cb_pool_fini(struct hl_device *hdev);
int hl_cb_va_pool_init(struct hl_ctx *ctx);
void hl_cb_va_pool_fini(struct hl_ctx *ctx);
void hl_cs_rollback_all(struct hl_device *hdev);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void hl_sob_reset_error(struct kref *ref);
void hl_fence_put(struct hl_fence *fence);
void hl_fence_get(struct hl_fence *fence);
void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev);
@ -1810,6 +1909,8 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
bool flush_pte);
void hl_mmu_swap_out(struct hl_ctx *ctx);
void hl_mmu_swap_in(struct hl_ctx *ctx);
int hl_mmu_if_set_funcs(struct hl_device *hdev);
void hl_mmu_v1_set_funcs(struct hl_device *hdev);
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
void __iomem *dst);
@ -1825,23 +1926,28 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_armcp_info_get(struct hl_device *hdev);
int hl_fw_cpucp_info_get(struct hl_device *hdev);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters);
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
u64 *total_energy);
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
u32 boot_err0_reg, bool skip_bmc,
u32 cpu_timeout, u32 boot_fit_timeout);
int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 timeout);
int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
bool is_wc[3]);
int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
u64 addr);
int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
struct hl_inbound_pci_region *pci_region);
int hl_pci_set_outbound_region(struct hl_device *hdev,
struct hl_outbound_pci_region *pci_region);
int hl_pci_init(struct hl_device *hdev);
int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 preboot_ver_timeout);
void hl_pci_fini(struct hl_device *hdev);
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);

View File

@ -11,6 +11,7 @@
#include "habanalabs.h"
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/module.h>
#define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
@ -408,6 +409,8 @@ static int hl_pci_probe(struct pci_dev *pdev,
pci_set_drvdata(pdev, hdev);
pci_enable_pcie_error_reporting(pdev);
rc = hl_device_init(hdev, hl_class);
if (rc) {
dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
@ -440,22 +443,93 @@ static void hl_pci_remove(struct pci_dev *pdev)
return;
hl_device_fini(hdev);
pci_disable_pcie_error_reporting(pdev);
pci_set_drvdata(pdev, NULL);
destroy_hdev(hdev);
}
/**
* hl_pci_err_detected - a PCI bus error detected on this device
*
* @pdev: pointer to pci device
* @state: PCI error type
*
* Called by the PCI subsystem whenever a non-correctable
* PCI bus error is detected
*/
static pci_ers_result_t
hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
struct hl_device *hdev = pci_get_drvdata(pdev);
enum pci_ers_result result;
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
dev_warn(hdev->dev, "frozen state error detected\n");
result = PCI_ERS_RESULT_NEED_RESET;
break;
case pci_channel_io_perm_failure:
dev_warn(hdev->dev, "failure state error detected\n");
result = PCI_ERS_RESULT_DISCONNECT;
break;
default:
result = PCI_ERS_RESULT_NONE;
}
hdev->asic_funcs->halt_engines(hdev, true);
return result;
}
/**
* hl_pci_err_resume - resume after a PCI slot reset
*
* @pdev: pointer to pci device
*
*/
static void hl_pci_err_resume(struct pci_dev *pdev)
{
struct hl_device *hdev = pci_get_drvdata(pdev);
dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
hl_device_resume(hdev);
}
/**
* hl_pci_err_slot_reset - a PCI slot reset has just happened
*
* @pdev: pointer to pci device
*
* Determine if the driver can recover from the PCI slot reset
*/
static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{
return PCI_ERS_RESULT_RECOVERED;
}
static const struct dev_pm_ops hl_pm_ops = {
.suspend = hl_pmops_suspend,
.resume = hl_pmops_resume,
};
static const struct pci_error_handlers hl_pci_err_handler = {
.error_detected = hl_pci_err_detected,
.slot_reset = hl_pci_err_slot_reset,
.resume = hl_pci_err_resume,
};
static struct pci_driver hl_pci_driver = {
.name = HL_NAME,
.id_table = ids,
.probe = hl_pci_probe,
.remove = hl_pci_remove,
.driver.pm = &hl_pm_ops,
.err_handler = &hl_pci_err_handler,
};
/*

View File

@ -8,6 +8,7 @@
#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
@ -64,14 +65,14 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.dram_enabled = 1;
hw_ip.num_of_events = prop->num_of_events;
memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version,
min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
memcpy(hw_ip.card_name, prop->armcp_info.card_name,
memcpy(hw_ip.card_name, prop->cpucp_info.card_name,
min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
hw_ip.module_id = le32_to_cpu(prop->armcp_info.card_location);
hw_ip.cpld_version = le32_to_cpu(prop->cpucp_info.cpld_version);
hw_ip.module_id = le32_to_cpu(prop->cpucp_info.card_location);
hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
@ -131,7 +132,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
return -EINVAL;
hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev,
&hw_idle.busy_engines_mask, NULL);
&hw_idle.busy_engines_mask_ext, NULL);
return copy_to_user(out, &hw_idle,
min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
@ -276,10 +277,45 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
}
static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_pci_counters pci_counters = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hl_fw_cpucp_pci_counters_get(hdev, &pci_counters);
if (rc)
return rc;
return copy_to_user(out, &pci_counters,
min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
}
static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_clk_throttle clk_throttle = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
return copy_to_user(out, &clk_throttle,
min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
}
static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_cs_counters cs_counters = {0};
struct hl_info_cs_counters cs_counters = { {0} };
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@ -297,6 +333,51 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0;
}
static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_info_sync_manager sm_info = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
if (args->dcore_id >= HL_MAX_DCORES)
return -EINVAL;
sm_info.first_available_sync_object =
prop->first_available_user_sob[args->dcore_id];
sm_info.first_available_monitor =
prop->first_available_user_mon[args->dcore_id];
return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size,
sizeof(sm_info))) ? -EFAULT : 0;
}
static int total_energy_consumption_info(struct hl_fpriv *hpriv,
struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
struct hl_info_energy total_energy = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hl_fw_cpucp_total_energy_get(hdev,
&total_energy.total_energy_consumption);
if (rc)
return rc;
return copy_to_user(out, &total_energy,
min((size_t) max_size, sizeof(total_energy))) ? -EFAULT : 0;
}
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
@ -360,6 +441,18 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_CS_COUNTERS:
return cs_counters_info(hpriv, args);
case HL_INFO_PCI_COUNTERS:
return pci_counters_info(hpriv, args);
case HL_INFO_CLK_THROTTLE_REASON:
return clk_throttle_info(hpriv, args);
case HL_INFO_SYNC_MANAGER:
return sync_manager_info(hpriv, args);
case HL_INFO_TOTAL_ENERGY:
return total_energy_consumption_info(hpriv, args);
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;

View File

@ -288,10 +288,10 @@ static void ext_queue_schedule_job(struct hl_cs_job *job)
ptr = cb->bus_address;
cq_pkt.data = cpu_to_le32(
((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
& CQ_ENTRY_SHADOW_INDEX_MASK) |
(1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT) |
(1 << CQ_ENTRY_READY_SHIFT));
((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
& CQ_ENTRY_SHADOW_INDEX_MASK) |
FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
FIELD_PREP(CQ_ENTRY_READY_MASK, 1));
/*
* No need to protect pi_offset because scheduling to the
@ -474,7 +474,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
* wait CS was submitted.
*/
mb();
dma_fence_put(cs->signal_fence);
hl_fence_put(cs->signal_fence);
cs->signal_fence = NULL;
}
}

View File

@ -13,7 +13,7 @@
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct armcp_sensor *sensors_arr)
struct cpucp_sensor *sensors_arr)
{
u32 counts[HWMON_NR_SENSOR_TYPES] = {0};
u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL};
@ -24,7 +24,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
enum hwmon_sensor_types type;
int rc, i, j;
for (i = 0 ; i < ARMCP_MAX_SENSORS ; i++) {
for (i = 0 ; i < CPUCP_MAX_SENSORS ; i++) {
type = le32_to_cpu(sensors_arr[i].type);
if ((type == 0) && (sensors_arr[i].flags == 0))
@ -311,13 +311,13 @@ static const struct hwmon_ops hl_hwmon_ops = {
int hl_get_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -337,13 +337,13 @@ int hl_get_temperature(struct hl_device *hdev,
int hl_set_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
@ -362,13 +362,13 @@ int hl_set_temperature(struct hl_device *hdev,
int hl_get_voltage(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -388,13 +388,13 @@ int hl_get_voltage(struct hl_device *hdev,
int hl_get_current(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -414,13 +414,13 @@ int hl_get_current(struct hl_device *hdev,
int hl_get_fan_speed(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FAN_SPEED_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FAN_SPEED_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -440,13 +440,13 @@ int hl_get_fan_speed(struct hl_device *hdev,
int hl_get_pwm_info(struct hl_device *hdev,
int sensor_index, u32 attr, long *value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
@ -466,13 +466,13 @@ int hl_get_pwm_info(struct hl_device *hdev,
void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = cpu_to_le64(value);
@ -489,13 +489,13 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
int hl_set_voltage(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
@ -514,13 +514,13 @@ int hl_set_voltage(struct hl_device *hdev,
int hl_set_current(struct hl_device *hdev,
int sensor_index, u32 attr, long value)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);
@ -549,7 +549,7 @@ int hl_hwmon_init(struct hl_device *hdev)
hdev->hl_chip_info->ops = &hl_hwmon_ops;
hdev->hwmon_dev = hwmon_device_register_with_info(dev,
prop->armcp_info.card_name, hdev,
prop->cpucp_info.card_name, hdev,
hdev->hl_chip_info, NULL);
if (IS_ERR(hdev->hwmon_dev)) {
rc = PTR_ERR(hdev->hwmon_dev);

View File

@ -11,7 +11,7 @@
/**
* struct hl_eqe_work - This structure is used to schedule work of EQ
* entry and armcp_reset event
* entry and cpucp_reset event
*
* @eq_work: workqueue object to run when EQ entry is received
* @hdev: pointer to device structure

View File

@ -505,41 +505,32 @@ static inline int add_va_block(struct hl_device *hdev,
}
/*
* get_va_block - get a virtual block with the requested size
*
* @hdev : pointer to the habanalabs device structure
* @va_range : pointer to the virtual addresses range
* @size : requested block size
* @hint_addr : hint for request address by the user
* @is_userptr : is host or DRAM memory
* get_va_block() - get a virtual block for the given size and alignment.
* @hdev: pointer to the habanalabs device structure.
* @va_range: pointer to the virtual addresses range.
* @size: requested block size.
* @hint_addr: hint for requested address by the user.
* @va_block_align: required alignment of the virtual block start address.
*
* This function does the following:
* - Iterate on the virtual block list to find a suitable virtual block for the
* requested size
* - Reserve the requested block and update the list
* - Return the start address of the virtual block
* given size and alignment.
* - Reserve the requested block and update the list.
* - Return the start address of the virtual block.
*/
static u64 get_va_block(struct hl_device *hdev,
struct hl_va_range *va_range, u64 size, u64 hint_addr,
bool is_userptr)
static u64 get_va_block(struct hl_device *hdev, struct hl_va_range *va_range,
u64 size, u64 hint_addr, u32 va_block_align)
{
struct hl_vm_va_block *va_block, *new_va_block = NULL;
u64 valid_start, valid_size, prev_start, prev_end, page_mask,
u64 valid_start, valid_size, prev_start, prev_end, align_mask,
res_valid_start = 0, res_valid_size = 0;
u32 page_size;
bool add_prev = false;
if (is_userptr)
/*
* We cannot know if the user allocated memory with huge pages
* or not, hence we continue with the biggest possible
* granularity.
*/
page_size = hdev->asic_prop.pmmu_huge.page_size;
else
page_size = hdev->asic_prop.dmmu.page_size;
align_mask = ~((u64)va_block_align - 1);
page_mask = ~((u64)page_size - 1);
/* check if hint_addr is aligned */
if (hint_addr & (va_block_align - 1))
hint_addr = 0;
mutex_lock(&va_range->lock);
@ -549,9 +540,9 @@ static u64 get_va_block(struct hl_device *hdev,
/* calc the first possible aligned addr */
valid_start = va_block->start;
if (valid_start & (page_size - 1)) {
valid_start &= page_mask;
valid_start += page_size;
if (valid_start & (va_block_align - 1)) {
valid_start &= align_mask;
valid_start += va_block_align;
if (valid_start > va_block->end)
continue;
}
@ -863,7 +854,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
struct hl_va_range *va_range;
enum vm_type_t *vm_type;
u64 ret_vaddr, hint_addr;
u32 handle = 0;
u32 handle = 0, va_block_align;
int rc;
bool is_userptr = args->flags & HL_MEM_USERPTR;
@ -873,6 +864,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
if (is_userptr) {
u64 addr = args->map_host.host_virt_addr,
size = args->map_host.mem_size;
u32 page_size = hdev->asic_prop.pmmu.page_size,
huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
rc = dma_map_host_va(hdev, addr, size, &userptr);
if (rc) {
@ -892,6 +885,27 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) userptr;
hint_addr = args->map_host.hint_addr;
handle = phys_pg_pack->handle;
/* get required alignment */
if (phys_pg_pack->page_size == page_size) {
va_range = ctx->host_va_range;
/*
* huge page alignment may be needed in case of regular
* page mapping, depending on the host VA alignment
*/
if (addr & (huge_page_size - 1))
va_block_align = page_size;
else
va_block_align = huge_page_size;
} else {
/*
* huge page alignment is needed in case of huge page
* mapping
*/
va_range = ctx->host_huge_va_range;
va_block_align = huge_page_size;
}
} else {
handle = lower_32_bits(args->map_device.handle);
@ -912,6 +926,10 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) phys_pg_pack;
hint_addr = args->map_device.hint_addr;
/* DRAM VA alignment is the same as the DRAM page size */
va_range = ctx->dram_va_range;
va_block_align = hdev->asic_prop.dmmu.page_size;
}
/*
@ -933,16 +951,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
goto hnode_err;
}
if (is_userptr)
if (phys_pg_pack->page_size == hdev->asic_prop.pmmu.page_size)
va_range = ctx->host_va_range;
else
va_range = ctx->host_huge_va_range;
else
va_range = ctx->dram_va_range;
ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
hint_addr, is_userptr);
hint_addr, va_block_align);
if (!ret_vaddr) {
dev_err(hdev->dev, "no available va block for handle %u\n",
handle);

View File

@ -1,258 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2016-2019 HabanaLabs, Ltd.
* Copyright 2016-2020 HabanaLabs, Ltd.
* All Rights Reserved.
*/
#include "habanalabs.h"
#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/genalloc.h>
#include <linux/slab.h>
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = NULL;
hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
(unsigned long) hop_addr)
if (hop_addr == pgt_info->shadow_addr)
break;
return pgt_info;
}
static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
{
struct hl_device *hdev = ctx->hdev;
gen_pool_free(hdev->mmu_pgt_pool, pgt_info->phys_addr,
hdev->asic_prop.mmu_hop_table_size);
hash_del(&pgt_info->node);
kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
kfree(pgt_info);
}
static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
_free_hop(ctx, pgt_info);
}
static u64 alloc_hop(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pgt_info *pgt_info;
u64 phys_addr, shadow_addr;
pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
if (!pgt_info)
return ULLONG_MAX;
phys_addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool,
prop->mmu_hop_table_size);
if (!phys_addr) {
dev_err(hdev->dev, "failed to allocate page\n");
goto pool_add_err;
}
shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
GFP_KERNEL);
if (!shadow_addr)
goto shadow_err;
pgt_info->phys_addr = phys_addr;
pgt_info->shadow_addr = shadow_addr;
pgt_info->ctx = ctx;
pgt_info->num_of_ptes = 0;
hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
return shadow_addr;
shadow_err:
gen_pool_free(hdev->mmu_pgt_pool, phys_addr, prop->mmu_hop_table_size);
pool_add_err:
kfree(pgt_info);
return ULLONG_MAX;
}
static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
{
return ctx->hdev->asic_prop.mmu_pgt_addr +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline u64 get_hop0_addr(struct hl_ctx *ctx)
{
return (u64) (uintptr_t) ctx->hdev->mmu_shadow_hop0 +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline void flush(struct hl_ctx *ctx)
{
/* flush all writes from all cores to reach PCI */
mb();
ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
}
/* transform the value to physical address when writing to H/W */
static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
{
/*
* The value to write is actually the address of the next shadow hop +
* flags at the 12 LSBs.
* Hence in order to get the value to write to the physical PTE, we
* clear the 12 LSBs and translate the shadow hop to its associated
* physical hop, and add back the original 12 LSBs.
*/
u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
(val & FLAGS_MASK);
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
phys_val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* do not transform the value to physical address when writing to H/W */
static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
u64 val)
{
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* clear the last and present bits */
static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
{
/* no need to transform the value to physical address */
write_final_pte(ctx, pte_addr, 0);
}
static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
{
get_pgt_info(ctx, hop_addr)->num_of_ptes++;
}
/*
* put_pte - decrement the num of ptes and free the hop if possible
*
* @ctx: pointer to the context structure
* @hop_addr: addr of the hop
*
* This function returns the number of ptes left on this hop. If the number is
* 0, it means the pte was freed.
*/
static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
int num_of_ptes_left;
pgt_info->num_of_ptes--;
/*
* Need to save the number of ptes left because free_hop might free
* the pgt_info
*/
num_of_ptes_left = pgt_info->num_of_ptes;
if (!num_of_ptes_left)
_free_hop(ctx, pgt_info);
return num_of_ptes_left;
}
static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr, u64 mask, u64 shift)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & mask) >> shift);
}
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
mmu_prop->hop0_shift);
}
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
mmu_prop->hop1_shift);
}
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
mmu_prop->hop2_shift);
}
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
mmu_prop->hop3_shift);
}
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
mmu_prop->hop4_shift);
}
static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
return curr_pte & HOP_PHYS_ADDR_MASK;
else
return ULLONG_MAX;
}
static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
bool *is_new_hop)
{
u64 hop_addr = get_next_hop_addr(ctx, curr_pte);
if (hop_addr == ULLONG_MAX) {
hop_addr = alloc_hop(ctx);
*is_new_hop = (hop_addr != ULLONG_MAX);
}
return hop_addr;
}
/* translates shadow address inside hop to a physical address */
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
{
u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
u64 shadow_hop_addr = shadow_addr & ~page_mask;
u64 pte_offset = shadow_addr & page_mask;
u64 phys_hop_addr;
if (shadow_hop_addr != get_hop0_addr(ctx))
phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
else
phys_hop_addr = get_phys_hop0_addr(ctx);
return phys_hop_addr + pte_offset;
}
#include "habanalabs.h"
static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
{
@ -263,155 +18,6 @@ static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
prop->dmmu.end_addr);
}
static int dram_default_mapping_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr, pte_val;
int rc, i, j, hop3_allocated = 0;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return 0;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL);
if (!ctx->dram_default_hops)
return -ENOMEM;
hop0_addr = get_hop0_addr(ctx);
hop1_addr = alloc_hop(ctx);
if (hop1_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 1\n");
rc = -ENOMEM;
goto hop1_err;
}
ctx->dram_default_hops[total_hops - 1] = hop1_addr;
hop2_addr = alloc_hop(ctx);
if (hop2_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 2\n");
rc = -ENOMEM;
goto hop2_err;
}
ctx->dram_default_hops[total_hops - 2] = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
ctx->dram_default_hops[i] = alloc_hop(ctx);
if (ctx->dram_default_hops[i] == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
rc = -ENOMEM;
goto hop3_err;
}
hop3_allocated++;
}
/* need only pte 0 in hops 0 and 1 */
pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_addr, pte_val);
pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_addr, pte_val);
get_pte(ctx, hop1_addr);
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, pte_val);
get_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
LAST_MASK | PAGE_PRESENT_MASK;
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
write_final_pte(ctx, hop3_pte_addr, pte_val);
get_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
flush(ctx);
return 0;
hop3_err:
for (i = 0 ; i < hop3_allocated ; i++)
free_hop(ctx, ctx->dram_default_hops[i]);
free_hop(ctx, hop2_addr);
hop2_err:
free_hop(ctx, hop1_addr);
hop1_err:
kfree(ctx->dram_default_hops);
return rc;
}
static void dram_default_mapping_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr;
int i, j;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
hop0_addr = get_hop0_addr(ctx);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
hop1_addr = ctx->dram_default_hops[total_hops - 1];
hop2_addr = ctx->dram_default_hops[total_hops - 2];
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
clear_pte(ctx, hop3_pte_addr);
put_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
hop2_pte_addr = hop2_addr;
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
clear_pte(ctx, hop2_pte_addr);
put_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
clear_pte(ctx, hop1_addr);
put_pte(ctx, hop1_addr);
clear_pte(ctx, hop0_addr);
kfree(ctx->dram_default_hops);
flush(ctx);
}
/**
* hl_mmu_init() - initialize the MMU module.
* @hdev: habanalabs device structure.
@ -424,45 +30,10 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
*/
int hl_mmu_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
if (!hdev->mmu_enable)
return 0;
hdev->mmu_pgt_pool =
gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
if (!hdev->mmu_pgt_pool) {
dev_err(hdev->dev, "Failed to create page gen pool\n");
return -ENOMEM;
}
rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr +
prop->mmu_hop0_tables_total_size,
prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
-1);
if (rc) {
dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
goto err_pool_add;
}
hdev->mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
prop->mmu_hop_table_size,
GFP_KERNEL | __GFP_ZERO);
if (ZERO_OR_NULL_PTR(hdev->mmu_shadow_hop0)) {
rc = -ENOMEM;
goto err_pool_add;
}
/* MMU H/W init will be done in device hw_init() */
if (hdev->mmu_enable)
return hdev->mmu_func.init(hdev);
return 0;
err_pool_add:
gen_pool_destroy(hdev->mmu_pgt_pool);
return rc;
}
/**
@ -477,13 +48,8 @@ err_pool_add:
*/
void hl_mmu_fini(struct hl_device *hdev)
{
if (!hdev->mmu_enable)
return;
/* MMU H/W fini was already done in device hw_fini() */
kvfree(hdev->mmu_shadow_hop0);
gen_pool_destroy(hdev->mmu_pgt_pool);
if (hdev->mmu_enable)
hdev->mmu_func.fini(hdev);
}
/**
@ -498,13 +64,10 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (!hdev->mmu_enable)
return 0;
if (hdev->mmu_enable)
return hdev->mmu_func.ctx_init(ctx);
mutex_init(&ctx->mmu_lock);
hash_init(ctx->mmu_shadow_hash);
return dram_default_mapping_init(ctx);
return 0;
}
/*
@ -520,160 +83,9 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
void hl_mmu_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct pgt_info *pgt_info;
struct hlist_node *tmp;
int i;
if (!hdev->mmu_enable)
return;
dram_default_mapping_fini(ctx);
if (!hash_empty(ctx->mmu_shadow_hash))
dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
ctx->asid);
hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
dev_err_ratelimited(hdev->dev,
"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
_free_hop(ctx, pgt_info);
}
mutex_destroy(&ctx->mmu_lock);
}
static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte;
bool is_huge, clear_hop3 = true;
/* shifts and masks are the same in PMMU and HPMMU, use one of them */
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_next_hop_addr(ctx, curr_pte);
if (hop1_addr == ULLONG_MAX)
goto not_mapped;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_next_hop_addr(ctx, curr_pte);
if (hop2_addr == ULLONG_MAX)
goto not_mapped;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_next_hop_addr(ctx, curr_pte);
if (hop3_addr == ULLONG_MAX)
goto not_mapped;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
is_huge = curr_pte & LAST_MASK;
if (is_dram_addr && !is_huge) {
dev_err(hdev->dev,
"DRAM unmapping should use huge pages only\n");
return -EFAULT;
}
if (!is_huge) {
hop4_addr = get_next_hop_addr(ctx, curr_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
clear_hop3 = false;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte == default_pte) {
dev_err(hdev->dev,
"DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
if (!(curr_pte & PAGE_PRESENT_MASK)) {
dev_err(hdev->dev,
"DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
write_final_pte(ctx, hop3_pte_addr, default_pte);
put_pte(ctx, hop3_addr);
} else {
if (!(curr_pte & PAGE_PRESENT_MASK))
goto not_mapped;
if (hop4_addr)
clear_pte(ctx, hop4_pte_addr);
else
clear_pte(ctx, hop3_pte_addr);
if (hop4_addr && !put_pte(ctx, hop4_addr))
clear_hop3 = true;
if (!clear_hop3)
goto mapped;
clear_pte(ctx, hop3_pte_addr);
if (put_pte(ctx, hop3_addr))
goto mapped;
clear_pte(ctx, hop2_pte_addr);
if (put_pte(ctx, hop2_addr))
goto mapped;
clear_pte(ctx, hop1_pte_addr);
if (put_pte(ctx, hop1_addr))
goto mapped;
clear_pte(ctx, hop0_pte_addr);
}
mapped:
return 0;
not_mapped:
dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
virt_addr);
return -EINVAL;
if (hdev->mmu_enable)
hdev->mmu_func.ctx_fini(ctx);
}
/*
@ -738,7 +150,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
real_virt_addr = virt_addr;
for (i = 0 ; i < npages ; i++) {
rc = _hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr);
rc = hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr);
if (rc)
break;
@ -746,172 +158,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
}
if (flush_pte)
flush(ctx);
return rc;
}
static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
u32 page_size, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte = 0;
bool hop1_new = false, hop2_new = false, hop3_new = false,
hop4_new = false, is_huge;
int rc = -ENOMEM;
/*
* This mapping function can map a page or a huge page. For huge page
* there are only 3 hops rather than 4. Currently the DRAM allocation
* uses huge pages only but user memory could have been allocated with
* one of the two page sizes. Since this is a common code for all the
* three cases, we need this hugs page check.
*/
if (is_dram_addr) {
mmu_prop = &prop->dmmu;
is_huge = true;
} else if (page_size == prop->pmmu_huge.page_size) {
mmu_prop = &prop->pmmu_huge;
is_huge = true;
} else {
mmu_prop = &prop->pmmu;
is_huge = false;
}
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
if (hop1_addr == ULLONG_MAX)
goto err;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
if (hop2_addr == ULLONG_MAX)
goto err;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
if (hop3_addr == ULLONG_MAX)
goto err;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
if (!is_huge) {
hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
if (hop4_addr == ULLONG_MAX)
goto err;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte != default_pte) {
dev_err(hdev->dev,
"DRAM: mapping already exists for virt_addr 0x%llx\n",
virt_addr);
rc = -EINVAL;
goto err;
}
if (hop1_new || hop2_new || hop3_new || hop4_new) {
dev_err(hdev->dev,
"DRAM mapping should not allocate more hops\n");
rc = -EFAULT;
goto err;
}
} else if (curr_pte & PAGE_PRESENT_MASK) {
dev_err(hdev->dev,
"mapping already exists for virt_addr 0x%llx\n",
virt_addr);
dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
if (!is_huge)
dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop4_pte_addr,
hop4_pte_addr);
rc = -EINVAL;
goto err;
}
curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
| PAGE_PRESENT_MASK;
if (is_huge)
write_final_pte(ctx, hop3_pte_addr, curr_pte);
else
write_final_pte(ctx, hop4_pte_addr, curr_pte);
if (hop1_new) {
curr_pte =
(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_pte_addr, curr_pte);
}
if (hop2_new) {
curr_pte =
(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_pte_addr, curr_pte);
get_pte(ctx, hop1_addr);
}
if (hop3_new) {
curr_pte =
(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, curr_pte);
get_pte(ctx, hop2_addr);
}
if (!is_huge) {
if (hop4_new) {
curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop3_pte_addr, curr_pte);
get_pte(ctx, hop3_addr);
}
get_pte(ctx, hop4_addr);
} else {
get_pte(ctx, hop3_addr);
}
return 0;
err:
if (hop4_new)
free_hop(ctx, hop4_addr);
if (hop3_new)
free_hop(ctx, hop3_addr);
if (hop2_new)
free_hop(ctx, hop2_addr);
if (hop1_new)
free_hop(ctx, hop1_addr);
hdev->mmu_func.flush(ctx);
return rc;
}
@ -984,7 +231,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
real_phys_addr = phys_addr;
for (i = 0 ; i < npages ; i++) {
rc = _hl_mmu_map(ctx, real_virt_addr, real_phys_addr,
rc = hdev->mmu_func.map(ctx, real_virt_addr, real_phys_addr,
real_page_size, is_dram_addr);
if (rc)
goto err;
@ -995,21 +242,21 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
}
if (flush_pte)
flush(ctx);
hdev->mmu_func.flush(ctx);
return 0;
err:
real_virt_addr = virt_addr;
for (i = 0 ; i < mapped_cnt ; i++) {
if (_hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr))
if (hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr))
dev_warn_ratelimited(hdev->dev,
"failed to unmap va: 0x%llx\n", real_virt_addr);
real_virt_addr += real_page_size;
}
flush(ctx);
hdev->mmu_func.flush(ctx);
return rc;
}
@ -1022,7 +269,10 @@ err:
*/
void hl_mmu_swap_out(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (hdev->mmu_enable)
hdev->mmu_func.swap_out(ctx);
}
/*
@ -1033,5 +283,27 @@ void hl_mmu_swap_out(struct hl_ctx *ctx)
*/
void hl_mmu_swap_in(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
if (hdev->mmu_enable)
hdev->mmu_func.swap_in(ctx);
}
int hl_mmu_if_set_funcs(struct hl_device *hdev)
{
if (!hdev->mmu_enable)
return 0;
switch (hdev->asic_type) {
case ASIC_GOYA:
case ASIC_GAUDI:
hl_mmu_v1_set_funcs(hdev);
break;
default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
hdev->asic_type);
return -EOPNOTSUPP;
}
return 0;
}

View File

@ -0,0 +1,863 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2016-2019 HabanaLabs, Ltd.
* All Rights Reserved.
*/
#include "habanalabs.h"
#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/genalloc.h>
#include <linux/slab.h>
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = NULL;
hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
(unsigned long) hop_addr)
if (hop_addr == pgt_info->shadow_addr)
break;
return pgt_info;
}
static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
{
struct hl_device *hdev = ctx->hdev;
gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, pgt_info->phys_addr,
hdev->asic_prop.mmu_hop_table_size);
hash_del(&pgt_info->node);
kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
kfree(pgt_info);
}
static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
_free_hop(ctx, pgt_info);
}
static u64 alloc_hop(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pgt_info *pgt_info;
u64 phys_addr, shadow_addr;
pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
if (!pgt_info)
return ULLONG_MAX;
phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.mmu_pgt_pool,
prop->mmu_hop_table_size);
if (!phys_addr) {
dev_err(hdev->dev, "failed to allocate page\n");
goto pool_add_err;
}
shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
GFP_KERNEL);
if (!shadow_addr)
goto shadow_err;
pgt_info->phys_addr = phys_addr;
pgt_info->shadow_addr = shadow_addr;
pgt_info->ctx = ctx;
pgt_info->num_of_ptes = 0;
hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
return shadow_addr;
shadow_err:
gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, phys_addr,
prop->mmu_hop_table_size);
pool_add_err:
kfree(pgt_info);
return ULLONG_MAX;
}
static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
{
return ctx->hdev->asic_prop.mmu_pgt_addr +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static inline u64 get_hop0_addr(struct hl_ctx *ctx)
{
return (u64) (uintptr_t) ctx->hdev->mmu_priv.mmu_shadow_hop0 +
(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
}
static void flush(struct hl_ctx *ctx)
{
/* flush all writes from all cores to reach PCI */
mb();
ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
}
/* transform the value to physical address when writing to H/W */
static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
{
/*
* The value to write is actually the address of the next shadow hop +
* flags at the 12 LSBs.
* Hence in order to get the value to write to the physical PTE, we
* clear the 12 LSBs and translate the shadow hop to its associated
* physical hop, and add back the original 12 LSBs.
*/
u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
(val & FLAGS_MASK);
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
phys_val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* do not transform the value to physical address when writing to H/W */
static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
u64 val)
{
ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr),
val);
*(u64 *) (uintptr_t) shadow_pte_addr = val;
}
/* clear the last and present bits */
static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
{
/* no need to transform the value to physical address */
write_final_pte(ctx, pte_addr, 0);
}
static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
{
get_pgt_info(ctx, hop_addr)->num_of_ptes++;
}
/*
* put_pte - decrement the num of ptes and free the hop if possible
*
* @ctx: pointer to the context structure
* @hop_addr: addr of the hop
*
* This function returns the number of ptes left on this hop. If the number is
* 0, it means the pte was freed.
*/
static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
int num_of_ptes_left;
pgt_info->num_of_ptes--;
/*
* Need to save the number of ptes left because free_hop might free
* the pgt_info
*/
num_of_ptes_left = pgt_info->num_of_ptes;
if (!num_of_ptes_left)
_free_hop(ctx, pgt_info);
return num_of_ptes_left;
}
static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
u64 virt_addr, u64 mask, u64 shift)
{
return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
((virt_addr & mask) >> shift);
}
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
mmu_prop->hop0_shift);
}
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
mmu_prop->hop1_shift);
}
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
mmu_prop->hop2_shift);
}
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
mmu_prop->hop3_shift);
}
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{
return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
mmu_prop->hop4_shift);
}
static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
{
if (curr_pte & PAGE_PRESENT_MASK)
return curr_pte & HOP_PHYS_ADDR_MASK;
else
return ULLONG_MAX;
}
static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
bool *is_new_hop)
{
u64 hop_addr = get_next_hop_addr(ctx, curr_pte);
if (hop_addr == ULLONG_MAX) {
hop_addr = alloc_hop(ctx);
*is_new_hop = (hop_addr != ULLONG_MAX);
}
return hop_addr;
}
/* translates shadow address inside hop to a physical address */
static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
{
u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
u64 shadow_hop_addr = shadow_addr & ~page_mask;
u64 pte_offset = shadow_addr & page_mask;
u64 phys_hop_addr;
if (shadow_hop_addr != get_hop0_addr(ctx))
phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
else
phys_hop_addr = get_phys_hop0_addr(ctx);
return phys_hop_addr + pte_offset;
}
static int dram_default_mapping_init(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr, pte_val;
int rc, i, j, hop3_allocated = 0;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return 0;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL);
if (!ctx->dram_default_hops)
return -ENOMEM;
hop0_addr = get_hop0_addr(ctx);
hop1_addr = alloc_hop(ctx);
if (hop1_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 1\n");
rc = -ENOMEM;
goto hop1_err;
}
ctx->dram_default_hops[total_hops - 1] = hop1_addr;
hop2_addr = alloc_hop(ctx);
if (hop2_addr == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 2\n");
rc = -ENOMEM;
goto hop2_err;
}
ctx->dram_default_hops[total_hops - 2] = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
ctx->dram_default_hops[i] = alloc_hop(ctx);
if (ctx->dram_default_hops[i] == ULLONG_MAX) {
dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
rc = -ENOMEM;
goto hop3_err;
}
hop3_allocated++;
}
/* need only pte 0 in hops 0 and 1 */
pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_addr, pte_val);
pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_addr, pte_val);
get_pte(ctx, hop1_addr);
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, pte_val);
get_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
LAST_MASK | PAGE_PRESENT_MASK;
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
write_final_pte(ctx, hop3_pte_addr, pte_val);
get_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
flush(ctx);
return 0;
hop3_err:
for (i = 0 ; i < hop3_allocated ; i++)
free_hop(ctx, ctx->dram_default_hops[i]);
free_hop(ctx, hop2_addr);
hop2_err:
free_hop(ctx, hop1_addr);
hop1_err:
kfree(ctx->dram_default_hops);
return rc;
}
static void dram_default_mapping_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
hop2_pte_addr, hop3_pte_addr;
int i, j;
if ((!hdev->dram_supports_virtual_memory) ||
(!hdev->dram_default_page_mapping) ||
(ctx->asid == HL_KERNEL_ASID_ID))
return;
num_of_hop3 = prop->dram_size_for_default_page_mapping;
do_div(num_of_hop3, prop->dram_page_size);
do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
hop0_addr = get_hop0_addr(ctx);
/* add hop1 and hop2 */
total_hops = num_of_hop3 + 2;
hop1_addr = ctx->dram_default_hops[total_hops - 1];
hop2_addr = ctx->dram_default_hops[total_hops - 2];
for (i = 0 ; i < num_of_hop3 ; i++) {
hop3_pte_addr = ctx->dram_default_hops[i];
for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
clear_pte(ctx, hop3_pte_addr);
put_pte(ctx, ctx->dram_default_hops[i]);
hop3_pte_addr += HL_PTE_SIZE;
}
}
hop2_pte_addr = hop2_addr;
hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) {
clear_pte(ctx, hop2_pte_addr);
put_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE;
}
clear_pte(ctx, hop1_addr);
put_pte(ctx, hop1_addr);
clear_pte(ctx, hop0_addr);
kfree(ctx->dram_default_hops);
flush(ctx);
}
/**
* hl_mmu_v1_init() - initialize the MMU module.
* @hdev: habanalabs device structure.
*
* This function does the following:
* - Create a pool of pages for pgt_infos.
* - Create a shadow table for pgt
*
* Return: 0 for success, non-zero for failure.
*/
static int hl_mmu_v1_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
hdev->mmu_priv.mmu_pgt_pool =
gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
if (!hdev->mmu_priv.mmu_pgt_pool) {
dev_err(hdev->dev, "Failed to create page gen pool\n");
return -ENOMEM;
}
rc = gen_pool_add(hdev->mmu_priv.mmu_pgt_pool, prop->mmu_pgt_addr +
prop->mmu_hop0_tables_total_size,
prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
-1);
if (rc) {
dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
goto err_pool_add;
}
hdev->mmu_priv.mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
prop->mmu_hop_table_size,
GFP_KERNEL | __GFP_ZERO);
if (ZERO_OR_NULL_PTR(hdev->mmu_priv.mmu_shadow_hop0)) {
rc = -ENOMEM;
goto err_pool_add;
}
/* MMU H/W init will be done in device hw_init() */
return 0;
err_pool_add:
gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
return rc;
}
/**
* hl_mmu_fini() - release the MMU module.
* @hdev: habanalabs device structure.
*
* This function does the following:
* - Disable MMU in H/W.
* - Free the pgt_infos pool.
*
* All contexts should be freed before calling this function.
*/
static void hl_mmu_v1_fini(struct hl_device *hdev)
{
/* MMU H/W fini was already done in device hw_fini() */
kvfree(hdev->mmu_priv.mmu_shadow_hop0);
gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
}
/**
* hl_mmu_ctx_init() - initialize a context for using the MMU module.
* @ctx: pointer to the context structure to initialize.
*
* Initialize a mutex to protect the concurrent mapping flow, a hash to hold all
* page tables hops related to this context.
* Return: 0 on success, non-zero otherwise.
*/
static int hl_mmu_v1_ctx_init(struct hl_ctx *ctx)
{
mutex_init(&ctx->mmu_lock);
hash_init(ctx->mmu_shadow_hash);
return dram_default_mapping_init(ctx);
}
/*
* hl_mmu_ctx_fini - disable a ctx from using the mmu module
*
* @ctx: pointer to the context structure
*
* This function does the following:
* - Free any pgts which were not freed yet
* - Free the mutex
* - Free DRAM default page mapping hops
*/
static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
struct pgt_info *pgt_info;
struct hlist_node *tmp;
int i;
dram_default_mapping_fini(ctx);
if (!hash_empty(ctx->mmu_shadow_hash))
dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
ctx->asid);
hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
dev_err_ratelimited(hdev->dev,
"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
_free_hop(ctx, pgt_info);
}
mutex_destroy(&ctx->mmu_lock);
}
static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
u64 virt_addr, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte;
bool is_huge, clear_hop3 = true;
/* shifts and masks are the same in PMMU and HPMMU, use one of them */
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_next_hop_addr(ctx, curr_pte);
if (hop1_addr == ULLONG_MAX)
goto not_mapped;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_next_hop_addr(ctx, curr_pte);
if (hop2_addr == ULLONG_MAX)
goto not_mapped;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_next_hop_addr(ctx, curr_pte);
if (hop3_addr == ULLONG_MAX)
goto not_mapped;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
is_huge = curr_pte & LAST_MASK;
if (is_dram_addr && !is_huge) {
dev_err(hdev->dev,
"DRAM unmapping should use huge pages only\n");
return -EFAULT;
}
if (!is_huge) {
hop4_addr = get_next_hop_addr(ctx, curr_pte);
if (hop4_addr == ULLONG_MAX)
goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
clear_hop3 = false;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte == default_pte) {
dev_err(hdev->dev,
"DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
if (!(curr_pte & PAGE_PRESENT_MASK)) {
dev_err(hdev->dev,
"DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n",
virt_addr);
goto not_mapped;
}
write_final_pte(ctx, hop3_pte_addr, default_pte);
put_pte(ctx, hop3_addr);
} else {
if (!(curr_pte & PAGE_PRESENT_MASK))
goto not_mapped;
if (hop4_addr)
clear_pte(ctx, hop4_pte_addr);
else
clear_pte(ctx, hop3_pte_addr);
if (hop4_addr && !put_pte(ctx, hop4_addr))
clear_hop3 = true;
if (!clear_hop3)
goto mapped;
clear_pte(ctx, hop3_pte_addr);
if (put_pte(ctx, hop3_addr))
goto mapped;
clear_pte(ctx, hop2_pte_addr);
if (put_pte(ctx, hop2_addr))
goto mapped;
clear_pte(ctx, hop1_pte_addr);
if (put_pte(ctx, hop1_addr))
goto mapped;
clear_pte(ctx, hop0_pte_addr);
}
mapped:
return 0;
not_mapped:
dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
virt_addr);
return -EINVAL;
}
static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
u32 page_size, bool is_dram_addr)
{
struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0,
curr_pte = 0;
bool hop1_new = false, hop2_new = false, hop3_new = false,
hop4_new = false, is_huge;
int rc = -ENOMEM;
/*
* This mapping function can map a page or a huge page. For huge page
* there are only 3 hops rather than 4. Currently the DRAM allocation
* uses huge pages only but user memory could have been allocated with
* one of the two page sizes. Since this is a common code for all the
* three cases, we need this hugs page check.
*/
if (is_dram_addr) {
mmu_prop = &prop->dmmu;
is_huge = true;
} else if (page_size == prop->pmmu_huge.page_size) {
mmu_prop = &prop->pmmu_huge;
is_huge = true;
} else {
mmu_prop = &prop->pmmu;
is_huge = false;
}
hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
if (hop1_addr == ULLONG_MAX)
goto err;
hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
if (hop2_addr == ULLONG_MAX)
goto err;
hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
if (hop3_addr == ULLONG_MAX)
goto err;
hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
if (!is_huge) {
hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
if (hop4_addr == ULLONG_MAX)
goto err;
hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
}
if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr &
HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK;
if (curr_pte != default_pte) {
dev_err(hdev->dev,
"DRAM: mapping already exists for virt_addr 0x%llx\n",
virt_addr);
rc = -EINVAL;
goto err;
}
if (hop1_new || hop2_new || hop3_new || hop4_new) {
dev_err(hdev->dev,
"DRAM mapping should not allocate more hops\n");
rc = -EFAULT;
goto err;
}
} else if (curr_pte & PAGE_PRESENT_MASK) {
dev_err(hdev->dev,
"mapping already exists for virt_addr 0x%llx\n",
virt_addr);
dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
if (!is_huge)
dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
*(u64 *) (uintptr_t) hop4_pte_addr,
hop4_pte_addr);
rc = -EINVAL;
goto err;
}
curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
| PAGE_PRESENT_MASK;
if (is_huge)
write_final_pte(ctx, hop3_pte_addr, curr_pte);
else
write_final_pte(ctx, hop4_pte_addr, curr_pte);
if (hop1_new) {
curr_pte =
(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_pte_addr, curr_pte);
}
if (hop2_new) {
curr_pte =
(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_pte_addr, curr_pte);
get_pte(ctx, hop1_addr);
}
if (hop3_new) {
curr_pte =
(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, curr_pte);
get_pte(ctx, hop2_addr);
}
if (!is_huge) {
if (hop4_new) {
curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
write_pte(ctx, hop3_pte_addr, curr_pte);
get_pte(ctx, hop3_addr);
}
get_pte(ctx, hop4_addr);
} else {
get_pte(ctx, hop3_addr);
}
return 0;
err:
if (hop4_new)
free_hop(ctx, hop4_addr);
if (hop3_new)
free_hop(ctx, hop3_addr);
if (hop2_new)
free_hop(ctx, hop2_addr);
if (hop1_new)
free_hop(ctx, hop1_addr);
return rc;
}
/*
* hl_mmu_v1_swap_out - marks all mapping of the given ctx as swapped out
*
* @ctx: pointer to the context structure
*
*/
static void hl_mmu_v1_swap_out(struct hl_ctx *ctx)
{
}
/*
* hl_mmu_v1_swap_in - marks all mapping of the given ctx as swapped in
*
* @ctx: pointer to the context structure
*
*/
static void hl_mmu_v1_swap_in(struct hl_ctx *ctx)
{
}
/*
* hl_mmu_v1_prepare - prepare mmu for working with mmu v1
*
* @hdev: pointer to the device structure
*/
void hl_mmu_v1_set_funcs(struct hl_device *hdev)
{
struct hl_mmu_funcs *mmu = &hdev->mmu_func;
mmu->init = hl_mmu_v1_init;
mmu->fini = hl_mmu_v1_fini;
mmu->ctx_init = hl_mmu_v1_ctx_init;
mmu->ctx_fini = hl_mmu_v1_ctx_fini;
mmu->map = _hl_mmu_v1_map;
mmu->unmap = _hl_mmu_v1_unmap;
mmu->flush = flush;
mmu->swap_out = hl_mmu_v1_swap_out;
mmu->swap_in = hl_mmu_v1_swap_in;
}

View File

@ -9,7 +9,6 @@
#include "../include/hw_ip/pci/pci_general.h"
#include <linux/pci.h>
#include <linux/bitfield.h>
#define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10)
@ -339,12 +338,17 @@ static int hl_pci_set_dma_mask(struct hl_device *hdev)
/**
* hl_pci_init() - PCI initialization code.
* @hdev: Pointer to hl_device structure.
* @cpu_boot_status_reg: status register of the device's CPU
* @boot_err0_reg: boot error register of the device's CPU
* @preboot_ver_timeout: how much to wait before bailing out on reading
* the preboot version
*
* Set DMA masks, initialize the PCI controller and map the PCI BARs.
*
* Return: 0 on success, non-zero for failure.
*/
int hl_pci_init(struct hl_device *hdev)
int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 boot_err0_reg, u32 preboot_ver_timeout)
{
struct pci_dev *pdev = hdev->pdev;
int rc;
@ -376,6 +380,15 @@ int hl_pci_init(struct hl_device *hdev)
if (rc)
goto unmap_pci_bars;
/* Before continuing in the initialization, we need to read the preboot
* version to determine whether we run with a security-enabled firmware
* The check will be done in each ASIC's specific code
*/
rc = hl_fw_read_preboot_ver(hdev, cpu_boot_status_reg, boot_err0_reg,
preboot_ver_timeout);
if (rc)
goto unmap_pci_bars;
return 0;
unmap_pci_bars:

View File

@ -11,18 +11,18 @@
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
if (curr)
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_CURR_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_CURR_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
else
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.pll_index = cpu_to_le32(pll_index);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -40,13 +40,13 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.pll_index = cpu_to_le32(pll_index);
pkt.value = cpu_to_le64(freq);
@ -61,14 +61,14 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
u64 hl_get_max_power(struct hl_device *hdev)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, &result);
@ -83,13 +83,13 @@ u64 hl_get_max_power(struct hl_device *hdev)
void hl_set_max_power(struct hl_device *hdev)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_SET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_SET <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(hdev->max_power);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -112,7 +112,7 @@ static ssize_t armcp_kernel_ver_show(struct device *dev,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.armcp_info.kernel_version);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
}
static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
@ -120,7 +120,7 @@ static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.armcp_version);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
}
static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
@ -129,7 +129,23 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "0x%08x\n",
hdev->asic_prop.armcp_info.cpld_version);
hdev->asic_prop.cpucp_info.cpld_version);
}
static ssize_t cpucp_kernel_ver_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
}
static ssize_t cpucp_ver_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
}
static ssize_t infineon_ver_show(struct device *dev,
@ -138,7 +154,7 @@ static ssize_t infineon_ver_show(struct device *dev,
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "0x%04x\n",
hdev->asic_prop.armcp_info.infineon_version);
hdev->asic_prop.cpucp_info.infineon_version);
}
static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
@ -146,7 +162,7 @@ static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.fuse_version);
return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.fuse_version);
}
static ssize_t thermal_ver_show(struct device *dev,
@ -154,7 +170,7 @@ static ssize_t thermal_ver_show(struct device *dev,
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%s", hdev->asic_prop.armcp_info.thermal_version);
return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.thermal_version);
}
static ssize_t preboot_btl_ver_show(struct device *dev,
@ -356,6 +372,8 @@ out:
static DEVICE_ATTR_RO(armcp_kernel_ver);
static DEVICE_ATTR_RO(armcp_ver);
static DEVICE_ATTR_RO(cpld_ver);
static DEVICE_ATTR_RO(cpucp_kernel_ver);
static DEVICE_ATTR_RO(cpucp_ver);
static DEVICE_ATTR_RO(device_type);
static DEVICE_ATTR_RO(fuse_ver);
static DEVICE_ATTR_WO(hard_reset);
@ -380,6 +398,8 @@ static struct attribute *hl_dev_attrs[] = {
&dev_attr_armcp_kernel_ver.attr,
&dev_attr_armcp_ver.attr,
&dev_attr_cpld_ver.attr,
&dev_attr_cpucp_kernel_ver.attr,
&dev_attr_cpucp_ver.attr,
&dev_attr_device_type.attr,
&dev_attr_fuse_ver.attr,
&dev_attr_hard_reset.attr,

View File

@ -21,7 +21,6 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/iommu.h>
#include <linux/seq_file.h>
#include <linux/bitfield.h>
/*
* Gaudi security scheme:
@ -360,13 +359,14 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
u32 tpc_id);
static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
static int gaudi_armcp_info_get(struct hl_device *hdev);
static int gaudi_cpucp_info_get(struct hl_device *hdev);
static void gaudi_disable_clock_gating(struct hl_device *hdev);
static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid);
static int gaudi_get_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u32 num_sync_stream_queues = 0;
int i;
prop->max_queues = GAUDI_QUEUE_ID_SIZE;
@ -383,6 +383,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 1;
prop->hw_queues_props[i].supports_sync_stream = 1;
num_sync_stream_queues++;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
@ -440,6 +441,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->pmmu.end_addr =
(VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@ -464,11 +466,16 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
strncpy(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
prop->first_available_user_sob[HL_GAUDI_WS_DCORE] =
num_sync_stream_queues * HL_RSVD_SOBS;
prop->first_available_user_mon[HL_GAUDI_WS_DCORE] =
num_sync_stream_queues * HL_RSVD_MONS;
return 0;
}
@ -592,10 +599,15 @@ static int gaudi_early_init(struct hl_device *hdev)
prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
rc = hl_pci_init(hdev);
rc = hl_pci_init(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
mmCPU_BOOT_ERR0, GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
if (rc)
goto free_queue_props;
/* GAUDI Firmware does not yet support security */
prop->fw_security_disabled = true;
dev_info(hdev->dev, "firmware-level security is disabled\n");
return 0;
free_queue_props:
@ -675,10 +687,10 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev,
init_tpc_mem_pkt->tsize = cpu_to_le32(tpc_kernel_size);
ctl = ((PACKET_LIN_DMA << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_LIN_DMA_CTL_LIN_SHIFT) |
(1 << GAUDI_PKT_CTL_RB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT));
ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
init_tpc_mem_pkt->ctl = cpu_to_le32(ctl);
@ -780,13 +792,13 @@ static int gaudi_late_init(struct hl_device *hdev)
struct gaudi_device *gaudi = hdev->asic_specific;
int rc;
rc = gaudi->armcp_info_get(hdev);
rc = gaudi->cpucp_info_get(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to get armcp info\n");
dev_err(hdev->dev, "Failed to get cpucp info\n");
return rc;
}
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS);
if (rc) {
dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
return rc;
@ -811,7 +823,7 @@ static int gaudi_late_init(struct hl_device *hdev)
return 0;
disable_pci_access:
hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
return rc;
}
@ -981,7 +993,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
}
}
gaudi->armcp_info_get = gaudi_armcp_info_get;
gaudi->cpucp_info_get = gaudi_cpucp_info_get;
gaudi->max_freq_value = GAUDI_MAX_CLK_FREQ;
@ -1911,6 +1923,9 @@ static void gaudi_init_dma_core(struct hl_device *hdev, int dma_id)
WREG32(mmDMA0_CORE_RD_MAX_OUTSTAND + dma_offset, 0);
WREG32(mmDMA0_CORE_RD_MAX_SIZE + dma_offset, 0);
/* WA for H/W bug H3-2116 */
WREG32(mmDMA0_CORE_LBW_MAX_OUTSTAND + dma_offset, 15);
/* STOP_ON bit implies no completion to operation in case of RAZWI */
if (hdev->stop_on_err)
dma_err_cfg |= 1 << DMA0_CORE_ERR_CFG_STOP_ON_ERR_SHIFT;
@ -2321,7 +2336,8 @@ static void gaudi_init_tpc_qmans(struct hl_device *hdev)
tpc_offset += mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0;
gaudi->hw_cap_initialized |= 1 << (HW_CAP_TPC_SHIFT + tpc_id);
gaudi->hw_cap_initialized |=
FIELD_PREP(HW_CAP_TPC_MASK, 1 << tpc_id);
}
}
@ -2847,7 +2863,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
if (err) {
dev_err(hdev->dev,
"Failed to communicate with ARM CPU (ArmCP timeout)\n");
"Failed to communicate with Device CPU (CPU-CP timeout)\n");
return -EIO;
}
@ -2860,6 +2876,18 @@ static void gaudi_pre_hw_init(struct hl_device *hdev)
/* Perform read from the device to make sure device is up */
RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
/* Set the access through PCI bars (Linux driver only) as
* secured
*/
WREG32(mmPCIE_WRAP_LBW_PROT_OVR,
(PCIE_WRAP_LBW_PROT_OVR_RD_EN_MASK |
PCIE_WRAP_LBW_PROT_OVR_WR_EN_MASK));
/* Perform read to flush the waiting writes to ensure
* configuration was set in the device
*/
RREG32(mmPCIE_WRAP_LBW_PROT_OVR);
/*
* Let's mark in the H/W that we have reached this point. We check
* this value in the reset_before_init function to understand whether
@ -2868,31 +2896,6 @@ static void gaudi_pre_hw_init(struct hl_device *hdev)
*/
WREG32(mmHW_STATE, HL_DEVICE_HW_STATE_DIRTY);
/* Set the access through PCI bars (Linux driver only) as secured */
WREG32(mmPCIE_WRAP_LBW_PROT_OVR, (PCIE_WRAP_LBW_PROT_OVR_RD_EN_MASK |
PCIE_WRAP_LBW_PROT_OVR_WR_EN_MASK));
/* Perform read to flush the waiting writes to ensure configuration
* was set in the device
*/
RREG32(mmPCIE_WRAP_LBW_PROT_OVR);
if (hdev->axi_drain) {
WREG32(mmPCIE_WRAP_LBW_DRAIN_CFG,
1 << PCIE_WRAP_LBW_DRAIN_CFG_EN_SHIFT);
WREG32(mmPCIE_WRAP_HBW_DRAIN_CFG,
1 << PCIE_WRAP_HBW_DRAIN_CFG_EN_SHIFT);
/* Perform read to flush the DRAIN cfg */
RREG32(mmPCIE_WRAP_HBW_DRAIN_CFG);
} else {
WREG32(mmPCIE_WRAP_LBW_DRAIN_CFG, 0);
WREG32(mmPCIE_WRAP_HBW_DRAIN_CFG, 0);
/* Perform read to flush the DRAIN cfg */
RREG32(mmPCIE_WRAP_HBW_DRAIN_CFG);
}
/* Configure the reset registers. Must be done as early as possible
* in case we fail during H/W initialization
*/
@ -2900,13 +2903,13 @@ static void gaudi_pre_hw_init(struct hl_device *hdev)
(CFG_RST_H_DMA_MASK |
CFG_RST_H_MME_MASK |
CFG_RST_H_SM_MASK |
CFG_RST_H_TPC_MASK));
CFG_RST_H_TPC_7_MASK));
WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST_CFG_L, CFG_RST_L_TPC_MASK);
WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST_CFG_H,
(CFG_RST_H_HBM_MASK |
CFG_RST_H_TPC_MASK |
CFG_RST_H_TPC_7_MASK |
CFG_RST_H_NIC_MASK |
CFG_RST_H_SM_MASK |
CFG_RST_H_DMA_MASK |
@ -3071,7 +3074,7 @@ static int gaudi_suspend(struct hl_device *hdev)
{
int rc;
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
if (rc)
dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
@ -3084,17 +3087,16 @@ static int gaudi_resume(struct hl_device *hdev)
}
static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size)
void *cpu_addr, dma_addr_t dma_addr, size_t size)
{
int rc;
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
VM_DONTCOPY | VM_NORESERVE;
rc = remap_pfn_range(vma, vma->vm_start, paddress >> PAGE_SHIFT,
size, vma->vm_page_prot);
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
if (rc)
dev_err(hdev->dev, "remap_pfn_range error %d", rc);
dev_err(hdev->dev, "dma_mmap_coherent error %d", rc);
return rc;
}
@ -3441,7 +3443,8 @@ static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id)
&fence_dma_addr);
if (!fence_ptr) {
dev_err(hdev->dev,
"Failed to allocate memory for queue testing\n");
"Failed to allocate memory for H/W queue %d testing\n",
hw_queue_id);
return -ENOMEM;
}
@ -3452,14 +3455,16 @@ static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id)
GFP_KERNEL, &pkt_dma_addr);
if (!fence_pkt) {
dev_err(hdev->dev,
"Failed to allocate packet for queue testing\n");
"Failed to allocate packet for H/W queue %d testing\n",
hw_queue_id);
rc = -ENOMEM;
goto free_fence_ptr;
}
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_EB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(fence_val);
fence_pkt->addr = cpu_to_le64(fence_dma_addr);
@ -3469,7 +3474,8 @@ static int gaudi_test_queue(struct hl_device *hdev, u32 hw_queue_id)
pkt_dma_addr);
if (rc) {
dev_err(hdev->dev,
"Failed to send fence packet\n");
"Failed to send fence packet to H/W queue %d\n",
hw_queue_id);
goto free_pkt;
}
@ -3959,8 +3965,6 @@ static int gaudi_patch_dma_packet(struct hl_device *hdev,
}
}
new_dma_pkt->ctl = user_dma_pkt->ctl;
ctl = le32_to_cpu(user_dma_pkt->ctl);
if (likely(dma_desc_cnt))
ctl &= ~GAUDI_PKT_CTL_EB_MASK;
@ -4105,8 +4109,9 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev,
parser->patched_cb_size = parser->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
@ -4178,8 +4183,9 @@ static int gaudi_parse_cb_no_mmu(struct hl_device *hdev,
if (rc)
goto free_userptr;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate patched CB for DMA CS %d\n", rc);
@ -4275,11 +4281,11 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev,
cq_pkt = (struct packet_msg_prot *) (uintptr_t)
(kernel_address + len - (sizeof(struct packet_msg_prot) * 2));
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
if (eb)
tmp |= (1 << GAUDI_PKT_CTL_EB_SHIFT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
cq_pkt->ctl = cpu_to_le32(tmp);
cq_pkt->value = cpu_to_le32(cq_val);
@ -4287,8 +4293,8 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev,
cq_pkt++;
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
cq_pkt->ctl = cpu_to_le32(tmp);
cq_pkt->value = cpu_to_le32(1);
@ -4320,11 +4326,12 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
memset(lin_dma_pkt, 0, sizeof(*lin_dma_pkt));
cb_size = sizeof(*lin_dma_pkt);
ctl = ((PACKET_LIN_DMA << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_LIN_DMA_CTL_MEMSET_SHIFT) |
(1 << GAUDI_PKT_LIN_DMA_CTL_LIN_SHIFT) |
(1 << GAUDI_PKT_CTL_RB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT));
ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_MEMSET_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
lin_dma_pkt->ctl = cpu_to_le32(ctl);
lin_dma_pkt->src_addr = cpu_to_le64(val);
lin_dma_pkt->dst_addr |= cpu_to_le64(addr);
@ -4930,9 +4937,10 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address +
job->job_cb_size - sizeof(struct packet_msg_prot));
tmp = (PACKET_MSG_PROT << GAUDI_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI_PKT_CTL_EB_SHIFT) |
(1 << GAUDI_PKT_CTL_MB_SHIFT);
tmp = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_PROT);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(GAUDI_QMAN0_FENCE_VAL);
fence_pkt->addr = cpu_to_le64(fence_dma_addr);
@ -5606,7 +5614,7 @@ static bool gaudi_tpc_read_interrupts(struct hl_device *hdev, u8 tpc_id,
bool soft_reset_required = false;
/* Accessing the TPC_INTR_CAUSE registers requires disabling the clock
* gating, and thus cannot be done in ArmCP and should be done instead
* gating, and thus cannot be done in CPU-CP and should be done instead
* by the driver.
*/
@ -5653,21 +5661,25 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
{
switch (event_type) {
case GAUDI_EVENT_FIX_POWER_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GAUDI_EVENT_FIX_POWER_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
@ -6038,7 +6050,7 @@ static int gaudi_send_heartbeat(struct hl_device *hdev)
return hl_fw_send_heartbeat(hdev);
}
static int gaudi_armcp_info_get(struct hl_device *hdev)
static int gaudi_cpucp_info_get(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
struct asic_fixed_properties *prop = &hdev->asic_prop;
@ -6047,19 +6059,19 @@ static int gaudi_armcp_info_get(struct hl_device *hdev)
if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
return 0;
rc = hl_fw_armcp_info_get(hdev);
rc = hl_fw_cpucp_info_get(hdev);
if (rc)
return rc;
if (!strlen(prop->armcp_info.card_name))
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
if (!strlen(prop->cpucp_info.card_name))
strncpy(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
hdev->card_type = le32_to_cpu(hdev->asic_prop.armcp_info.card_type);
hdev->card_type = le32_to_cpu(hdev->asic_prop.cpucp_info.card_type);
if (hdev->card_type == armcp_card_type_pci)
if (hdev->card_type == cpucp_card_type_pci)
prop->max_power_default = MAX_POWER_DEFAULT_PCI;
else if (hdev->card_type == armcp_card_type_pmc)
else if (hdev->card_type == cpucp_card_type_pmc)
prop->max_power_default = MAX_POWER_DEFAULT_PMC;
hdev->max_power = prop->max_power_default;
@ -6067,7 +6079,7 @@ static int gaudi_armcp_info_get(struct hl_device *hdev)
return 0;
}
static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
static bool gaudi_is_device_idle(struct hl_device *hdev, u64 *mask,
struct seq_file *s)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@ -6099,7 +6111,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle <<
*mask |= ((u64) !is_eng_idle) <<
(GAUDI_ENGINE_ID_DMA_0 + dma_id);
if (s)
seq_printf(s, fmt, dma_id,
@ -6122,7 +6134,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GAUDI_ENGINE_ID_TPC_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GAUDI_ENGINE_ID_TPC_0 + i);
if (s)
seq_printf(s, fmt, i,
is_eng_idle ? "Y" : "N",
@ -6150,7 +6163,8 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GAUDI_ENGINE_ID_MME_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GAUDI_ENGINE_ID_MME_0 + i);
if (s) {
if (!is_slave)
seq_printf(s, fmt, i,
@ -6288,6 +6302,15 @@ static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
1000,
kernel_timeout);
if (rc) {
dev_err(hdev->dev,
"Timeout while waiting for TPC%d vector pipe\n",
tpc_id);
hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
return -EIO;
}
rc = hl_poll_timeout(
hdev,
mmTPC0_CFG_WQ_INFLIGHT_CNTR + offset,
@ -6617,7 +6640,6 @@ static const struct hl_asic_funcs gaudi_funcs = {
.send_cpu_message = gaudi_send_cpu_message,
.get_hw_state = gaudi_get_hw_state,
.pci_bars_map = gaudi_pci_bars_map,
.set_dram_bar_base = gaudi_set_hbm_bar_base,
.init_iatu = gaudi_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg,

View File

@ -35,8 +35,6 @@
#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
#endif
#define QMAN_FENCE_TIMEOUT_USEC 10000 /* 10 ms */
#define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */
#define GAUDI_MAX_CLK_FREQ 2200000000ull /* 2200 MHz */
@ -44,7 +42,7 @@
#define MAX_POWER_DEFAULT_PCI 200000 /* 200W */
#define MAX_POWER_DEFAULT_PMC 350000 /* 350W */
#define GAUDI_CPU_TIMEOUT_USEC 15000000 /* 15s */
#define GAUDI_CPU_TIMEOUT_USEC 30000000 /* 30s */
#define TPC_ENABLED_MASK 0xFF
@ -142,28 +140,28 @@
#define VA_HOST_SPACE_SIZE (VA_HOST_SPACE_END - \
VA_HOST_SPACE_START) /* 767TB */
#define HW_CAP_PLL 0x00000001
#define HW_CAP_HBM 0x00000002
#define HW_CAP_MMU 0x00000004
#define HW_CAP_MME 0x00000008
#define HW_CAP_CPU 0x00000010
#define HW_CAP_PCI_DMA 0x00000020
#define HW_CAP_MSI 0x00000040
#define HW_CAP_CPU_Q 0x00000080
#define HW_CAP_HBM_DMA 0x00000100
#define HW_CAP_CLK_GATE 0x00000200
#define HW_CAP_SRAM_SCRAMBLER 0x00000400
#define HW_CAP_HBM_SCRAMBLER 0x00000800
#define HW_CAP_PLL BIT(0)
#define HW_CAP_HBM BIT(1)
#define HW_CAP_MMU BIT(2)
#define HW_CAP_MME BIT(3)
#define HW_CAP_CPU BIT(4)
#define HW_CAP_PCI_DMA BIT(5)
#define HW_CAP_MSI BIT(6)
#define HW_CAP_CPU_Q BIT(7)
#define HW_CAP_HBM_DMA BIT(8)
#define HW_CAP_CLK_GATE BIT(9)
#define HW_CAP_SRAM_SCRAMBLER BIT(10)
#define HW_CAP_HBM_SCRAMBLER BIT(11)
#define HW_CAP_TPC0 0x01000000
#define HW_CAP_TPC1 0x02000000
#define HW_CAP_TPC2 0x04000000
#define HW_CAP_TPC3 0x08000000
#define HW_CAP_TPC4 0x10000000
#define HW_CAP_TPC5 0x20000000
#define HW_CAP_TPC6 0x40000000
#define HW_CAP_TPC7 0x80000000
#define HW_CAP_TPC_MASK 0xFF000000
#define HW_CAP_TPC0 BIT(24)
#define HW_CAP_TPC1 BIT(25)
#define HW_CAP_TPC2 BIT(26)
#define HW_CAP_TPC3 BIT(27)
#define HW_CAP_TPC4 BIT(28)
#define HW_CAP_TPC5 BIT(29)
#define HW_CAP_TPC6 BIT(30)
#define HW_CAP_TPC7 BIT(31)
#define HW_CAP_TPC_MASK GENMASK(31, 24)
#define HW_CAP_TPC_SHIFT 24
#define GAUDI_CPU_PCI_MSB_ADDR(addr) (((addr) & GENMASK_ULL(49, 39)) >> 39)
@ -216,7 +214,7 @@ struct gaudi_internal_qman_info {
/**
* struct gaudi_device - ASIC specific manage structure.
* @armcp_info_get: get information on device from ArmCP
* @cpucp_info_get: get information on device from CPU-CP
* @hw_queues_lock: protects the H/W queues from concurrent access.
* @clk_gate_mutex: protects code areas that require clock gating to be disabled
* temporarily
@ -239,7 +237,7 @@ struct gaudi_internal_qman_info {
* 8-bit value so use u8.
*/
struct gaudi_device {
int (*armcp_info_get)(struct hl_device *hdev);
int (*cpucp_info_get)(struct hl_device *hdev);
/* TODO: remove hw_queues_lock after moving to scheduler code */
spinlock_t hw_queues_lock;

File diff suppressed because it is too large Load Diff

View File

@ -426,12 +426,14 @@ int goya_get_fixed_properties(struct hl_device *hdev)
prop->dmmu.start_addr = VA_DDR_SPACE_START;
prop->dmmu.end_addr = VA_DDR_SPACE_END;
prop->dmmu.page_size = PAGE_SIZE_2MB;
prop->dmmu.num_hops = MMU_ARCH_5_HOPS;
/* shifts and masks are the same in PMMU and DMMU */
memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
prop->pmmu.start_addr = VA_HOST_SPACE_START;
prop->pmmu.end_addr = VA_HOST_SPACE_END;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@ -449,7 +451,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
strncpy(prop->cpucp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
prop->max_pending_cs = GOYA_MAX_PENDING_CS;
@ -598,10 +600,15 @@ static int goya_early_init(struct hl_device *hdev)
prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
rc = hl_pci_init(hdev);
rc = hl_pci_init(hdev, mmPSOC_GLOBAL_CONF_CPU_BOOT_STATUS,
mmCPU_BOOT_ERR0, GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
if (rc)
goto free_queue_props;
/* Goya Firmware does not support security */
prop->fw_security_disabled = true;
dev_info(hdev->dev, "firmware-level security is disabled\n");
if (!hdev->pldm) {
val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
if (val & PSOC_GLOBAL_CONF_BOOT_STRAP_PINS_SRIOV_EN_MASK)
@ -727,9 +734,9 @@ int goya_late_init(struct hl_device *hdev)
if (rc)
return rc;
rc = goya_armcp_info_get(hdev);
rc = goya_cpucp_info_get(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to get armcp info %d\n", rc);
dev_err(hdev->dev, "Failed to get cpucp info %d\n", rc);
return rc;
}
@ -739,7 +746,7 @@ int goya_late_init(struct hl_device *hdev)
*/
WREG32(mmMMU_LOG2_DDR_SIZE, ilog2(prop->dram_size));
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_ENABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS);
if (rc) {
dev_err(hdev->dev,
"Failed to enable PCI access from CPU %d\n", rc);
@ -2648,7 +2655,7 @@ int goya_suspend(struct hl_device *hdev)
{
int rc;
rc = hl_fw_send_pci_access_msg(hdev, ARMCP_PACKET_DISABLE_PCI_ACCESS);
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
if (rc)
dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
@ -2661,17 +2668,16 @@ int goya_resume(struct hl_device *hdev)
}
static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size)
void *cpu_addr, dma_addr_t dma_addr, size_t size)
{
int rc;
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |
VM_DONTCOPY | VM_NORESERVE;
rc = remap_pfn_range(vma, vma->vm_start, paddress >> PAGE_SHIFT,
size, vma->vm_page_prot);
rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size);
if (rc)
dev_err(hdev->dev, "remap_pfn_range error %d", rc);
dev_err(hdev->dev, "dma_mmap_coherent error %d", rc);
return rc;
}
@ -2946,7 +2952,8 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
&fence_dma_addr);
if (!fence_ptr) {
dev_err(hdev->dev,
"Failed to allocate memory for queue testing\n");
"Failed to allocate memory for H/W queue %d testing\n",
hw_queue_id);
return -ENOMEM;
}
@ -2957,7 +2964,8 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
GFP_KERNEL, &pkt_dma_addr);
if (!fence_pkt) {
dev_err(hdev->dev,
"Failed to allocate packet for queue testing\n");
"Failed to allocate packet for H/W queue %d testing\n",
hw_queue_id);
rc = -ENOMEM;
goto free_fence_ptr;
}
@ -2974,7 +2982,8 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
pkt_dma_addr);
if (rc) {
dev_err(hdev->dev,
"Failed to send fence packet\n");
"Failed to send fence packet to H/W queue %d\n",
hw_queue_id);
goto free_pkt;
}
@ -3806,8 +3815,9 @@ static int goya_parse_cb_mmu(struct hl_device *hdev,
parser->patched_cb_size = parser->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
@ -3879,8 +3889,9 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev,
if (rc)
goto free_userptr;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
&patched_cb_handle, HL_KERNEL_ASID_ID, false);
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx,
parser->patched_cb_size, false, false,
&patched_cb_handle);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate patched CB for DMA CS %d\n", rc);
@ -4497,17 +4508,17 @@ static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
size_t irq_arr_size)
{
struct armcp_unmask_irq_arr_packet *pkt;
struct cpucp_unmask_irq_arr_packet *pkt;
size_t total_pkt_size;
long result;
int rc;
int irq_num_entries, irq_arr_index;
__le32 *goya_irq_arr;
total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
irq_arr_size;
/* data should be aligned to 8 bytes in order to ArmCP to copy it */
/* data should be aligned to 8 bytes in order to CPU-CP to copy it */
total_pkt_size = (total_pkt_size + 0x7) & ~0x7;
/* total_pkt_size is casted to u16 later on */
@ -4531,8 +4542,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
goya_irq_arr[irq_arr_index] =
cpu_to_le32(irq_arr[irq_arr_index]);
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
total_pkt_size, 0, &result);
@ -4557,14 +4568,14 @@ static int goya_soft_reset_late_init(struct hl_device *hdev)
static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
{
struct armcp_packet pkt;
struct cpucp_packet pkt;
long result;
int rc;
memset(&pkt, 0, sizeof(pkt));
pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ <<
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(event_type);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@ -4580,18 +4591,22 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
{
switch (event_type) {
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
@ -4638,7 +4653,8 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
case GOYA_ASYNC_EVENT_ID_L2_RAM_ECC:
case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET:
goya_print_irq_info(hdev, event_type, false);
hl_device_reset(hdev, true, false);
if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
break;
case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
@ -5096,7 +5112,7 @@ int goya_send_heartbeat(struct hl_device *hdev)
return hl_fw_send_heartbeat(hdev);
}
int goya_armcp_info_get(struct hl_device *hdev)
int goya_cpucp_info_get(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
struct asic_fixed_properties *prop = &hdev->asic_prop;
@ -5106,11 +5122,11 @@ int goya_armcp_info_get(struct hl_device *hdev)
if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q))
return 0;
rc = hl_fw_armcp_info_get(hdev);
rc = hl_fw_cpucp_info_get(hdev);
if (rc)
return rc;
dram_size = le64_to_cpu(prop->armcp_info.dram_size);
dram_size = le64_to_cpu(prop->cpucp_info.dram_size);
if (dram_size) {
if ((!is_power_of_2(dram_size)) ||
(dram_size < DRAM_PHYS_DEFAULT_SIZE)) {
@ -5124,8 +5140,8 @@ int goya_armcp_info_get(struct hl_device *hdev)
prop->dram_end_address = prop->dram_base_address + dram_size;
}
if (!strlen(prop->armcp_info.card_name))
strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
if (!strlen(prop->cpucp_info.card_name))
strncpy(prop->cpucp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
return 0;
@ -5141,7 +5157,7 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
/* clock gating not supported in Goya */
}
static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
static bool goya_is_device_idle(struct hl_device *hdev, u64 *mask,
struct seq_file *s)
{
const char *fmt = "%-5d%-9s%#-14x%#-16x%#x\n";
@ -5166,7 +5182,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GOYA_ENGINE_ID_DMA_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GOYA_ENGINE_ID_DMA_0 + i);
if (s)
seq_printf(s, dma_fmt, i, is_eng_idle ? "Y" : "N",
qm_glbl_sts0, dma_core_sts0);
@ -5189,7 +5206,8 @@ static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << (GOYA_ENGINE_ID_TPC_0 + i);
*mask |= ((u64) !is_eng_idle) <<
(GOYA_ENGINE_ID_TPC_0 + i);
if (s)
seq_printf(s, fmt, i, is_eng_idle ? "Y" : "N",
qm_glbl_sts0, cmdq_glbl_sts0, tpc_cfg_sts);
@ -5209,7 +5227,7 @@ static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
is_idle &= is_eng_idle;
if (mask)
*mask |= !is_eng_idle << GOYA_ENGINE_ID_MME_0;
*mask |= ((u64) !is_eng_idle) << GOYA_ENGINE_ID_MME_0;
if (s) {
seq_printf(s, fmt, 0, is_eng_idle ? "Y" : "N", qm_glbl_sts0,
cmdq_glbl_sts0, mme_arch_sts);
@ -5369,7 +5387,6 @@ static const struct hl_asic_funcs goya_funcs = {
.send_cpu_message = goya_send_cpu_message,
.get_hw_state = goya_get_hw_state,
.pci_bars_map = goya_pci_bars_map,
.set_dram_bar_base = goya_set_ddr_bar_base,
.init_iatu = goya_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg,

View File

@ -207,7 +207,7 @@ void goya_set_max_power(struct hl_device *hdev, u64 value);
void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq);
void goya_add_device_attr(struct hl_device *hdev,
struct attribute_group *dev_attr_grp);
int goya_armcp_info_get(struct hl_device *hdev);
int goya_cpucp_info_get(struct hl_device *hdev);
int goya_debug_coresight(struct hl_device *hdev, void *data);
void goya_halt_coresight(struct hl_device *hdev);

View File

@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0
*
* Copyright 2016-2020 HabanaLabs, Ltd.
* Copyright 2020 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
#ifndef ARMCP_IF_H
#define ARMCP_IF_H
#ifndef CPUCP_IF_H
#define CPUCP_IF_H
#include <linux/types.h>
@ -50,16 +50,16 @@ enum pq_init_status {
};
/*
* ArmCP Primary Queue Packets
* CpuCP Primary Queue Packets
*
* During normal operation, the host's kernel driver needs to send various
* messages to ArmCP, usually either to SET some value into a H/W periphery or
* messages to CpuCP, usually either to SET some value into a H/W periphery or
* to GET the current value of some H/W periphery. For example, SET the
* frequency of MME/TPC and GET the value of the thermal sensor.
*
* These messages can be initiated either by the User application or by the
* host's driver itself, e.g. power management code. In either case, the
* communication from the host's driver to ArmCP will *always* be in
* communication from the host's driver to CpuCP will *always* be in
* synchronous mode, meaning that the host will send a single message and poll
* until the message was acknowledged and the results are ready (if results are
* needed).
@ -73,21 +73,20 @@ enum pq_init_status {
*
* The message, inputs/outputs (if relevant) and fence object will be located
* on the device DDR at an address that will be determined by the host's driver.
* During device initialization phase, the host will pass to ArmCP that address.
* During device initialization phase, the host will pass to CpuCP that address.
* Most of the message types will contain inputs/outputs inside the message
* itself. The common part of each message will contain the opcode of the
* message (its type) and a field representing a fence object.
*
* When the host's driver wishes to send a message to ArmCP, it will write the
* message contents to the device DDR, clear the fence object and then write the
* value 484 to the mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR register to issue
* the 484 interrupt-id to the ARM core.
* When the host's driver wishes to send a message to CPU CP, it will write the
* message contents to the device DDR, clear the fence object and then write to
* the PSOC_ARC1_AUX_SW_INTR, to issue interrupt 121 to ARC Management CPU.
*
* Upon receiving the 484 interrupt-id, ArmCP will read the message from the
* DDR. In case the message is a SET operation, ArmCP will first perform the
* Upon receiving the interrupt (#121), CpuCP will read the message from the
* DDR. In case the message is a SET operation, CpuCP will first perform the
* operation and then write to the fence object on the device DDR. In case the
* message is a GET operation, ArmCP will first fill the results section on the
* device DDR and then write to the fence object. If an error occurred, ArmCP
* message is a GET operation, CpuCP will first fill the results section on the
* device DDR and then write to the fence object. If an error occurred, CpuCP
* will fill the rc field with the right error code.
*
* In the meantime, the host's driver will poll on the fence object. Once the
@ -96,164 +95,174 @@ enum pq_init_status {
* driver.
*
* To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
* so the value being put by the host's driver matches the value read by ArmCP
* so the value being put by the host's driver matches the value read by CpuCP
*
* Non-QMAN packets should be limited to values 1 through (2^8 - 1)
*
* Detailed description:
*
* ARMCP_PACKET_DISABLE_PCI_ACCESS -
* CPUCP_PACKET_DISABLE_PCI_ACCESS -
* After receiving this packet the embedded CPU must NOT issue PCI
* transactions (read/write) towards the Host CPU. This also include
* sending MSI-X interrupts.
* This packet is usually sent before the device is moved to D3Hot state.
*
* ARMCP_PACKET_ENABLE_PCI_ACCESS -
* CPUCP_PACKET_ENABLE_PCI_ACCESS -
* After receiving this packet the embedded CPU is allowed to issue PCI
* transactions towards the Host CPU, including sending MSI-X interrupts.
* This packet is usually send after the device is moved to D0 state.
*
* ARMCP_PACKET_TEMPERATURE_GET -
* CPUCP_PACKET_TEMPERATURE_GET -
* Fetch the current temperature / Max / Max Hyst / Critical /
* Critical Hyst of a specified thermal sensor. The packet's
* arguments specify the desired sensor and the field to get.
*
* ARMCP_PACKET_VOLTAGE_GET -
* CPUCP_PACKET_VOLTAGE_GET -
* Fetch the voltage / Max / Min of a specified sensor. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_CURRENT_GET -
* CPUCP_PACKET_CURRENT_GET -
* Fetch the current / Max / Min of a specified sensor. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_FAN_SPEED_GET -
* CPUCP_PACKET_FAN_SPEED_GET -
* Fetch the speed / Max / Min of a specified fan. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_PWM_GET -
* CPUCP_PACKET_PWM_GET -
* Fetch the pwm value / mode of a specified pwm. The packet's
* arguments specify the sensor and type.
*
* ARMCP_PACKET_PWM_SET -
* CPUCP_PACKET_PWM_SET -
* Set the pwm value / mode of a specified pwm. The packet's
* arguments specify the sensor, type and value.
*
* ARMCP_PACKET_FREQUENCY_SET -
* CPUCP_PACKET_FREQUENCY_SET -
* Set the frequency of a specified PLL. The packet's arguments specify
* the PLL and the desired frequency. The actual frequency in the device
* might differ from the requested frequency.
*
* ARMCP_PACKET_FREQUENCY_GET -
* CPUCP_PACKET_FREQUENCY_GET -
* Fetch the frequency of a specified PLL. The packet's arguments specify
* the PLL.
*
* ARMCP_PACKET_LED_SET -
* CPUCP_PACKET_LED_SET -
* Set the state of a specified led. The packet's arguments
* specify the led and the desired state.
*
* ARMCP_PACKET_I2C_WR -
* CPUCP_PACKET_I2C_WR -
* Write 32-bit value to I2C device. The packet's arguments specify the
* I2C bus, address and value.
*
* ARMCP_PACKET_I2C_RD -
* CPUCP_PACKET_I2C_RD -
* Read 32-bit value from I2C device. The packet's arguments specify the
* I2C bus and address.
*
* ARMCP_PACKET_INFO_GET -
* CPUCP_PACKET_INFO_GET -
* Fetch information from the device as specified in the packet's
* structure. The host's driver passes the max size it allows the ArmCP to
* structure. The host's driver passes the max size it allows the CpuCP to
* write to the structure, to prevent data corruption in case of
* mismatched driver/FW versions.
*
* ARMCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
* CPUCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
*
* ARMCP_PACKET_UNMASK_RAZWI_IRQ -
* CPUCP_PACKET_UNMASK_RAZWI_IRQ -
* Unmask the given IRQ. The IRQ number is specified in the value field.
* The packet is sent after receiving an interrupt and printing its
* relevant information.
*
* ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
* CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
* Unmask the given IRQs. The IRQs numbers are specified in an array right
* after the armcp_packet structure, where its first element is the array
* after the cpucp_packet structure, where its first element is the array
* length. The packet is sent after a soft reset was done in order to
* handle any interrupts that were sent during the reset process.
*
* ARMCP_PACKET_TEST -
* Test packet for ArmCP connectivity. The CPU will put the fence value
* CPUCP_PACKET_TEST -
* Test packet for CpuCP connectivity. The CPU will put the fence value
* in the result field.
*
* ARMCP_PACKET_FREQUENCY_CURR_GET -
* CPUCP_PACKET_FREQUENCY_CURR_GET -
* Fetch the current frequency of a specified PLL. The packet's arguments
* specify the PLL.
*
* ARMCP_PACKET_MAX_POWER_GET -
* CPUCP_PACKET_MAX_POWER_GET -
* Fetch the maximal power of the device.
*
* ARMCP_PACKET_MAX_POWER_SET -
* CPUCP_PACKET_MAX_POWER_SET -
* Set the maximal power of the device. The packet's arguments specify
* the power.
*
* ARMCP_PACKET_EEPROM_DATA_GET -
* Get EEPROM data from the ArmCP kernel. The buffer is specified in the
* CPUCP_PACKET_EEPROM_DATA_GET -
* Get EEPROM data from the CpuCP kernel. The buffer is specified in the
* addr field. The CPU will put the returned data size in the result
* field. In addition, the host's driver passes the max size it allows the
* ArmCP to write to the structure, to prevent data corruption in case of
* CpuCP to write to the structure, to prevent data corruption in case of
* mismatched driver/FW versions.
*
* ARMCP_PACKET_TEMPERATURE_SET -
* CPUCP_PACKET_TEMPERATURE_SET -
* Set the value of the offset property of a specified thermal sensor.
* The packet's arguments specify the desired sensor and the field to
* set.
*
* ARMCP_PACKET_VOLTAGE_SET -
* CPUCP_PACKET_VOLTAGE_SET -
* Trigger the reset_history property of a specified voltage sensor.
* The packet's arguments specify the desired sensor and the field to
* set.
*
* ARMCP_PACKET_CURRENT_SET -
* CPUCP_PACKET_CURRENT_SET -
* Trigger the reset_history property of a specified current sensor.
* The packet's arguments specify the desired sensor and the field to
* set.
*
* CPUCP_PACKET_PLL_REG_GET
* Fetch register of PLL from the required PLL IP.
* The packet's arguments specify the PLL IP and the register to get.
* Each register is 32-bit value which is returned in result field.
*
*/
enum armcp_packet_id {
ARMCP_PACKET_DISABLE_PCI_ACCESS = 1, /* internal */
ARMCP_PACKET_ENABLE_PCI_ACCESS, /* internal */
ARMCP_PACKET_TEMPERATURE_GET, /* sysfs */
ARMCP_PACKET_VOLTAGE_GET, /* sysfs */
ARMCP_PACKET_CURRENT_GET, /* sysfs */
ARMCP_PACKET_FAN_SPEED_GET, /* sysfs */
ARMCP_PACKET_PWM_GET, /* sysfs */
ARMCP_PACKET_PWM_SET, /* sysfs */
ARMCP_PACKET_FREQUENCY_SET, /* sysfs */
ARMCP_PACKET_FREQUENCY_GET, /* sysfs */
ARMCP_PACKET_LED_SET, /* debugfs */
ARMCP_PACKET_I2C_WR, /* debugfs */
ARMCP_PACKET_I2C_RD, /* debugfs */
ARMCP_PACKET_INFO_GET, /* IOCTL */
ARMCP_PACKET_FLASH_PROGRAM_REMOVED,
ARMCP_PACKET_UNMASK_RAZWI_IRQ, /* internal */
ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY, /* internal */
ARMCP_PACKET_TEST, /* internal */
ARMCP_PACKET_FREQUENCY_CURR_GET, /* sysfs */
ARMCP_PACKET_MAX_POWER_GET, /* sysfs */
ARMCP_PACKET_MAX_POWER_SET, /* sysfs */
ARMCP_PACKET_EEPROM_DATA_GET, /* sysfs */
ARMCP_RESERVED,
ARMCP_PACKET_TEMPERATURE_SET, /* sysfs */
ARMCP_PACKET_VOLTAGE_SET, /* sysfs */
ARMCP_PACKET_CURRENT_SET, /* sysfs */
enum cpucp_packet_id {
CPUCP_PACKET_DISABLE_PCI_ACCESS = 1, /* internal */
CPUCP_PACKET_ENABLE_PCI_ACCESS, /* internal */
CPUCP_PACKET_TEMPERATURE_GET, /* sysfs */
CPUCP_PACKET_VOLTAGE_GET, /* sysfs */
CPUCP_PACKET_CURRENT_GET, /* sysfs */
CPUCP_PACKET_FAN_SPEED_GET, /* sysfs */
CPUCP_PACKET_PWM_GET, /* sysfs */
CPUCP_PACKET_PWM_SET, /* sysfs */
CPUCP_PACKET_FREQUENCY_SET, /* sysfs */
CPUCP_PACKET_FREQUENCY_GET, /* sysfs */
CPUCP_PACKET_LED_SET, /* debugfs */
CPUCP_PACKET_I2C_WR, /* debugfs */
CPUCP_PACKET_I2C_RD, /* debugfs */
CPUCP_PACKET_INFO_GET, /* IOCTL */
CPUCP_PACKET_FLASH_PROGRAM_REMOVED,
CPUCP_PACKET_UNMASK_RAZWI_IRQ, /* internal */
CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY, /* internal */
CPUCP_PACKET_TEST, /* internal */
CPUCP_PACKET_FREQUENCY_CURR_GET, /* sysfs */
CPUCP_PACKET_MAX_POWER_GET, /* sysfs */
CPUCP_PACKET_MAX_POWER_SET, /* sysfs */
CPUCP_PACKET_EEPROM_DATA_GET, /* sysfs */
CPUCP_RESERVED,
CPUCP_PACKET_TEMPERATURE_SET, /* sysfs */
CPUCP_PACKET_VOLTAGE_SET, /* sysfs */
CPUCP_PACKET_CURRENT_SET, /* sysfs */
CPUCP_PACKET_PCIE_THROUGHPUT_GET, /* internal */
CPUCP_PACKET_PCIE_REPLAY_CNT_GET, /* internal */
CPUCP_PACKET_TOTAL_ENERGY_GET, /* internal */
CPUCP_PACKET_PLL_REG_GET, /* internal */
};
#define ARMCP_PACKET_FENCE_VAL 0xFE8CE7A5
#define CPUCP_PACKET_FENCE_VAL 0xFE8CE7A5
#define ARMCP_PKT_CTL_RC_SHIFT 12
#define ARMCP_PKT_CTL_RC_MASK 0x0000F000
#define CPUCP_PKT_CTL_RC_SHIFT 12
#define CPUCP_PKT_CTL_RC_MASK 0x0000F000
#define ARMCP_PKT_CTL_OPCODE_SHIFT 16
#define ARMCP_PKT_CTL_OPCODE_MASK 0x1FFF0000
#define CPUCP_PKT_CTL_OPCODE_SHIFT 16
#define CPUCP_PKT_CTL_OPCODE_MASK 0x1FFF0000
struct armcp_packet {
struct cpucp_packet {
union {
__le64 value; /* For SET packets */
__le64 result; /* For GET packets */
@ -277,71 +286,97 @@ struct armcp_packet {
__u8 pad; /* unused */
};
struct {/* For PLL register fetch */
__le16 pll_type;
__le16 pll_reg;
};
/* For any general request */
__le32 index;
/* For frequency get/set */
__le32 pll_index;
/* For led set */
__le32 led_index;
/* For get Armcp info/EEPROM data */
/* For get CpuCP info/EEPROM data */
__le32 data_max_size;
};
__le32 reserved;
};
struct armcp_unmask_irq_arr_packet {
struct armcp_packet armcp_pkt;
struct cpucp_unmask_irq_arr_packet {
struct cpucp_packet cpucp_pkt;
__le32 length;
__le32 irqs[0];
};
enum armcp_packet_rc {
armcp_packet_success,
armcp_packet_invalid,
armcp_packet_fault
enum cpucp_packet_rc {
cpucp_packet_success,
cpucp_packet_invalid,
cpucp_packet_fault
};
/*
* armcp_temp_type should adhere to hwmon_temp_attributes
* cpucp_temp_type should adhere to hwmon_temp_attributes
* defined in Linux kernel hwmon.h file
*/
enum armcp_temp_type {
armcp_temp_input,
armcp_temp_max = 6,
armcp_temp_max_hyst,
armcp_temp_crit,
armcp_temp_crit_hyst,
armcp_temp_offset = 19,
armcp_temp_highest = 22,
armcp_temp_reset_history = 23
enum cpucp_temp_type {
cpucp_temp_input,
cpucp_temp_max = 6,
cpucp_temp_max_hyst,
cpucp_temp_crit,
cpucp_temp_crit_hyst,
cpucp_temp_offset = 19,
cpucp_temp_highest = 22,
cpucp_temp_reset_history = 23
};
enum armcp_in_attributes {
armcp_in_input,
armcp_in_min,
armcp_in_max,
armcp_in_highest = 7,
armcp_in_reset_history
enum cpucp_in_attributes {
cpucp_in_input,
cpucp_in_min,
cpucp_in_max,
cpucp_in_highest = 7,
cpucp_in_reset_history
};
enum armcp_curr_attributes {
armcp_curr_input,
armcp_curr_min,
armcp_curr_max,
armcp_curr_highest = 7,
armcp_curr_reset_history
enum cpucp_curr_attributes {
cpucp_curr_input,
cpucp_curr_min,
cpucp_curr_max,
cpucp_curr_highest = 7,
cpucp_curr_reset_history
};
enum armcp_fan_attributes {
armcp_fan_input,
armcp_fan_min = 2,
armcp_fan_max
enum cpucp_fan_attributes {
cpucp_fan_input,
cpucp_fan_min = 2,
cpucp_fan_max
};
enum armcp_pwm_attributes {
armcp_pwm_input,
armcp_pwm_enable
enum cpucp_pwm_attributes {
cpucp_pwm_input,
cpucp_pwm_enable
};
enum cpucp_pcie_throughput_attributes {
cpucp_pcie_throughput_tx,
cpucp_pcie_throughput_rx
};
enum cpucp_pll_reg_attributes {
cpucp_pll_nr_reg,
cpucp_pll_nf_reg,
cpucp_pll_od_reg,
cpucp_pll_div_factor_reg,
cpucp_pll_div_sel_reg
};
enum cpucp_pll_type_attributes {
cpucp_pll_cpu,
cpucp_pll_pci,
};
/* Event Queue Packets */
@ -351,32 +386,32 @@ struct eq_generic_event {
};
/*
* ArmCP info
* CpuCP info
*/
#define CARD_NAME_MAX_LEN 16
#define VERSION_MAX_LEN 128
#define ARMCP_MAX_SENSORS 128
#define CPUCP_MAX_SENSORS 128
struct armcp_sensor {
struct cpucp_sensor {
__le32 type;
__le32 flags;
};
/**
* struct armcp_card_types - ASIC card type.
* @armcp_card_type_pci: PCI card.
* @armcp_card_type_pmc: PCI Mezzanine Card.
* struct cpucp_card_types - ASIC card type.
* @cpucp_card_type_pci: PCI card.
* @cpucp_card_type_pmc: PCI Mezzanine Card.
*/
enum armcp_card_types {
armcp_card_type_pci,
armcp_card_type_pmc
enum cpucp_card_types {
cpucp_card_type_pci,
cpucp_card_type_pmc
};
/**
* struct armcp_info - Info from ArmCP that is necessary to the host's driver
* struct cpucp_info - Info from CpuCP that is necessary to the host's driver
* @sensors: available sensors description.
* @kernel_version: ArmCP linux kernel version.
* @kernel_version: CpuCP linux kernel version.
* @reserved: reserved field.
* @card_type: card configuration type.
* @card_location: in a server, each card has different connections topology
@ -385,12 +420,12 @@ enum armcp_card_types {
* @infineon_version: Infineon main DC-DC version.
* @fuse_version: silicon production FUSE information.
* @thermal_version: thermald S/W version.
* @armcp_version: ArmCP S/W version.
* @cpucp_version: CpuCP S/W version.
* @dram_size: available DRAM size.
* @card_name: card name that will be displayed in HWMON subsystem on the host
*/
struct armcp_info {
struct armcp_sensor sensors[ARMCP_MAX_SENSORS];
struct cpucp_info {
struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
__u8 kernel_version[VERSION_MAX_LEN];
__le32 reserved;
__le32 card_type;
@ -399,9 +434,10 @@ struct armcp_info {
__le32 infineon_version;
__u8 fuse_version[VERSION_MAX_LEN];
__u8 thermal_version[VERSION_MAX_LEN];
__u8 armcp_version[VERSION_MAX_LEN];
__u8 cpucp_version[VERSION_MAX_LEN];
__le32 reserved2;
__le64 dram_size;
char card_name[CARD_NAME_MAX_LEN];
};
#endif /* ARMCP_IF_H */
#endif /* CPUCP_IF_H */

View File

@ -40,7 +40,7 @@ struct hl_bd {
*/
#define BD_CTL_COMP_OFFSET_SHIFT 16
#define BD_CTL_COMP_OFFSET_MASK 0x00FF0000
#define BD_CTL_COMP_OFFSET_MASK 0x0FFF0000
#define BD_CTL_COMP_DATA_SHIFT 0
#define BD_CTL_COMP_DATA_MASK 0x0000FFFF

View File

@ -44,6 +44,8 @@
#define MME_NUMBER_OF_MASTER_ENGINES 2
#define MME_NUMBER_OF_SLAVE_ENGINES 2
#define TPC_NUMBER_OF_ENGINES 8
#define DMA_NUMBER_OF_CHANNELS 8

View File

@ -12,191 +12,160 @@
/* Useful masks for bits in various registers */
#define PCI_DMA_QMAN_ENABLE (\
(0xF << DMA0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CQF_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CP_EN_MASK, 0xF)))
#define QMAN_EXTERNAL_MAKE_TRUSTED (\
(0xF << DMA0_QM_GLBL_PROT_PQF_SHIFT) | \
(0xF << DMA0_QM_GLBL_PROT_CQF_SHIFT) | \
(0xF << DMA0_QM_GLBL_PROT_CP_SHIFT) | \
(0x1 << DMA0_QM_GLBL_PROT_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_PROT_PQF_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_CQF_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_CP_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_ERR_MASK, 0x1)))
#define QMAN_INTERNAL_MAKE_TRUSTED (\
(0xF << DMA0_QM_GLBL_PROT_PQF_SHIFT) | \
(0x1 << DMA0_QM_GLBL_PROT_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_PROT_PQF_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_PROT_ERR_MASK, 0x1)))
#define HBM_DMA_QMAN_ENABLE (\
(0xF << DMA0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CQF_EN_MASK, 0x1F)) | \
(FIELD_PREP(DMA0_QM_GLBL_CFG0_CP_EN_MASK, 0x1F)))
#define QMAN_MME_ENABLE (\
(0xF << MME0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(MME0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_CFG0_CQF_EN_MASK, 0x1F)) | \
(FIELD_PREP(MME0_QM_GLBL_CFG0_CP_EN_MASK, 0x1F)))
#define QMAN_TPC_ENABLE (\
(0xF << TPC0_QM_GLBL_CFG0_PQF_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_CFG0_CQF_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_CFG0_CP_EN_SHIFT))
(FIELD_PREP(TPC0_QM_GLBL_CFG0_PQF_EN_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_CFG0_CQF_EN_MASK, 0x1F)) | \
(FIELD_PREP(TPC0_QM_GLBL_CFG0_CP_EN_MASK, 0x1F)))
#define QMAN_UPPER_CP_CGM_PWR_GATE_EN (\
(0x20 << DMA0_QM_CGM_CFG_IDLE_TH_SHIFT) | \
(0xA << DMA0_QM_CGM_CFG_G2F_TH_SHIFT) | \
(0x10 << DMA0_QM_CGM_CFG_CP_IDLE_MASK_SHIFT) | \
(1 << DMA0_QM_CGM_CFG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_CGM_CFG_IDLE_TH_MASK, 0x20)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_G2F_TH_MASK, 0xA)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_CP_IDLE_MASK_MASK, 0x10)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_EN_MASK, 0x1)))
#define QMAN_COMMON_CP_CGM_PWR_GATE_EN (\
(0x20 << DMA0_QM_CGM_CFG_IDLE_TH_SHIFT) | \
(0xA << DMA0_QM_CGM_CFG_G2F_TH_SHIFT) | \
(0xF << DMA0_QM_CGM_CFG_CP_IDLE_MASK_SHIFT) | \
(1 << DMA0_QM_CGM_CFG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_CGM_CFG_IDLE_TH_MASK, 0x20)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_G2F_TH_MASK, 0xA)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_CP_IDLE_MASK_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_CGM_CFG_EN_MASK, 0x1)))
#define PCI_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0xF)))
#define PCI_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0xF << DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0xF)))
#define HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0x1F)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0x1F)))
#define HBM_DMA_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0x1F << DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
(FIELD_PREP(DMA0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
#define TPC_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << TPC0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0x1F)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0x1F)))
#define TPC_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << TPC0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0x1F << TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
(FIELD_PREP(TPC0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
#define MME_QMAN_GLBL_ERR_CFG_MSG_EN_MASK (\
(0xF << MME0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_SHIFT))
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_ERR_MSG_EN_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CQF_ERR_MSG_EN_MASK, 0x1F)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_ERR_MSG_EN_MASK, 0x1F)))
#define MME_QMAN_GLBL_ERR_CFG_STOP_ON_ERR_EN_MASK (\
(0xF << MME0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_SHIFT) | \
(0x1F << MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_SHIFT))
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_PQF_STOP_ON_ERR_MASK, 0xF)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CQF_STOP_ON_ERR_MASK, 0x1F)) | \
(FIELD_PREP(MME0_QM_GLBL_ERR_CFG_CP_STOP_ON_ERR_MASK, 0x1F)))
#define QMAN_CGM1_PWR_GATE_EN (0xA << DMA0_QM_CGM_CFG1_MASK_TH_SHIFT)
#define QMAN_CGM1_PWR_GATE_EN (FIELD_PREP(DMA0_QM_CGM_CFG1_MASK_TH_MASK, 0xA))
/* RESET registers configuration */
#define CFG_RST_L_PSOC_SHIFT 0
#define CFG_RST_L_PCIE_SHIFT 1
#define CFG_RST_L_PCIE_IF_SHIFT 2
#define CFG_RST_L_HBM_S_PLL_SHIFT 3
#define CFG_RST_L_TPC_S_PLL_SHIFT 4
#define CFG_RST_L_MME_S_PLL_SHIFT 5
#define CFG_RST_L_CPU_PLL_SHIFT 6
#define CFG_RST_L_PCIE_PLL_SHIFT 7
#define CFG_RST_L_NIC_S_PLL_SHIFT 8
#define CFG_RST_L_HBM_N_PLL_SHIFT 9
#define CFG_RST_L_TPC_N_PLL_SHIFT 10
#define CFG_RST_L_MME_N_PLL_SHIFT 11
#define CFG_RST_L_NIC_N_PLL_SHIFT 12
#define CFG_RST_L_DMA_W_PLL_SHIFT 13
#define CFG_RST_L_SIF_W_PLL_SHIFT 14
#define CFG_RST_L_MESH_W_PLL_SHIFT 15
#define CFG_RST_L_SRAM_W_PLL_SHIFT 16
#define CFG_RST_L_DMA_E_PLL_SHIFT 17
#define CFG_RST_L_SIF_E_PLL_SHIFT 18
#define CFG_RST_L_MESH_E_PLL_SHIFT 19
#define CFG_RST_L_SRAM_E_PLL_SHIFT 20
#define CFG_RST_L_IF_1_SHIFT 21
#define CFG_RST_L_IF_0_SHIFT 22
#define CFG_RST_L_IF_2_SHIFT 23
#define CFG_RST_L_IF_3_SHIFT 24
#define CFG_RST_L_TPC_0_SHIFT 25
#define CFG_RST_L_TPC_1_SHIFT 26
#define CFG_RST_L_TPC_2_SHIFT 27
#define CFG_RST_L_TPC_3_SHIFT 28
#define CFG_RST_L_TPC_4_SHIFT 29
#define CFG_RST_L_TPC_5_SHIFT 30
#define CFG_RST_L_TPC_6_SHIFT 31
#define CFG_RST_H_TPC_7_SHIFT 0
#define CFG_RST_H_MME_0_SHIFT 1
#define CFG_RST_H_MME_1_SHIFT 2
#define CFG_RST_H_MME_2_SHIFT 3
#define CFG_RST_H_MME_3_SHIFT 4
#define CFG_RST_H_HBM_0_SHIFT 5
#define CFG_RST_H_HBM_1_SHIFT 6
#define CFG_RST_H_HBM_2_SHIFT 7
#define CFG_RST_H_HBM_3_SHIFT 8
#define CFG_RST_H_NIC_0_SHIFT 9
#define CFG_RST_H_NIC_1_SHIFT 10
#define CFG_RST_H_NIC_2_SHIFT 11
#define CFG_RST_H_NIC_3_SHIFT 12
#define CFG_RST_H_NIC_4_SHIFT 13
#define CFG_RST_H_SM_0_SHIFT 14
#define CFG_RST_H_SM_1_SHIFT 15
#define CFG_RST_H_SM_2_SHIFT 16
#define CFG_RST_H_SM_3_SHIFT 17
#define CFG_RST_H_DMA_0_SHIFT 18
#define CFG_RST_H_DMA_1_SHIFT 19
#define CFG_RST_H_CPU_SHIFT 20
#define CFG_RST_H_MMU_SHIFT 21
#define CFG_RST_L_PSOC_MASK BIT_MASK(0)
#define CFG_RST_L_PCIE_MASK BIT_MASK(1)
#define CFG_RST_L_PCIE_IF_MASK BIT_MASK(2)
#define CFG_RST_L_HBM_S_PLL_MASK BIT_MASK(3)
#define CFG_RST_L_TPC_S_PLL_MASK BIT_MASK(4)
#define CFG_RST_L_MME_S_PLL_MASK BIT_MASK(5)
#define CFG_RST_L_CPU_PLL_MASK BIT_MASK(6)
#define CFG_RST_L_PCIE_PLL_MASK BIT_MASK(7)
#define CFG_RST_L_NIC_S_PLL_MASK BIT_MASK(8)
#define CFG_RST_L_HBM_N_PLL_MASK BIT_MASK(9)
#define CFG_RST_L_TPC_N_PLL_MASK BIT_MASK(10)
#define CFG_RST_L_MME_N_PLL_MASK BIT_MASK(11)
#define CFG_RST_L_NIC_N_PLL_MASK BIT_MASK(12)
#define CFG_RST_L_DMA_W_PLL_MASK BIT_MASK(13)
#define CFG_RST_L_SIF_W_PLL_MASK BIT_MASK(14)
#define CFG_RST_L_MESH_W_PLL_MASK BIT_MASK(15)
#define CFG_RST_L_SRAM_W_PLL_MASK BIT_MASK(16)
#define CFG_RST_L_DMA_E_PLL_MASK BIT_MASK(17)
#define CFG_RST_L_SIF_E_PLL_MASK BIT_MASK(18)
#define CFG_RST_L_MESH_E_PLL_MASK BIT_MASK(19)
#define CFG_RST_L_SRAM_E_PLL_MASK BIT_MASK(20)
#define CFG_RST_L_IF_1_MASK BIT_MASK(21)
#define CFG_RST_L_IF_0_MASK BIT_MASK(22)
#define CFG_RST_L_IF_2_MASK BIT_MASK(23)
#define CFG_RST_L_IF_3_MASK BIT_MASK(24)
#define CFG_RST_L_IF_MASK GENMASK(24, 21)
#define CFG_RST_H_DMA_MASK ((1 << CFG_RST_H_DMA_0_SHIFT) | \
(1 << CFG_RST_H_DMA_1_SHIFT))
#define CFG_RST_L_TPC_0_MASK BIT_MASK(25)
#define CFG_RST_L_TPC_1_MASK BIT_MASK(26)
#define CFG_RST_L_TPC_2_MASK BIT_MASK(27)
#define CFG_RST_L_TPC_3_MASK BIT_MASK(28)
#define CFG_RST_L_TPC_4_MASK BIT_MASK(29)
#define CFG_RST_L_TPC_5_MASK BIT_MASK(30)
#define CFG_RST_L_TPC_6_MASK BIT_MASK(31)
#define CFG_RST_L_TPC_MASK GENMASK(31, 25)
#define CFG_RST_H_CPU_MASK (1 << CFG_RST_H_CPU_SHIFT)
#define CFG_RST_H_MMU_MASK (1 << CFG_RST_H_MMU_SHIFT)
#define CFG_RST_H_TPC_7_MASK BIT_MASK(0)
#define CFG_RST_H_HBM_MASK ((1 << CFG_RST_H_HBM_0_SHIFT) | \
(1 << CFG_RST_H_HBM_1_SHIFT) | \
(1 << CFG_RST_H_HBM_2_SHIFT) | \
(1 << CFG_RST_H_HBM_3_SHIFT))
#define CFG_RST_H_MME_0_MASK BIT_MASK(1)
#define CFG_RST_H_MME_1_MASK BIT_MASK(2)
#define CFG_RST_H_MME_2_MASK BIT_MASK(3)
#define CFG_RST_H_MME_3_MASK BIT_MASK(4)
#define CFG_RST_H_MME_MASK GENMASK(4, 1)
#define CFG_RST_H_NIC_MASK ((1 << CFG_RST_H_NIC_0_SHIFT) | \
(1 << CFG_RST_H_NIC_1_SHIFT) | \
(1 << CFG_RST_H_NIC_2_SHIFT) | \
(1 << CFG_RST_H_NIC_3_SHIFT) | \
(1 << CFG_RST_H_NIC_4_SHIFT))
#define CFG_RST_H_HBM_0_MASK BIT_MASK(5)
#define CFG_RST_H_HBM_1_MASK BIT_MASK(6)
#define CFG_RST_H_HBM_2_MASK BIT_MASK(7)
#define CFG_RST_H_HBM_3_MASK BIT_MASK(8)
#define CFG_RST_H_HBM_MASK GENMASK(8, 5)
#define CFG_RST_H_SM_MASK ((1 << CFG_RST_H_SM_0_SHIFT) | \
(1 << CFG_RST_H_SM_1_SHIFT) | \
(1 << CFG_RST_H_SM_2_SHIFT) | \
(1 << CFG_RST_H_SM_3_SHIFT))
#define CFG_RST_H_NIC_0_MASK BIT_MASK(9)
#define CFG_RST_H_NIC_1_MASK BIT_MASK(10)
#define CFG_RST_H_NIC_2_MASK BIT_MASK(11)
#define CFG_RST_H_NIC_3_MASK BIT_MASK(12)
#define CFG_RST_H_NIC_4_MASK BIT_MASK(13)
#define CFG_RST_H_NIC_MASK GENMASK(13, 9)
#define CFG_RST_H_MME_MASK ((1 << CFG_RST_H_MME_0_SHIFT) | \
(1 << CFG_RST_H_MME_1_SHIFT) | \
(1 << CFG_RST_H_MME_2_SHIFT) | \
(1 << CFG_RST_H_MME_3_SHIFT))
#define CFG_RST_H_SM_0_MASK BIT_MASK(14)
#define CFG_RST_H_SM_1_MASK BIT_MASK(15)
#define CFG_RST_H_SM_2_MASK BIT_MASK(16)
#define CFG_RST_H_SM_3_MASK BIT_MASK(17)
#define CFG_RST_H_SM_MASK GENMASK(17, 14)
#define CFG_RST_L_PSOC_MASK (1 << CFG_RST_L_PSOC_SHIFT)
#define CFG_RST_H_DMA_0_MASK BIT_MASK(18)
#define CFG_RST_H_DMA_1_MASK BIT_MASK(19)
#define CFG_RST_H_DMA_MASK GENMASK(19, 18)
#define CFG_RST_L_IF_MASK ((1 << CFG_RST_L_IF_0_SHIFT) | \
(1 << CFG_RST_L_IF_1_SHIFT) | \
(1 << CFG_RST_L_IF_2_SHIFT) | \
(1 << CFG_RST_L_IF_3_SHIFT))
#define CFG_RST_L_TPC_MASK ((1 << CFG_RST_L_TPC_0_SHIFT) | \
(1 << CFG_RST_L_TPC_1_SHIFT) | \
(1 << CFG_RST_L_TPC_2_SHIFT) | \
(1 << CFG_RST_L_TPC_3_SHIFT) | \
(1 << CFG_RST_L_TPC_4_SHIFT) | \
(1 << CFG_RST_L_TPC_5_SHIFT) | \
(1 << CFG_RST_L_TPC_6_SHIFT))
#define CFG_RST_H_TPC_MASK (1 << CFG_RST_H_TPC_7_SHIFT)
#define CA53_RESET (1 << CFG_RST_H_CPU_SHIFT)
#define CFG_RST_H_CPU_MASK BIT_MASK(20)
#define CFG_RST_H_MMU_MASK BIT_MASK(21)
#define UNIT_RST_L_PSOC_SHIFT 0
#define UNIT_RST_L_PCIE_SHIFT 1

View File

@ -12,6 +12,7 @@
* PSOC scratch-pad registers
*/
#define mmHW_STATE mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
#define mmCPU_CMD_STATUS_TO_HOST mmPSOC_GLOBAL_CONF_SCRATCHPAD_23
#define mmCPU_BOOT_ERR0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_24
#define mmCPU_BOOT_ERR1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_25

View File

@ -22,6 +22,7 @@
#define mmCPU_CQ_BASE_ADDR_LOW mmPSOC_GLOBAL_CONF_SCRATCHPAD_8
#define mmCPU_CQ_BASE_ADDR_HIGH mmPSOC_GLOBAL_CONF_SCRATCHPAD_9
#define mmCPU_CQ_LENGTH mmPSOC_GLOBAL_CONF_SCRATCHPAD_10
#define mmFUSE_VER_OFFSET mmPSOC_GLOBAL_CONF_SCRATCHPAD_22
#define mmCPU_CMD_STATUS_TO_HOST mmPSOC_GLOBAL_CONF_SCRATCHPAD_23
#define mmCPU_BOOT_ERR0 mmPSOC_GLOBAL_CONF_SCRATCHPAD_24
#define mmCPU_BOOT_ERR1 mmPSOC_GLOBAL_CONF_SCRATCHPAD_25

View File

@ -29,6 +29,8 @@
#define HOP3_SHIFT 21
#define HOP4_SHIFT 12
#define MMU_ARCH_5_HOPS 5
#define HOP_PHYS_ADDR_MASK (~FLAGS_MASK)
#define HL_PTE_SIZE sizeof(u64)

View File

@ -264,6 +264,10 @@ enum hl_device_status {
* HL_INFO_TIME_SYNC - Retrieve the device's time alongside the host's time
* for synchronization.
* HL_INFO_CS_COUNTERS - Retrieve command submission counters
* HL_INFO_PCI_COUNTERS - Retrieve PCI counters
* HL_INFO_CLK_THROTTLE_REASON - Retrieve clock throttling reason
* HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore
* HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@ -276,6 +280,10 @@ enum hl_device_status {
#define HL_INFO_RESET_COUNT 9
#define HL_INFO_TIME_SYNC 10
#define HL_INFO_CS_COUNTERS 11
#define HL_INFO_PCI_COUNTERS 12
#define HL_INFO_CLK_THROTTLE_REASON 13
#define HL_INFO_SYNC_MANAGER 14
#define HL_INFO_TOTAL_ENERGY 15
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@ -289,7 +297,7 @@ struct hl_info_hw_ip_info {
__u32 device_id; /* PCI Device ID */
__u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */
__u32 reserved[2];
__u32 armcp_cpld_version;
__u32 cpld_version;
__u32 psoc_pci_pll_nr;
__u32 psoc_pci_pll_nf;
__u32 psoc_pci_pll_od;
@ -297,7 +305,7 @@ struct hl_info_hw_ip_info {
__u8 tpc_enabled_mask;
__u8 dram_enabled;
__u8 pad[2];
__u8 armcp_version[HL_INFO_VERSION_MAX_LEN];
__u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
};
@ -313,6 +321,12 @@ struct hl_info_hw_idle {
* Bits definition is according to `enum <chip>_enging_id'.
*/
__u32 busy_engines_mask;
/*
* Extended Bitmask of busy engines.
* Bits definition is according to `enum <chip>_enging_id'.
*/
__u64 busy_engines_mask_ext;
};
struct hl_info_device_status {
@ -340,18 +354,61 @@ struct hl_info_time_sync {
__u64 host_time;
};
/**
* struct hl_info_pci_counters - pci counters
* @rx_throughput: PCI rx throughput KBps
* @tx_throughput: PCI tx throughput KBps
* @replay_cnt: PCI replay counter
*/
struct hl_info_pci_counters {
__u64 rx_throughput;
__u64 tx_throughput;
__u64 replay_cnt;
};
#define HL_CLK_THROTTLE_POWER 0x1
#define HL_CLK_THROTTLE_THERMAL 0x2
/**
* struct hl_info_clk_throttle - clock throttling reason
* @clk_throttling_reason: each bit represents a clk throttling reason
*/
struct hl_info_clk_throttle {
__u32 clk_throttling_reason;
};
/**
* struct hl_info_energy - device energy information
* @total_energy_consumption: total device energy consumption
*/
struct hl_info_energy {
__u64 total_energy_consumption;
};
/**
* struct hl_info_sync_manager - sync manager information
* @first_available_sync_object: first available sob
* @first_available_monitor: first available monitor
*/
struct hl_info_sync_manager {
__u32 first_available_sync_object;
__u32 first_available_monitor;
};
/**
* struct hl_info_cs_counters - command submission counters
* @out_of_mem_drop_cnt: dropped due to memory allocation issue
* @parsing_drop_cnt: dropped due to error in packet parsing
* @queue_full_drop_cnt: dropped due to queue full
* @device_in_reset_drop_cnt: dropped due to device in reset
* @max_cs_in_flight_drop_cnt: dropped due to maximum CS in-flight
*/
struct hl_cs_counters {
__u64 out_of_mem_drop_cnt;
__u64 parsing_drop_cnt;
__u64 queue_full_drop_cnt;
__u64 device_in_reset_drop_cnt;
__u64 max_cs_in_flight_drop_cnt;
};
struct hl_info_cs_counters {
@ -359,6 +416,13 @@ struct hl_info_cs_counters {
struct hl_cs_counters ctx_cs_counters;
};
enum gaudi_dcores {
HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE,
HL_GAUDI_EN_DCORE,
HL_GAUDI_ES_DCORE
};
struct hl_info_args {
/* Location of relevant struct in userspace */
__u64 return_pointer;
@ -375,6 +439,10 @@ struct hl_info_args {
__u32 op;
union {
/* Dcore id for which the information is relevant.
* For Gaudi refer to 'enum gaudi_dcores'
*/
__u32 dcore_id;
/* Context ID - Currently not in use */
__u32 ctx_id;
/* Period value for utilization rate (100ms - 1000ms, in 100ms
@ -394,6 +462,9 @@ struct hl_info_args {
/* 2MB minus 32 bytes for 2xMSG_PROT */
#define HL_MAX_CB_SIZE (0x200000 - 32)
/* Indicates whether the command buffer should be mapped to the device's MMU */
#define HL_CB_FLAGS_MAP 0x1
struct hl_cb_in {
/* Handle of CB or 0 if we want to create one */
__u64 cb_handle;
@ -405,7 +476,8 @@ struct hl_cb_in {
__u32 cb_size;
/* Context ID - Currently not in use */
__u32 ctx_id;
__u32 pad;
/* HL_CB_FLAGS_* */
__u32 flags;
};
struct hl_cb_out {
@ -788,6 +860,12 @@ struct hl_debug_args {
* When creating a new CB, the IOCTL returns a handle of it, and the user-space
* process needs to use that handle to mmap the buffer so it can access them.
*
* In some instances, the device must access the command buffer through the
* device's MMU, and thus its memory should be mapped. In these cases, user can
* indicate the driver that such a mapping is required.
* The resulting device virtual address will be used internally by the driver,
* and won't be returned to user.
*
*/
#define HL_IOCTL_CB \
_IOWR('H', 0x02, union hl_cb_args)
@ -846,6 +924,9 @@ struct hl_debug_args {
* inside the kernel until the CS has finished or until the user-requested
* timeout has expired.
*
* If the timeout value is 0, the driver won't sleep at all. It will check
* the status of the CS and return immediately
*
* The return value of the IOCTL is a standard Linux error code. The possible
* values are:
*