habanalabs: Add a new H/W queue type
author Tomer Tayar <ttayar@habana.ai>
Thu, 3 Oct 2019 15:22:36 +0000 (15:22 +0000)
committer Oded Gabbay <oded.gabbay@gmail.com>
Thu, 21 Nov 2019 09:35:45 +0000 (11:35 +0200)
This patch adds support for a new H/W queue type.
This type of queue is for DMA and compute engine jobs, for which
completion notifications are sent by H/W.
A command buffer for this queue can be created either through the CB
IOCTL, using the retrieved CB handle, or by preparing a buffer on the
host or in device SRAM/DRAM and using the device address of that buffer.
The patch includes the handling of both options, as well as the
initialization of the H/W queue and its job scheduling.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/misc/habanalabs/command_submission.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/hw_queue.c
drivers/misc/habanalabs/include/qman_if.h

index f44205540520f97cf7c5ae0223ec8ecae924b877..776ddafc47fb282f0edb48c21d39d4052e74a3c1 100644 (file)
@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
        kref_put(&cs->refcount, cs_do_release);
 }
 
+static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
+{
+       /*
+        * A patched CB is created for external queue jobs, and for H/W queue
+        * jobs if the user CB was allocated by the driver and MMU is disabled.
+        */
+       return (job->queue_type == QUEUE_TYPE_EXT ||
+                       (job->queue_type == QUEUE_TYPE_HW &&
+                                       job->is_kernel_allocated_cb &&
+                                       !hdev->mmu_enable));
+}
+
 /*
  * cs_parser - parse the user command submission
  *
@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
        parser.patched_cb = NULL;
        parser.user_cb = job->user_cb;
        parser.user_cb_size = job->user_cb_size;
-       parser.ext_queue = job->ext_queue;
+       parser.queue_type = job->queue_type;
+       parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
        job->patched_cb = NULL;
 
        rc = hdev->asic_funcs->cs_parser(hdev, &parser);
-       if (job->ext_queue) {
+
+       if (is_cb_patched(hdev, job)) {
                if (!rc) {
                        job->patched_cb = parser.patched_cb;
                        job->job_cb_size = parser.patched_cb_size;
@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
        struct hl_cs *cs = job->cs;
 
-       if (job->ext_queue) {
+       if (is_cb_patched(hdev, job)) {
                hl_userptr_delete_list(hdev, &job->userptr_list);
 
                /*
@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
                }
        }
 
+       /* For H/W queue jobs, if a user CB was allocated by driver and MMU is
+        * enabled, the user CB isn't released in cs_parser() and thus should be
+        * released here.
+        */
+       if (job->queue_type == QUEUE_TYPE_HW &&
+                       job->is_kernel_allocated_cb && hdev->mmu_enable) {
+               spin_lock(&job->user_cb->lock);
+               job->user_cb->cs_cnt--;
+               spin_unlock(&job->user_cb->lock);
+
+               hl_cb_put(job->user_cb);
+       }
+
        /*
         * This is the only place where there can be multiple threads
         * modifying the list at the same time
@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 
        hl_debugfs_remove_job(hdev, job);
 
-       if (job->ext_queue)
+       if (job->queue_type == QUEUE_TYPE_EXT ||
+                       job->queue_type == QUEUE_TYPE_HW)
                cs_put(cs);
 
        kfree(job);
@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
        free_job(hdev, job);
 }
 
-static struct hl_cb *validate_queue_index(struct hl_device *hdev,
-                                       struct hl_cb_mgr *cb_mgr,
-                                       struct hl_cs_chunk *chunk,
-                                       bool *ext_queue)
+static int validate_queue_index(struct hl_device *hdev,
+                               struct hl_cs_chunk *chunk,
+                               enum hl_queue_type *queue_type,
+                               bool *is_kernel_allocated_cb)
 {
        struct asic_fixed_properties *asic = &hdev->asic_prop;
        struct hw_queue_properties *hw_queue_prop;
-       u32 cb_handle;
-       struct hl_cb *cb;
-
-       /* Assume external queue */
-       *ext_queue = true;
 
        hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
 
@@ -406,22 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
                        (hw_queue_prop->type == QUEUE_TYPE_NA)) {
                dev_err(hdev->dev, "Queue index %d is invalid\n",
                        chunk->queue_index);
-               return NULL;
+               return -EINVAL;
        }
 
        if (hw_queue_prop->driver_only) {
                dev_err(hdev->dev,
                        "Queue index %d is restricted for the kernel driver\n",
                        chunk->queue_index);
-               return NULL;
+               return -EINVAL;
        }
 
-       if (!hw_queue_prop->requires_kernel_cb) {
-               *ext_queue = false;
-               return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
-       }
+       *queue_type = hw_queue_prop->type;
+       *is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+
+       return 0;
+}
+
+static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
+                                       struct hl_cb_mgr *cb_mgr,
+                                       struct hl_cs_chunk *chunk)
+{
+       struct hl_cb *cb;
+       u32 cb_handle;
 
-       /* Retrieve CB object */
        cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
 
        cb = hl_cb_get(hdev, cb_mgr, cb_handle);
@@ -446,7 +476,8 @@ release_cb:
        return NULL;
 }
 
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+               enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
 {
        struct hl_cs_job *job;
 
@@ -454,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
        if (!job)
                return NULL;
 
-       job->ext_queue = ext_queue;
+       job->queue_type = queue_type;
+       job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
-       if (job->ext_queue) {
+       if (is_cb_patched(hdev, job))
                INIT_LIST_HEAD(&job->userptr_list);
+
+       if (job->queue_type == QUEUE_TYPE_EXT)
                INIT_WORK(&job->finish_work, job_wq_completion);
-       }
 
        return job;
 }
@@ -472,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
        struct hl_cs_job *job;
        struct hl_cs *cs;
        struct hl_cb *cb;
-       bool ext_queue_present = false;
+       bool int_queues_only = true;
        u32 size_to_copy;
        int rc, i, parse_cnt;
 
@@ -516,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
        /* Validate ALL the CS chunks before submitting the CS */
        for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
                struct hl_cs_chunk *chunk = &cs_chunk_array[i];
-               bool ext_queue;
+               enum hl_queue_type queue_type;
+               bool is_kernel_allocated_cb;
 
-               cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
-                                       &ext_queue);
-               if (ext_queue) {
-                       ext_queue_present = true;
+               rc = validate_queue_index(hdev, chunk, &queue_type,
+                                               &is_kernel_allocated_cb);
+               if (rc)
+                       goto free_cs_object;
+
+               if (is_kernel_allocated_cb) {
+                       cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
                        if (!cb) {
                                rc = -EINVAL;
                                goto free_cs_object;
                        }
+               } else {
+                       cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
                }
 
-               job = hl_cs_allocate_job(hdev, ext_queue);
+               if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+                       int_queues_only = false;
+
+               job = hl_cs_allocate_job(hdev, queue_type,
+                                               is_kernel_allocated_cb);
                if (!job) {
                        dev_err(hdev->dev, "Failed to allocate a new job\n");
                        rc = -ENOMEM;
-                       if (ext_queue)
+                       if (is_kernel_allocated_cb)
                                goto release_cb;
                        else
                                goto free_cs_object;
@@ -542,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
                job->cs = cs;
                job->user_cb = cb;
                job->user_cb_size = chunk->cb_size;
-               if (job->ext_queue)
+               if (is_kernel_allocated_cb)
                        job->job_cb_size = cb->size;
                else
                        job->job_cb_size = chunk->cb_size;
@@ -555,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
                /*
                 * Increment CS reference. When CS reference is 0, CS is
                 * done and can be signaled to user and free all its resources
-                * Only increment for JOB on external queues, because only
-                * for those JOBs we get completion
+                * Only increment for JOB on external or H/W queues, because
+                * only for those JOBs we get completion
                 */
-               if (job->ext_queue)
+               if (job->queue_type == QUEUE_TYPE_EXT ||
+                               job->queue_type == QUEUE_TYPE_HW)
                        cs_get(cs);
 
                hl_debugfs_add_job(hdev, job);
@@ -572,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
                }
        }
 
-       if (!ext_queue_present) {
+       if (int_queues_only) {
                dev_err(hdev->dev,
-                       "Reject CS %d.%llu because no external queues jobs\n",
+                       "Reject CS %d.%llu because only internal queues jobs are present\n",
                        cs->ctx->asid, cs->sequence);
                rc = -EINVAL;
                goto free_cs_object;
index 71693fcffb1634badebcd95cc306c21d18171627..0b40915bede2bcf14d373dbd033436a1c828a44a 100644 (file)
@@ -3943,7 +3943,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
 {
        struct goya_device *goya = hdev->asic_specific;
 
-       if (!parser->ext_queue)
+       if (parser->queue_type == QUEUE_TYPE_INT)
                return goya_parse_cb_no_ext_queue(hdev, parser);
 
        if (goya->hw_cap_initialized & HW_CAP_MMU)
@@ -4614,7 +4614,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
                lin_dma_pkt++;
        } while (--lin_dma_pkts_cnt);
 
-       job = hl_cs_allocate_job(hdev, true);
+       job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
        if (!job) {
                dev_err(hdev->dev, "Failed to allocate a new job\n");
                rc = -ENOMEM;
index f47f4b22cb6b54c268abdf4f68be4a878e7765f5..371d1ec15697227cd7eeec2d1b4e73f939375af9 100644 (file)
@@ -85,12 +85,15 @@ struct hl_fpriv;
  * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
  *                     memories and/or operates the compute engines.
  * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
+ * @QUEUE_TYPE_HW: queue of DMA and compute engines jobs, for which completion
+ *                 notifications are sent by H/W.
  */
 enum hl_queue_type {
        QUEUE_TYPE_NA,
        QUEUE_TYPE_EXT,
        QUEUE_TYPE_INT,
-       QUEUE_TYPE_CPU
+       QUEUE_TYPE_CPU,
+       QUEUE_TYPE_HW
 };
 
 /**
@@ -755,11 +758,14 @@ struct hl_cs {
  * @userptr_list: linked-list of userptr mappings that belong to this job and
  *                     wait for completion.
  * @debugfs_list: node in debugfs list of command submission jobs.
+ * @queue_type: the type of the H/W queue this job is submitted to.
  * @id: the id of this job inside a CS.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @job_cb_size: the actual size of the CB that we put on the queue.
- * @ext_queue: whether the job is for external queue or internal queue.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ *                          handle to a kernel-allocated CB object, false
+ *                          otherwise (SRAM/DRAM/host address).
  */
 struct hl_cs_job {
        struct list_head        cs_node;
@@ -769,11 +775,12 @@ struct hl_cs_job {
        struct work_struct      finish_work;
        struct list_head        userptr_list;
        struct list_head        debugfs_list;
+       enum hl_queue_type      queue_type;
        u32                     id;
        u32                     hw_queue_id;
        u32                     user_cb_size;
        u32                     job_cb_size;
-       u8                      ext_queue;
+       u8                      is_kernel_allocated_cb;
 };
 
 /**
@@ -784,24 +791,28 @@ struct hl_cs_job {
  * @job_userptr_list: linked-list of userptr mappings that belong to the related
  *                     job and wait for completion.
  * @cs_sequence: the sequence number of the related CS.
+ * @queue_type: the type of the H/W queue this job is submitted to.
  * @ctx_id: the ID of the context the related CS belongs to.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @patched_cb_size: the size of the CB after parsing.
- * @ext_queue: whether the job is for external queue or internal queue.
  * @job_id: the id of the related job inside the related CS.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ *                          handle to a kernel-allocated CB object, false
+ *                          otherwise (SRAM/DRAM/host address).
  */
 struct hl_cs_parser {
        struct hl_cb            *user_cb;
        struct hl_cb            *patched_cb;
        struct list_head        *job_userptr_list;
        u64                     cs_sequence;
+       enum hl_queue_type      queue_type;
        u32                     ctx_id;
        u32                     hw_queue_id;
        u32                     user_cb_size;
        u32                     patched_cb_size;
-       u8                      ext_queue;
        u8                      job_id;
+       u8                      is_kernel_allocated_cb;
 };
 
 
@@ -1504,7 +1515,8 @@ int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+               enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 
index f733b534f73805eb181e22952027d988a440f482..91579dde9262ae2925522c6ae11acf35597c7def 100644 (file)
@@ -58,8 +58,8 @@ out:
 }
 
 /*
- * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
- *
+ * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
+ *                                H/W queue.
  * @hdev: pointer to habanalabs device structure
  * @q: pointer to habanalabs queue structure
  * @ctl: BD's control word
@@ -73,8 +73,8 @@ out:
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
-                               u32 ctl, u32 len, u64 ptr)
+static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
+                       struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
 {
        struct hl_bd *bd;
 
@@ -173,6 +173,45 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
        return 0;
 }
 
+/*
+ * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
+ * @hdev: Pointer to hl_device structure.
+ * @q: Pointer to hl_hw_queue structure.
+ * @num_of_entries: How many entries to check for space.
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the completion queue.
+ *   This check also ensures that there is enough space in the h/w queue, as
+ *   both queues are of the same size.
+ * - Reserve space in the completion queue (needs to be reversed if there
+ *   is a failure down the road before the actual submission of work).
+ *
+ * Both operations use the "free_slots_cnt" field of the completion queue; the
+ * CI counters of both queues are not needed/used for the H/W queue type.
+ * Return: 0 on success, -EAGAIN if the completion queue lacks free slots.
+ */
+static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
+                                       int num_of_entries)
+{
+       atomic_t *free_slots =
+                       &hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
+
+       /*
+        * Check we have enough space in the completion queue.
+        * Subtract num_of_entries from the free slots counter to reserve them.
+        * A negative result means the CQ doesn't have enough room, so restore
+        * the counter and reject the submission.
+        */
+       if (atomic_add_negative(num_of_entries * -1, free_slots)) {
+               dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
+                       num_of_entries, q->hw_queue_id);
+               atomic_add(num_of_entries, free_slots);
+               return -EAGAIN;
+       }
+
+       return 0;
+}
+
 /*
  * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
  *
@@ -188,7 +227,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
                                u32 cb_size, u64 cb_ptr)
 {
        struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
-       int rc;
+       int rc = 0;
 
        /*
         * The CPU queue is a synchronous queue with an effective depth of
@@ -206,11 +245,18 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
                goto out;
        }
 
-       rc = ext_queue_sanity_checks(hdev, q, 1, false);
-       if (rc)
-               goto out;
+       /*
+        * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
+        * type only on init phase, when the queues are empty and being tested,
+        * so there is no need for sanity checks.
+        */
+       if (q->queue_type != QUEUE_TYPE_HW) {
+               rc = ext_queue_sanity_checks(hdev, q, 1, false);
+               if (rc)
+                       goto out;
+       }
 
-       ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+       ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
 
 out:
        if (q->queue_type != QUEUE_TYPE_CPU)
@@ -220,14 +266,14 @@ out:
 }
 
 /*
- * ext_hw_queue_schedule_job - submit a JOB to an external queue
+ * ext_queue_schedule_job - submit a JOB to an external queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
  *
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+static void ext_queue_schedule_job(struct hl_cs_job *job)
 {
        struct hl_device *hdev = job->cs->ctx->hdev;
        struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -260,7 +306,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
         * H/W queues is done under the scheduler mutex
         *
         * No need to check if CQ is full because it was already
-        * checked in hl_queue_sanity_checks
+        * checked in ext_queue_sanity_checks
         */
        cq = &hdev->completion_queue[q->hw_queue_id];
        cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
@@ -274,18 +320,18 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 
        cq->pi = hl_cq_inc_ptr(cq->pi);
 
-       ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+       ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 /*
- * int_hw_queue_schedule_job - submit a JOB to an internal queue
+ * int_queue_schedule_job - submit a JOB to an internal queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
  *
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+static void int_queue_schedule_job(struct hl_cs_job *job)
 {
        struct hl_device *hdev = job->cs->ctx->hdev;
        struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -307,6 +353,60 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job)
        hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
 }
 
+/*
+ * hw_queue_schedule_job - submit a JOB to a H/W queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void hw_queue_schedule_job(struct hl_cs_job *job)
+{
+       struct hl_device *hdev = job->cs->ctx->hdev;
+       struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+       struct hl_cq *cq;
+       u64 ptr;
+       u32 offset, ctl, len;
+
+       /*
+        * Upon PQE completion, COMP_DATA is used as the write data to the
+        * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
+        * write address offset in the SM block (QMAN LBW message).
+        * The write address offset is calculated as "COMP_OFFSET << 2".
+        */
+       offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+       ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
+               ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
+
+       len = job->job_cb_size;
+
+       /*
+        * A patched CB is created only if a user CB was allocated by driver and
+        * MMU is disabled. If MMU is enabled, the user CB should be used
+        * instead. If the user CB wasn't allocated by driver, assume that it
+        * holds an address.
+        */
+       if (job->patched_cb)
+               ptr = job->patched_cb->bus_address;
+       else if (job->is_kernel_allocated_cb)
+               ptr = job->user_cb->bus_address;
+       else
+               ptr = (u64) (uintptr_t) job->user_cb;
+
+       /*
+        * No need to protect the CQ PI update because scheduling to the
+        * H/W queues is done under the scheduler mutex
+        *
+        * No need to check if CQ is full because it was already
+        * checked in hw_queue_sanity_checks
+        */
+       cq = &hdev->completion_queue[q->hw_queue_id];
+       cq->pi = hl_cq_inc_ptr(cq->pi);
+
+       ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
 /*
  * hl_hw_queue_schedule_cs - schedule a command submission
  *
@@ -330,23 +430,34 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
        }
 
        q = &hdev->kernel_queues[0];
-       /* This loop assumes all external queues are consecutive */
        for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
-               if (q->queue_type == QUEUE_TYPE_EXT) {
-                       if (cs->jobs_in_queue_cnt[i]) {
+               if (cs->jobs_in_queue_cnt[i]) {
+                       switch (q->queue_type) {
+                       case QUEUE_TYPE_EXT:
                                rc = ext_queue_sanity_checks(hdev, q,
-                                       cs->jobs_in_queue_cnt[i], true);
-                               if (rc)
-                                       goto unroll_cq_resv;
-                               cq_cnt++;
-                       }
-               } else if (q->queue_type == QUEUE_TYPE_INT) {
-                       if (cs->jobs_in_queue_cnt[i]) {
+                                               cs->jobs_in_queue_cnt[i], true);
+                               break;
+                       case QUEUE_TYPE_INT:
                                rc = int_queue_sanity_checks(hdev, q,
-                                       cs->jobs_in_queue_cnt[i]);
-                               if (rc)
-                                       goto unroll_cq_resv;
+                                               cs->jobs_in_queue_cnt[i]);
+                               break;
+                       case QUEUE_TYPE_HW:
+                               rc = hw_queue_sanity_checks(hdev, q,
+                                               cs->jobs_in_queue_cnt[i]);
+                               break;
+                       default:
+                               dev_err(hdev->dev, "Queue type %d is invalid\n",
+                                       q->queue_type);
+                               rc = -EINVAL;
+                               break;
                        }
+
+                       if (rc)
+                               goto unroll_cq_resv;
+
+                       if (q->queue_type == QUEUE_TYPE_EXT ||
+                                       q->queue_type == QUEUE_TYPE_HW)
+                               cq_cnt++;
                }
        }
 
@@ -373,21 +484,30 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
        }
 
        list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-               if (job->ext_queue)
-                       ext_hw_queue_schedule_job(job);
-               else
-                       int_hw_queue_schedule_job(job);
+               switch (job->queue_type) {
+               case QUEUE_TYPE_EXT:
+                       ext_queue_schedule_job(job);
+                       break;
+               case QUEUE_TYPE_INT:
+                       int_queue_schedule_job(job);
+                       break;
+               case QUEUE_TYPE_HW:
+                       hw_queue_schedule_job(job);
+                       break;
+               default:
+                       break;
+               }
 
        cs->submitted = true;
 
        goto out;
 
 unroll_cq_resv:
-       /* This loop assumes all external queues are consecutive */
        q = &hdev->kernel_queues[0];
        for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
-               if ((q->queue_type == QUEUE_TYPE_EXT) &&
-                               (cs->jobs_in_queue_cnt[i])) {
+               if ((q->queue_type == QUEUE_TYPE_EXT ||
+                               q->queue_type == QUEUE_TYPE_HW) &&
+                               cs->jobs_in_queue_cnt[i]) {
                        atomic_t *free_slots =
                                &hdev->completion_queue[i].free_slots_cnt;
                        atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
@@ -414,8 +534,8 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
        q->ci = hl_queue_inc_ptr(q->ci);
 }
 
-static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
-                               struct hl_hw_queue *q, bool is_cpu_queue)
+static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+                                       bool is_cpu_queue)
 {
        void *p;
        int rc;
@@ -465,7 +585,7 @@ free_queue:
        return rc;
 }
 
-static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
        void *p;
 
@@ -485,18 +605,38 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
        return 0;
 }
 
-static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+       return ext_and_cpu_queue_init(hdev, q, true);
+}
+
+static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
-       return ext_and_cpu_hw_queue_init(hdev, q, true);
+       return ext_and_cpu_queue_init(hdev, q, false);
 }
 
-static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
-       return ext_and_cpu_hw_queue_init(hdev, q, false);
+       void *p;
+
+       p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
+                                               HL_QUEUE_SIZE_IN_BYTES,
+                                               &q->bus_address,
+                                               GFP_KERNEL | __GFP_ZERO);
+       if (!p)
+               return -ENOMEM;
+
+       q->kernel_address = (u64) (uintptr_t) p;
+
+       /* Make sure read/write pointers are initialized to start of queue */
+       q->ci = 0;
+       q->pi = 0;
+
+       return 0;
 }
 
 /*
- * hw_queue_init - main initialization function for H/W queue object
+ * queue_init - main initialization function for H/W queue object
  *
  * @hdev: pointer to hl_device device structure
  * @q: pointer to hl_hw_queue queue structure
@@ -505,7 +645,7 @@ static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
  * Allocate dma-able memory for the queue and initialize fields
  * Returns 0 on success
  */
-static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
                        u32 hw_queue_id)
 {
        int rc;
@@ -516,21 +656,20 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 
        switch (q->queue_type) {
        case QUEUE_TYPE_EXT:
-               rc = ext_hw_queue_init(hdev, q);
+               rc = ext_queue_init(hdev, q);
                break;
-
        case QUEUE_TYPE_INT:
-               rc = int_hw_queue_init(hdev, q);
+               rc = int_queue_init(hdev, q);
                break;
-
        case QUEUE_TYPE_CPU:
-               rc = cpu_hw_queue_init(hdev, q);
+               rc = cpu_queue_init(hdev, q);
+               break;
+       case QUEUE_TYPE_HW:
+               rc = hw_queue_init(hdev, q);
                break;
-
        case QUEUE_TYPE_NA:
                q->valid = 0;
                return 0;
-
        default:
                dev_crit(hdev->dev, "wrong queue type %d during init\n",
                        q->queue_type);
@@ -554,7 +693,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
  *
  * Free the queue memory
  */
-static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
+static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
 {
        if (!q->valid)
                return;
@@ -612,7 +751,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
                        i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
 
                q->queue_type = asic->hw_queues_props[i].type;
-               rc = hw_queue_init(hdev, q, i);
+               rc = queue_init(hdev, q, i);
                if (rc) {
                        dev_err(hdev->dev,
                                "failed to initialize queue %d\n", i);
@@ -624,7 +763,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 
 release_queues:
        for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
-               hw_queue_fini(hdev, q);
+               queue_fini(hdev, q);
 
        kfree(hdev->kernel_queues);
 
@@ -637,7 +776,7 @@ void hl_hw_queues_destroy(struct hl_device *hdev)
        int i;
 
        for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
-               hw_queue_fini(hdev, q);
+               queue_fini(hdev, q);
 
        kfree(hdev->kernel_queues);
 }
index bf59bbe27fdcdf7d1d84e585720572fcb40d0857..0fdb49188ed7cda05f766e1247c21e03de8b828a 100644 (file)
@@ -23,6 +23,8 @@ struct hl_bd {
 #define HL_BD_SIZE                     sizeof(struct hl_bd)
 
 /*
+ * S/W CTL FIELDS.
+ *
  * BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
  * valid. 1 means the repeat field is valid, 0 means not-valid,
  * i.e. repeat == 1
@@ -33,6 +35,16 @@ struct hl_bd {
 #define BD_CTL_SHADOW_INDEX_SHIFT      0
 #define BD_CTL_SHADOW_INDEX_MASK       0x00000FFF
 
+/*
+ * H/W CTL FIELDS
+ */
+
+#define BD_CTL_COMP_OFFSET_SHIFT       16
+#define BD_CTL_COMP_OFFSET_MASK                0x00FF0000
+
+#define BD_CTL_COMP_DATA_SHIFT         0
+#define BD_CTL_COMP_DATA_MASK          0x0000FFFF
+
 /*
  * COMPLETION QUEUE
  */