Merge tag 'perf-tools-for-v5.15-2021-09-11' of git://git.kernel.org/pub/scm/linux...
[linux-2.6-microblaze.git] / drivers / misc / habanalabs / gaudi / gaudi.c
index aa8a0ca..383865b 100644 (file)
@@ -76,7 +76,7 @@
 #define GAUDI_PLDM_MMU_TIMEOUT_USEC    (MMU_CONFIG_TIMEOUT_USEC * 100)
 #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC  (HL_DEVICE_TIMEOUT_USEC * 30)
 #define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC        (HL_DEVICE_TIMEOUT_USEC * 30)
-#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC        1000000         /* 1s */
+#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC        4000000         /* 4s */
 #define GAUDI_MSG_TO_CPU_TIMEOUT_USEC  4000000         /* 4s */
 #define GAUDI_WAIT_FOR_BL_TIMEOUT_USEC 15000000        /* 15s */
 
 
 #define GAUDI_PLL_MAX 10
 
+#define BIN_REG_STRING_SIZE    sizeof("0b10101010101010101010101010101010")
+
+#define MONITOR_SOB_STRING_SIZE                256
+
+static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = { /* the DMA_0_x and DMA_1_x queue IDs */
+       GAUDI_QUEUE_ID_DMA_0_0,
+       GAUDI_QUEUE_ID_DMA_0_1,
+       GAUDI_QUEUE_ID_DMA_0_2,
+       GAUDI_QUEUE_ID_DMA_0_3,
+       GAUDI_QUEUE_ID_DMA_1_0,
+       GAUDI_QUEUE_ID_DMA_1_1,
+       GAUDI_QUEUE_ID_DMA_1_2,
+       GAUDI_QUEUE_ID_DMA_1_3
+}; /* presumably what get_stream_master_qid_arr() returns -- TODO confirm */
+
 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
                "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
                "gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -348,6 +363,97 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
        QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */
 };
 
+static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = { /* sync object id -> name; ids 0..26, contiguous */
+       { .id = 0,  .name = "SYNC_OBJ_DMA_DOWN_FEEDBACK" },
+       { .id = 1,  .name = "SYNC_OBJ_DMA_UP_FEEDBACK" },
+       { .id = 2,  .name = "SYNC_OBJ_DMA_STATIC_DRAM_SRAM_FEEDBACK" },
+       { .id = 3,  .name = "SYNC_OBJ_DMA_SRAM_DRAM_FEEDBACK" },
+       { .id = 4,  .name = "SYNC_OBJ_FIRST_COMPUTE_FINISH" },
+       { .id = 5,  .name = "SYNC_OBJ_HOST_DRAM_DONE" },
+       { .id = 6,  .name = "SYNC_OBJ_DBG_CTR_DEPRECATED" },
+       { .id = 7,  .name = "SYNC_OBJ_DMA_ACTIVATIONS_DRAM_SRAM_FEEDBACK" },
+       { .id = 8,  .name = "SYNC_OBJ_ENGINE_SEM_MME_0" },
+       { .id = 9,  .name = "SYNC_OBJ_ENGINE_SEM_MME_1" },
+       { .id = 10, .name = "SYNC_OBJ_ENGINE_SEM_TPC_0" },
+       { .id = 11, .name = "SYNC_OBJ_ENGINE_SEM_TPC_1" },
+       { .id = 12, .name = "SYNC_OBJ_ENGINE_SEM_TPC_2" },
+       { .id = 13, .name = "SYNC_OBJ_ENGINE_SEM_TPC_3" },
+       { .id = 14, .name = "SYNC_OBJ_ENGINE_SEM_TPC_4" },
+       { .id = 15, .name = "SYNC_OBJ_ENGINE_SEM_TPC_5" },
+       { .id = 16, .name = "SYNC_OBJ_ENGINE_SEM_TPC_6" },
+       { .id = 17, .name = "SYNC_OBJ_ENGINE_SEM_TPC_7" },
+       { .id = 18, .name = "SYNC_OBJ_ENGINE_SEM_DMA_1" },
+       { .id = 19, .name = "SYNC_OBJ_ENGINE_SEM_DMA_2" },
+       { .id = 20, .name = "SYNC_OBJ_ENGINE_SEM_DMA_3" },
+       { .id = 21, .name = "SYNC_OBJ_ENGINE_SEM_DMA_4" },
+       { .id = 22, .name = "SYNC_OBJ_ENGINE_SEM_DMA_5" },
+       { .id = 23, .name = "SYNC_OBJ_ENGINE_SEM_DMA_6" },
+       { .id = 24, .name = "SYNC_OBJ_ENGINE_SEM_DMA_7" },
+       { .id = 25, .name = "SYNC_OBJ_DBG_CTR_0" },
+       { .id = 26, .name = "SYNC_OBJ_DBG_CTR_1" },
+};
+
+static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = { /* monitor id -> name; ids 200..211, no entry for 202 */
+       { .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" },
+       { .id = 201, .name = "MON_OBJ_DMA_UP_FEADBACK_RESET" }, /* NOTE(review): "FEADBACK" looks like a typo for "FEEDBACK" */
+       { .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" },
+       { .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" },
+       { .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" },
+       { .id = 206, .name = "MON_OBJ_TPC_2_CLK_GATE" },
+       { .id = 207, .name = "MON_OBJ_TPC_3_CLK_GATE" },
+       { .id = 208, .name = "MON_OBJ_TPC_4_CLK_GATE" },
+       { .id = 209, .name = "MON_OBJ_TPC_5_CLK_GATE" },
+       { .id = 210, .name = "MON_OBJ_TPC_6_CLK_GATE" },
+       { .id = 211, .name = "MON_OBJ_TPC_7_CLK_GATE" },
+};
+
+static s64 gaudi_state_dump_specs_props[] = { /* indexed by SP_*; register addresses, register distances and counts */
+       [SP_SYNC_OBJ_BASE_ADDR] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0,
+       [SP_NEXT_SYNC_OBJ_ADDR] = NEXT_SYNC_OBJ_ADDR_INTERVAL,
+       [SP_SYNC_OBJ_AMOUNT] = NUM_OF_SOB_IN_BLOCK,
+       [SP_MON_OBJ_WR_ADDR_LOW] =
+               mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0,
+       [SP_MON_OBJ_WR_ADDR_HIGH] =
+               mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0,
+       [SP_MON_OBJ_WR_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0,
+       [SP_MON_OBJ_ARM_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0,
+       [SP_MON_OBJ_STATUS] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0,
+       [SP_MONITORS_AMOUNT] = NUM_OF_MONITORS_IN_BLOCK,
+       [SP_TPC0_CMDQ] = mmTPC0_QM_GLBL_CFG0,
+       [SP_TPC0_CFG_SO] = mmTPC0_CFG_QM_SYNC_OBJECT_ADDR,
+       [SP_NEXT_TPC] = mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0, /* TPC0 -> TPC1 register distance */
+       [SP_MME_CMDQ] = mmMME0_QM_GLBL_CFG0,
+       [SP_MME_CFG_SO] = mmMME0_CTRL_ARCH_DESC_SYNC_OBJECT_ADDR_LOW_LOCAL,
+       [SP_NEXT_MME] = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0, /* MME0 -> MME2 distance (not MME1) */
+       [SP_DMA_CMDQ] = mmDMA0_QM_GLBL_CFG0,
+       [SP_DMA_CFG_SO] = mmDMA0_CORE_WR_COMP_ADDR_LO,
+       [SP_DMA_QUEUES_OFFSET] = mmDMA1_QM_GLBL_CFG0 - mmDMA0_QM_GLBL_CFG0, /* DMA0 -> DMA1 register distance */
+       [SP_NUM_OF_MME_ENGINES] = NUM_OF_MME_ENGINES,
+       [SP_SUB_MME_ENG_NUM] = NUM_OF_MME_SUB_ENGINES,
+       [SP_NUM_OF_DMA_ENGINES] = NUM_OF_DMA_ENGINES,
+       [SP_NUM_OF_TPC_ENGINES] = NUM_OF_TPC_ENGINES,
+       [SP_ENGINE_NUM_OF_QUEUES] = NUM_OF_QUEUES,
+       [SP_ENGINE_NUM_OF_STREAMS] = NUM_OF_STREAMS,
+       [SP_ENGINE_NUM_OF_FENCES] = NUM_OF_FENCES,
+       [SP_FENCE0_CNT_OFFSET] = /* the three offsets below are relative to mmDMA0_QM_GLBL_CFG0 */
+               mmDMA0_QM_CP_FENCE0_CNT_0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_FENCE0_RDATA_OFFSET] =
+               mmDMA0_QM_CP_FENCE0_RDATA_0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_CP_STS_OFFSET] = mmDMA0_QM_CP_STS_0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_NUM_CORES] = 1,
+};
+
+/* The order here is opposite to the order of the indexing in the h/w.
+ * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
+ */
+static const char * const gaudi_sync_manager_names[] = { /* gaudi_print_sm_sei_info() indexes this after flipping the low two bits */
+       "SYNC_MGR_E_N",
+       "SYNC_MGR_W_N",
+       "SYNC_MGR_E_S",
+       "SYNC_MGR_W_S",
+       NULL /* end-of-table marker */
+};
+
 struct ecc_info_extract_params {
        u64 block_address;
        u32 num_memories;
@@ -363,8 +469,6 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
                                        u32 size, u64 val);
 static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base,
                                        u32 num_regs, u32 val);
-static int gaudi_schedule_register_memset(struct hl_device *hdev,
-               u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val);
 static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
                                u32 tpc_id);
 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
@@ -375,7 +479,6 @@ static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
                                u32 size, bool eb);
 static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
                                struct hl_gen_wait_properties *prop);
-
 static inline enum hl_collective_mode
 get_collective_mode(struct hl_device *hdev, u32 queue_id)
 {
@@ -403,7 +506,11 @@ static inline void set_default_power_values(struct hl_device *hdev)
 
        if (hdev->card_type == cpucp_card_type_pmc) {
                prop->max_power_default = MAX_POWER_DEFAULT_PMC;
-               prop->dc_power_default = DC_POWER_DEFAULT_PMC;
+
+               if (prop->fw_security_enabled)
+                       prop->dc_power_default = DC_POWER_DEFAULT_PMC_SEC;
+               else
+                       prop->dc_power_default = DC_POWER_DEFAULT_PMC;
        } else {
                prop->max_power_default = MAX_POWER_DEFAULT_PCI;
                prop->dc_power_default = DC_POWER_DEFAULT_PCI;
@@ -450,6 +557,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
                                                get_collective_mode(hdev, i);
        }
 
+       prop->device_dma_offset_for_host_access = HOST_PHYS_BASE;
        prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
        prop->collective_first_sob = 0;
        prop->collective_first_mon = 0;
@@ -551,6 +659,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
        prop->hard_reset_done_by_fw = false;
        prop->gic_interrupts_enable = true;
 
+       prop->server_type = HL_SERVER_TYPE_UNKNOWN;
+
        return 0;
 }
 
@@ -723,14 +833,14 @@ pci_init:
                                        GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
        if (rc) {
                if (hdev->reset_on_preboot_fail)
-                       hdev->asic_funcs->hw_fini(hdev, true);
+                       hdev->asic_funcs->hw_fini(hdev, true, false);
                goto pci_fini;
        }
 
        if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
                dev_info(hdev->dev,
                        "H/W state is dirty, must reset before initializing\n");
-               hdev->asic_funcs->hw_fini(hdev, true);
+               hdev->asic_funcs->hw_fini(hdev, true, false);
        }
 
        return 0;
@@ -974,17 +1084,11 @@ static void gaudi_sob_group_hw_reset(struct kref *ref)
        struct gaudi_hw_sob_group *hw_sob_group =
                container_of(ref, struct gaudi_hw_sob_group, kref);
        struct hl_device *hdev = hw_sob_group->hdev;
-       u64 base_addr;
-       int rc;
+       int i;
 
-       base_addr = CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
-                       hw_sob_group->base_sob_id * 4;
-       rc = gaudi_schedule_register_memset(hdev, hw_sob_group->queue_id,
-                       base_addr, NUMBER_OF_SOBS_IN_GRP, 0);
-       if (rc)
-               dev_err(hdev->dev,
-                       "failed resetting sob group - sob base %u, count %u",
-                       hw_sob_group->base_sob_id, NUMBER_OF_SOBS_IN_GRP);
+       for (i = 0 ; i < NUMBER_OF_SOBS_IN_GRP ; i++)
+               WREG32((mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+                       (hw_sob_group->base_sob_id * 4) + (i * 4)), 0);
 
        kref_init(&hw_sob_group->kref);
 }
@@ -1121,6 +1225,20 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
        queue_id = job->hw_queue_id;
        prop = &hdev->kernel_queues[queue_id].sync_stream_prop;
 
+       if (job->cs->encaps_signals) {
+               /* use the encaps signal handle stored earlier in the flow
+                * and set the SOB information from the encaps
+                * signals handle
+                */
+               hl_hw_queue_encaps_sig_set_sob_info(hdev, job->cs, job,
+                                               cs_cmpl);
+
+               dev_dbg(hdev->dev, "collective wait: Sequence %llu found, sob_id: %u,  wait for sob_val: %u\n",
+                               job->cs->sequence,
+                               cs_cmpl->hw_sob->sob_id,
+                               cs_cmpl->sob_val);
+       }
+
        /* Add to wait CBs using slave monitor */
        wait_prop.data = (void *) job->user_cb;
        wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
@@ -1131,7 +1249,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
        wait_prop.size = cb_size;
 
        dev_dbg(hdev->dev,
-               "Generate slave wait CB, sob %d, val:0x%x, mon %d, q %d\n",
+               "Generate slave wait CB, sob %d, val:%x, mon %d, q %d\n",
                cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
                prop->collective_slave_mon_id, queue_id);
 
@@ -1145,7 +1263,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
                        prop->collective_sob_id, cb_size, false);
 }
 
-static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
+static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 {
        struct hl_cs_compl *signal_cs_cmpl =
                container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
@@ -1163,9 +1281,37 @@ static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
        gaudi = hdev->asic_specific;
        cprop = &gaudi->collective_props;
 
-       /* copy the SOB id and value of the signal CS */
-       cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
-       cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+       /* In encaps signals case the SOB info will be retrieved from
+        * the handle in gaudi_collective_slave_init_job.
+        */
+       if (!cs->encaps_signals) {
+               /* copy the SOB id and value of the signal CS */
+               cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+               cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+       }
+
+       /* check again if the signal cs already completed.
+        * if yes then don't send any wait cs since the hw_sob
+        * could be in reset already. if signal is not completed
+        * then get refcount to hw_sob to prevent resetting the sob
+        * while wait cs is not submitted.
+        * note that this check is protected by two locks,
+        * hw queue lock and completion object lock,
+        * and the same completion object lock also protects
+        * the hw_sob reset handler function.
+        * The hw_queue lock prevents out of sync of hw_sob
+        * refcount value, changed by signal/wait flows.
+        */
+       spin_lock(&signal_cs_cmpl->lock);
+
+       if (completion_done(&cs->signal_fence->completion)) {
+               spin_unlock(&signal_cs_cmpl->lock);
+               return -EINVAL;
+       }
+       /* Increment kref since all slave queues are now waiting on it */
+       kref_get(&cs_cmpl->hw_sob->kref);
+
+       spin_unlock(&signal_cs_cmpl->lock);
 
        /* Calculate the stream from collective master queue (1st job) */
        job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node);
@@ -1210,21 +1356,17 @@ static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
                                cprop->curr_sob_group_idx[stream], stream);
        }
 
-       /* Increment kref since all slave queues are now waiting on it */
-       kref_get(&cs_cmpl->hw_sob->kref);
-       /*
-        * Must put the signal fence after the SOB refcnt increment so
-        * the SOB refcnt won't turn 0 and reset the SOB before the
-        * wait CS was submitted.
-        */
        mb();
        hl_fence_put(cs->signal_fence);
        cs->signal_fence = NULL;
+
+       return 0;
 }
 
 static int gaudi_collective_wait_create_job(struct hl_device *hdev,
                struct hl_ctx *ctx, struct hl_cs *cs,
-               enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id)
+               enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id,
+               u32 encaps_signal_offset)
 {
        struct hw_queue_properties *hw_queue_prop;
        struct hl_cs_counters_atomic *cntr;
@@ -1284,6 +1426,13 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
        job->user_cb_size = cb_size;
        job->hw_queue_id = queue_id;
 
+       /* since it's guaranteed to have only one chunk in the collective wait
+        * cs, we can use this chunk to set the encapsulated signal offset
+        * in the jobs.
+        */
+       if (cs->encaps_signals)
+               job->encaps_sig_wait_offset = encaps_signal_offset;
+
        /*
         * No need in parsing, user CB is the patched CB.
         * We call hl_cb_destroy() out of two reasons - we don't need
@@ -1312,8 +1461,9 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
 }
 
 static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
-               struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-               u32 collective_engine_id)
+               struct hl_ctx *ctx, struct hl_cs *cs,
+               u32 wait_queue_id, u32 collective_engine_id,
+               u32 encaps_signal_offset)
 {
        struct gaudi_device *gaudi = hdev->asic_specific;
        struct hw_queue_properties *hw_queue_prop;
@@ -1363,7 +1513,8 @@ static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
                if (i == 0) {
                        queue_id = wait_queue_id;
                        rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
-                               HL_COLLECTIVE_MASTER, queue_id, wait_queue_id);
+                               HL_COLLECTIVE_MASTER, queue_id,
+                               wait_queue_id, encaps_signal_offset);
                } else {
                        if (nic_idx < NIC_NUMBER_OF_ENGINES) {
                                if (gaudi->hw_cap_initialized &
@@ -1383,7 +1534,8 @@ static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
                        }
 
                        rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
-                               HL_COLLECTIVE_SLAVE, queue_id, wait_queue_id);
+                               HL_COLLECTIVE_SLAVE, queue_id,
+                               wait_queue_id, encaps_signal_offset);
                }
 
                if (rc)
@@ -1431,6 +1583,11 @@ static int gaudi_late_init(struct hl_device *hdev)
                return rc;
        }
 
+       /* Scrub both SRAM and DRAM */
+       rc = hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
+       if (rc)
+               goto disable_pci_access;
+
        rc = gaudi_fetch_psoc_frequency(hdev);
        if (rc) {
                dev_err(hdev->dev, "Failed to fetch psoc frequency\n");
@@ -1455,6 +1612,11 @@ static int gaudi_late_init(struct hl_device *hdev)
                goto disable_pci_access;
        }
 
+       /* We only support a single ASID for the user, so for the sake of optimization, just
+        * initialize the ASID one time during device initialization with the fixed value of 1
+        */
+       gaudi_mmu_prepare(hdev, 1);
+
        return 0;
 
 disable_pci_access:
@@ -1720,8 +1882,12 @@ static int gaudi_sw_init(struct hl_device *hdev)
        hdev->supports_sync_stream = true;
        hdev->supports_coresight = true;
        hdev->supports_staged_submission = true;
+       hdev->supports_wait_for_multi_cs = true;
 
-       gaudi_set_pci_memory_regions(hdev);
+       hdev->asic_funcs->set_pci_memory_regions(hdev);
+       hdev->stream_master_qid_arr =
+                               hdev->asic_funcs->get_stream_master_qid_arr();
+       hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;
 
        return 0;
 
@@ -2523,7 +2689,7 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)
                                tpc_id < TPC_NUMBER_OF_ENGINES;
                                tpc_id++, tpc_offset += TPC_CFG_OFFSET) {
                /* Mask all arithmetic interrupts from TPC */
-               WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFF);
+               WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFE);
                /* Set 16 cache lines */
                WREG32_FIELD(TPC0_CFG_MSS_CONFIG, tpc_offset,
                                ICACHE_FETCH_LINE_NUM, 2);
@@ -3670,7 +3836,7 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
        WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
 }
 
-static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
+static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
        u32 wait_timeout_ms;
 
@@ -3682,6 +3848,9 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
        else
                wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;
 
+       if (fw_reset)
+               goto skip_engines;
+
        gaudi_stop_nic_qmans(hdev);
        gaudi_stop_mme_qmans(hdev);
        gaudi_stop_tpc_qmans(hdev);
@@ -3707,6 +3876,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
 
        gaudi_disable_timestamp(hdev);
 
+skip_engines:
        gaudi_disable_msi(hdev);
 }
 
@@ -3739,6 +3909,9 @@ static int gaudi_mmu_init(struct hl_device *hdev)
        WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8);
        WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40);
 
+       /* mem cache invalidation */
+       WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1);
+
        hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0);
 
        WREG32(mmMMU_UP_MMU_ENABLE, 1);
@@ -4071,7 +4244,7 @@ disable_queues:
        return rc;
 }
 
-static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
+static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
        struct cpu_dyn_regs *dyn_regs =
                        &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@@ -4092,6 +4265,14 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
                cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
        }
 
+       if (fw_reset) {
+               dev_info(hdev->dev,
+                       "Firmware performs HARD reset, going to wait %dms\n",
+                       reset_timeout_ms);
+
+               goto skip_reset;
+       }
+
        driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled &&
                                        !hdev->asic_prop.hard_reset_done_by_fw);
 
@@ -4168,6 +4349,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
                        reset_timeout_ms);
        }
 
+skip_reset:
        /*
         * After hard reset, we can't poll the BTM_FSM register because the PSOC
         * itself is in reset. Need to wait until the reset is deasserted
@@ -4212,7 +4394,7 @@ static int gaudi_resume(struct hl_device *hdev)
        return gaudi_init_iatu(hdev);
 }
 
-static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
                        void *cpu_addr, dma_addr_t dma_addr, size_t size)
 {
        int rc;
@@ -4621,8 +4803,8 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
                                "Doing HBM scrubbing for 0x%09llx - 0x%09llx\n",
                                cur_addr, cur_addr + chunk_size);
 
-                       WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0);
-                       WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0);
+                       WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0xdeadbeaf);
+                       WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0xdeadbeaf);
                        WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset,
                                                lower_32_bits(cur_addr));
                        WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset,
@@ -5796,78 +5978,6 @@ release_cb:
        return rc;
 }
 
-static int gaudi_schedule_register_memset(struct hl_device *hdev,
-               u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val)
-{
-       struct hl_ctx *ctx;
-       struct hl_pending_cb *pending_cb;
-       struct packet_msg_long *pkt;
-       u32 cb_size, ctl;
-       struct hl_cb *cb;
-       int i, rc;
-
-       mutex_lock(&hdev->fpriv_list_lock);
-       ctx = hdev->compute_ctx;
-
-       /* If no compute context available or context is going down
-        * memset registers directly
-        */
-       if (!ctx || kref_read(&ctx->refcount) == 0) {
-               rc = gaudi_memset_registers(hdev, reg_base, num_regs, val);
-               mutex_unlock(&hdev->fpriv_list_lock);
-               return rc;
-       }
-
-       mutex_unlock(&hdev->fpriv_list_lock);
-
-       cb_size = (sizeof(*pkt) * num_regs) +
-                       sizeof(struct packet_msg_prot) * 2;
-
-       if (cb_size > SZ_2M) {
-               dev_err(hdev->dev, "CB size must be smaller than %uMB", SZ_2M);
-               return -ENOMEM;
-       }
-
-       pending_cb = kzalloc(sizeof(*pending_cb), GFP_KERNEL);
-       if (!pending_cb)
-               return -ENOMEM;
-
-       cb = hl_cb_kernel_create(hdev, cb_size, false);
-       if (!cb) {
-               kfree(pending_cb);
-               return -EFAULT;
-       }
-
-       pkt = cb->kernel_address;
-
-       ctl = FIELD_PREP(GAUDI_PKT_LONG_CTL_OP_MASK, 0); /* write the value */
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_LONG);
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
-
-       for (i = 0; i < num_regs ; i++, pkt++) {
-               pkt->ctl = cpu_to_le32(ctl);
-               pkt->value = cpu_to_le32(val);
-               pkt->addr = cpu_to_le64(reg_base + (i * 4));
-       }
-
-       hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
-
-       pending_cb->cb = cb;
-       pending_cb->cb_size = cb_size;
-       /* The queue ID MUST be an external queue ID. Otherwise, we will
-        * have undefined behavior
-        */
-       pending_cb->hw_queue_id = hw_queue_id;
-
-       spin_lock(&ctx->pending_cb_lock);
-       list_add_tail(&pending_cb->cb_node, &ctx->pending_cb_list);
-       spin_unlock(&ctx->pending_cb_lock);
-
-       return 0;
-}
-
 static int gaudi_restore_sm_registers(struct hl_device *hdev)
 {
        u64 base_addr;
@@ -6013,7 +6123,7 @@ static int gaudi_restore_user_registers(struct hl_device *hdev)
 
 static int gaudi_context_switch(struct hl_device *hdev, u32 asid)
 {
-       return gaudi_restore_user_registers(hdev);
+       return 0; /* no register restore here any more; hdev and asid are unused */
 }
 
 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev)
@@ -6723,6 +6833,9 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
                                asid);
        }
 
+       gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid);
+       gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid);
+
        hdev->asic_funcs->set_clock_gating(hdev);
 
        mutex_unlock(&gaudi->clk_gate_mutex);
@@ -6772,7 +6885,8 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
 
        dma_offset = gaudi_dma_assignment[GAUDI_PCI_DMA_1] * DMA_CORE_OFFSET;
 
-       WREG32_OR(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_VAL_SHIFT));
+       WREG32(mmDMA0_CORE_PROT + dma_offset,
+                       BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT) | BIT(DMA0_CORE_PROT_VAL_SHIFT));
 
        rc = hl_hw_queue_send_cb_no_cmpl(hdev, GAUDI_QUEUE_ID_DMA_0_0,
                                        job->job_cb_size, cb->bus_address);
@@ -6793,8 +6907,7 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
        }
 
 free_fence_ptr:
-       WREG32_AND(mmDMA0_CORE_PROT + dma_offset,
-                       ~BIT(DMA0_CORE_PROT_VAL_SHIFT));
+       WREG32(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT));
 
        hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr,
                                        fence_dma_addr);
@@ -7168,7 +7281,7 @@ static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream
 
        cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
        size = RREG32(cq_tsize);
-       dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
+       dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n",
                                                        stream, cq_ptr, size);
 }
 
@@ -7224,7 +7337,7 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 
                addr = le64_to_cpu(bd->ptr);
 
-               dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
+               dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n",
                                                        stream, ci, addr, len);
 
                /* get previous ci, wrap if needed */
@@ -7326,24 +7439,30 @@ static void gaudi_print_sm_sei_info(struct hl_device *hdev, u16 event_type,
 {
        u32 index = event_type - GAUDI_EVENT_DMA_IF_SEI_0;
 
+       /* Flip the bits as the enum is ordered in the opposite way */
+       index = (index ^ 0x3) & 0x3;
+
        switch (sei_data->sei_cause) {
        case SM_SEI_SO_OVERFLOW:
-               dev_err(hdev->dev,
-                       "SM %u SEI Error: SO %u overflow/underflow",
-                       index, le32_to_cpu(sei_data->sei_log));
+               dev_err_ratelimited(hdev->dev,
+                       "%s SEI Error: SOB Group %u overflow/underflow",
+                       gaudi_sync_manager_names[index],
+                       le32_to_cpu(sei_data->sei_log));
                break;
        case SM_SEI_LBW_4B_UNALIGNED:
-               dev_err(hdev->dev,
-                       "SM %u SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
-                       index, le32_to_cpu(sei_data->sei_log));
+               dev_err_ratelimited(hdev->dev,
+                       "%s SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
+                       gaudi_sync_manager_names[index],
+                       le32_to_cpu(sei_data->sei_log));
                break;
        case SM_SEI_AXI_RESPONSE_ERR:
-               dev_err(hdev->dev,
-                       "SM %u SEI Error: AXI ID %u response error",
-                       index, le32_to_cpu(sei_data->sei_log));
+               dev_err_ratelimited(hdev->dev,
+                       "%s SEI Error: AXI ID %u response error",
+                       gaudi_sync_manager_names[index],
+                       le32_to_cpu(sei_data->sei_log));
                break;
        default:
-               dev_err(hdev->dev, "Unknown SM SEI cause %u",
+               dev_err_ratelimited(hdev->dev, "Unknown SM SEI cause %u",
                                le32_to_cpu(sei_data->sei_log));
                break;
        }
@@ -7358,6 +7477,11 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
        bool extract_info_from_fw;
        int rc;
 
+       if (hdev->asic_prop.fw_security_enabled) {
+               extract_info_from_fw = true;
+               goto extract_ecc_info;
+       }
+
        switch (event_type) {
        case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR:
        case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR:
@@ -7430,6 +7554,7 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
                return;
        }
 
+extract_ecc_info:
        if (extract_info_from_fw) {
                ecc_address = le64_to_cpu(ecc_data->ecc_address);
                ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
@@ -7806,8 +7931,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
        u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
                        >> EQ_CTL_EVENT_TYPE_SHIFT);
-       u8 cause;
        bool reset_required;
+       u8 cause;
+       int rc;
+
+       if (event_type >= GAUDI_EVENT_SIZE) {
+               dev_err(hdev->dev, "Event type %u exceeds maximum of %u",
+                               event_type, GAUDI_EVENT_SIZE - 1);
+               return;
+       }
 
        gaudi->events_stat[event_type]++;
        gaudi->events_stat_aggregate[event_type]++;
@@ -7880,10 +8012,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                                        tpc_dec_event_to_tpc_id(event_type),
                                        "AXI_SLV_DEC_Error");
                if (reset_required) {
-                       dev_err(hdev->dev, "hard reset required due to %s\n",
+                       dev_err(hdev->dev, "reset required due to %s\n",
                                gaudi_irq_map_table[event_type].name);
 
-                       goto reset_device;
+                       hl_device_reset(hdev, 0);
                } else {
                        hl_fw_unmask_irq(hdev, event_type);
                }
@@ -7902,10 +8034,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                                        tpc_krn_event_to_tpc_id(event_type),
                                        "KRN_ERR");
                if (reset_required) {
-                       dev_err(hdev->dev, "hard reset required due to %s\n",
+                       dev_err(hdev->dev, "reset required due to %s\n",
                                gaudi_irq_map_table[event_type].name);
 
-                       goto reset_device;
+                       hl_device_reset(hdev, 0);
                } else {
                        hl_fw_unmask_irq(hdev, event_type);
                }
@@ -7993,6 +8125,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                gaudi_print_irq_info(hdev, event_type, false);
                gaudi_print_sm_sei_info(hdev, event_type,
                                        &eq_entry->sm_sei_data);
+               rc = hl_state_dump(hdev);
+               if (rc)
+                       dev_err(hdev->dev,
+                               "Error during system state dump %d\n", rc);
                hl_fw_unmask_irq(hdev, event_type);
                break;
 
@@ -8031,7 +8167,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        return;
 
 reset_device:
-       if (hdev->hard_reset_on_fw_events)
+       if (hdev->asic_prop.fw_security_enabled)
+               hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
+       else if (hdev->hard_reset_on_fw_events)
                hl_device_reset(hdev, HL_RESET_HARD);
        else
                hl_fw_unmask_irq(hdev, event_type);
@@ -8563,11 +8701,20 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev,
 
+/*
+ * gaudi_ctx_init() - Gaudi-specific initialization of a context.
+ * @ctx: the context being initialized.
+ *
+ * A context whose ASID is HL_KERNEL_ASID_ID needs no work here. For any other
+ * context the internal CB pool is initialized first and the user registers
+ * are restored afterwards; if the restore fails, the pool is torn down again.
+ *
+ * Return: 0 on success, otherwise the error code of the step that failed.
+ */
 static int gaudi_ctx_init(struct hl_ctx *ctx)
 {
+       int rc;
+
        if (ctx->asid == HL_KERNEL_ASID_ID)
                return 0;
 
-       gaudi_mmu_prepare(ctx->hdev, ctx->asid);
-       return gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+       rc = gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+       if (rc)
+               return rc;
+
+       rc = gaudi_restore_user_registers(ctx->hdev);
+       /* Roll back the pool init when the register restore fails */
+       if (rc)
+               gaudi_internal_cb_pool_fini(ctx->hdev, ctx);
+
+       return rc;
 }
 
 static void gaudi_ctx_fini(struct hl_ctx *ctx)
@@ -8596,6 +8743,11 @@ static u32 gaudi_get_wait_cb_size(struct hl_device *hdev)
                        sizeof(struct packet_msg_prot) * 2;
 }
 
+/*
+ * gaudi_get_sob_addr() - register offset of a sync object.
+ * @hdev: habanalabs device structure (not used by this implementation).
+ * @sob_id: index of the sync object.
+ *
+ * Return: offset of SOB_OBJ_0 plus a 4-byte stride per sync object index.
+ */
+static u32 gaudi_get_sob_addr(struct hl_device *hdev, u32 sob_id)
+{
+       u32 sob_offset = sob_id * 4;
+
+       return mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset;
+}
+
 static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
                                u32 size, bool eb)
 {
@@ -8902,16 +9054,12 @@ static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
+/*
+ * gaudi_reset_sob() - reset a hardware sync object.
+ * @hdev: habanalabs device structure.
+ * @data: pointer to the struct hl_hw_sob to reset, passed as void *.
+ *
+ * Writes 0 to the sync object's register and re-initializes its kref.
+ */
 static void gaudi_reset_sob(struct hl_device *hdev, void *data)
 {
        struct hl_hw_sob *hw_sob = (struct hl_hw_sob *) data;
-       int rc;
 
        dev_dbg(hdev->dev, "reset SOB, q_idx: %d, sob_id: %d\n", hw_sob->q_idx,
                hw_sob->sob_id);
 
-       rc = gaudi_schedule_register_memset(hdev, hw_sob->q_idx,
-                       CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
-                       hw_sob->sob_id * 4, 1, 0);
-       if (rc)
-               dev_err(hdev->dev, "failed resetting sob %u", hw_sob->sob_id);
+       /* SOB registers sit at a 4-byte stride starting from SOB_OBJ_0 */
+       WREG32(mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+                       hw_sob->sob_id * 4, 0);
 
        kref_init(&hw_sob->kref);
 }
@@ -8977,6 +9125,280 @@ static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
        }
 }
 
+/*
+ * gaudi_add_sync_to_engine_map_entry() - add one engine to the sync map.
+ * @map: sync-to-engine hash map that receives the new entry.
+ * @reg_value: value read from the engine's sync object register.
+ * @engine_type: type of the engine the value was read from.
+ * @engine_id: index of that engine.
+ *
+ * Register values of 0 and 0xffffffff are skipped without adding anything.
+ * For any other value, CFG_BASE truncated to 32 bits is subtracted and the
+ * result is used both as the entry's sync_id and as its hash key.
+ *
+ * Return: 0 on success or when the value was skipped, -ENOMEM when the entry
+ * could not be allocated.
+ */
+static int gaudi_add_sync_to_engine_map_entry(
+       struct hl_sync_to_engine_map *map, u32 reg_value,
+       enum hl_sync_engine_type engine_type, u32 engine_id)
+{
+       struct hl_sync_to_engine_map_entry *new_entry;
+       u32 sync_id;
+
+       /* These register values carry no sync object to map */
+       switch (reg_value) {
+       case 0:
+       case 0xffffffff:
+               return 0;
+       default:
+               break;
+       }
+
+       /* The register holds a partial address of the sync object. Dropping
+        * the cfg base bits turns it into the unique identifier we key on.
+        */
+       sync_id = reg_value - (u32)CFG_BASE;
+
+       new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
+       if (!new_entry)
+               return -ENOMEM;
+
+       new_entry->sync_id = sync_id;
+       new_entry->engine_id = engine_id;
+       new_entry->engine_type = engine_type;
+       hash_add(map->tb, &new_entry->node, sync_id);
+
+       return 0;
+}
+
+/*
+ * gaudi_gen_sync_to_engine_map() - build the sync-object-to-engine map.
+ * @hdev: habanalabs device structure.
+ * @map: hash map to populate.
+ *
+ * Reads one sync object register per TPC engine, per MME sub-engine and per
+ * DMA engine (counts and register offsets come from the state dump props)
+ * and adds an entry for each value via gaudi_add_sync_to_engine_map_entry().
+ * The TPC and MME reads are done under clk_gate_mutex with clock gating
+ * disabled; the DMA reads are done without it.
+ *
+ * Return: 0 on success. On failure the map is released with
+ * hl_state_dump_free_sync_to_engine_map() and the error code is returned.
+ */
+static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev,
+                               struct hl_sync_to_engine_map *map)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct gaudi_device *gaudi = hdev->asic_specific;
+       int i, j, rc;
+       u32 reg_value;
+
+       /* Iterate over TPC engines */
+       for (i = 0; i < sds->props[SP_NUM_OF_TPC_ENGINES]; ++i) {
+               /* TPC registers must be accessed with clock gating disabled */
+               mutex_lock(&gaudi->clk_gate_mutex);
+               hdev->asic_funcs->disable_clock_gating(hdev);
+
+               reg_value = RREG32(sds->props[SP_TPC0_CFG_SO] +
+                                       sds->props[SP_NEXT_TPC] * i);
+
+               /* We can re-enable clock gating */
+               hdev->asic_funcs->set_clock_gating(hdev);
+               mutex_unlock(&gaudi->clk_gate_mutex);
+
+               rc = gaudi_add_sync_to_engine_map_entry(map, reg_value,
+                                                       ENGINE_TPC, i);
+               if (rc)
+                       goto free_sync_to_engine_map;
+       }
+
+       /* Iterate over MME engines */
+       for (i = 0; i < sds->props[SP_NUM_OF_MME_ENGINES]; ++i) {
+               for (j = 0; j < sds->props[SP_SUB_MME_ENG_NUM]; ++j) {
+                       /* MME registers must be accessed with clock gating
+                        * disabled
+                        */
+                       mutex_lock(&gaudi->clk_gate_mutex);
+                       hdev->asic_funcs->disable_clock_gating(hdev);
+
+                       reg_value = RREG32(sds->props[SP_MME_CFG_SO] +
+                                               sds->props[SP_NEXT_MME] * i +
+                                               j * sizeof(u32));
+
+                       /* We can re-enable clock gating */
+                       hdev->asic_funcs->set_clock_gating(hdev);
+                       mutex_unlock(&gaudi->clk_gate_mutex);
+
+                       /* Sub-engines get a flat id: MME index times the
+                        * number of sub-engines, plus the sub-engine index
+                        */
+                       rc = gaudi_add_sync_to_engine_map_entry(
+                               map, reg_value, ENGINE_MME,
+                               i * sds->props[SP_SUB_MME_ENG_NUM] + j);
+                       if (rc)
+                               goto free_sync_to_engine_map;
+               }
+       }
+
+       /* Iterate over DMA engines */
+       for (i = 0; i < sds->props[SP_NUM_OF_DMA_ENGINES]; ++i) {
+               reg_value = RREG32(sds->props[SP_DMA_CFG_SO] +
+                                       sds->props[SP_DMA_QUEUES_OFFSET] * i);
+               rc = gaudi_add_sync_to_engine_map_entry(map, reg_value,
+                                                       ENGINE_DMA, i);
+               if (rc)
+                       goto free_sync_to_engine_map;
+       }
+
+       return 0;
+
+free_sync_to_engine_map:
+       /* Drop whatever was added before the failure */
+       hl_state_dump_free_sync_to_engine_map(map);
+
+       return rc;
+}
+
+/*
+ * gaudi_monitor_valid() - extract the VALID field of a monitor's status.
+ * @mon: monitor state captured for the dump.
+ *
+ * Return: the value of the VALID field in @mon->status.
+ */
+static int gaudi_monitor_valid(struct hl_mon_state_dump *mon)
+{
+       const int valid = FIELD_GET(
+               SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK,
+               mon->status);
+
+       return valid;
+}
+
+/*
+ * gaudi_fill_sobs_from_mon() - list the sync objects a monitor is armed on.
+ * @sobs: output buffer of MONITOR_SOB_STRING_SIZE bytes, expected to be
+ *        zero-initialized by the caller (nothing is written when no sync
+ *        object is monitored).
+ * @mon: monitor state captured for the dump.
+ *
+ * Writes a ", "-separated list of sync object ids into @sobs. A sync object
+ * is monitored when its bit in the monitor mask is cleared, and its id is
+ * (MONITOR_MAX_SOBS * group_id + bit index).
+ */
+static void gaudi_fill_sobs_from_mon(char *sobs, struct hl_mon_state_dump *mon)
+{
+       const size_t max_write = 10;
+       u32 gid, mask, sob;
+       size_t offset = 0;
+       int i;
+
+       gid = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
+                       mon->arm_data);
+       mask = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK,
+                       mon->arm_data);
+
+       /* Walk every bit of the group rather than stopping once the remaining
+        * mask is zero: cleared bits above the highest set bit (and an
+        * all-zero mask) denote monitored sync objects too.
+        */
+       for (i = 0; i < MONITOR_MAX_SOBS &&
+                       offset < MONITOR_SOB_STRING_SIZE - max_write; i++) {
+               if ((mask >> i) & 1)
+                       continue;
+
+               sob = gid * MONITOR_MAX_SOBS + i;
+
+               /* scnprintf() returns what was actually written, so offset
+                * can never run past the end of the buffer.
+                */
+               if (offset > 0)
+                       offset += scnprintf(sobs + offset,
+                                       MONITOR_SOB_STRING_SIZE - offset,
+                                       ", ");
+
+               offset += scnprintf(sobs + offset,
+                               MONITOR_SOB_STRING_SIZE - offset, "%u", sob);
+       }
+}
+
+/*
+ * gaudi_print_single_monitor() - append one monitor's description to the dump.
+ * @buf: destination buffer, handed to hl_snprintf_resize().
+ * @size: size of @buf, handed to hl_snprintf_resize().
+ * @offset: current write offset in @buf, handed to hl_snprintf_resize().
+ * @hdev: habanalabs device structure.
+ * @mon: monitor state captured for the dump.
+ *
+ * The line holds the monitor id and name (empty when no name is found), the
+ * group id, mask, target value, write data and address, the pending field,
+ * and the list of monitored sync objects.
+ *
+ * Return: whatever hl_snprintf_resize() returns.
+ */
+static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
+                               struct hl_device *hdev,
+                               struct hl_mon_state_dump *mon)
+{
+       char monitored_sobs[MONITOR_SOB_STRING_SIZE] = {0};
+       char mask_bin[BIN_REG_STRING_SIZE];
+       char pending_bin[BIN_REG_STRING_SIZE];
+       const char *mon_name, *mask_str, *pending_str;
+       u64 wr_addr;
+
+       mon_name = hl_state_dump_get_monitor_name(hdev, mon);
+       if (!mon_name)
+               mon_name = "";
+
+       gaudi_fill_sobs_from_mon(monitored_sobs, mon);
+
+       /* Mask and pending fields are printed in binary form */
+       mask_str = hl_format_as_binary(
+               mask_bin, sizeof(mask_bin),
+               FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK,
+                               mon->arm_data));
+       pending_str = hl_format_as_binary(
+               pending_bin, sizeof(pending_bin),
+               FIELD_GET(
+                       SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK,
+                       mon->status));
+
+       /* The write address is kept as two 32-bit halves */
+       wr_addr = (((u64)mon->wr_addr_high) << 32) | mon->wr_addr_low;
+
+       return hl_snprintf_resize(
+               buf, size, offset,
+               "Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s. Means sync objects [%s] are being monitored.",
+               mon->id, mon_name,
+               FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
+                               mon->arm_data),
+               mask_str,
+               FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK,
+                               mon->arm_data),
+               mon->wr_data,
+               wr_addr,
+               pending_str,
+               monitored_sobs);
+}
+
+
+/*
+ * gaudi_print_fences_single_engine() - dump in-progress fences of one engine.
+ * @hdev: habanalabs device structure.
+ * @base_offset: register offset the fence counters are read from.
+ * @status_base_offset: register offset the per-queue CP status is read from.
+ * @engine_type: type of the engine, used for the printed name.
+ * @engine_id: index of the engine, used for the printed name.
+ * @buf: destination buffer, handed to hl_snprintf_resize().
+ * @size: size of @buf, handed to hl_snprintf_resize().
+ * @offset: current write offset in @buf, handed to hl_snprintf_resize().
+ *
+ * Reads one status register per queue and one counter per (fence, queue)
+ * pair, then prints a line for every queue whose status reports a fence in
+ * progress.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure, or the error returned
+ * by hl_snprintf_resize().
+ */
+static int gaudi_print_fences_single_engine(
+       struct hl_device *hdev, u64 base_offset, u64 status_base_offset,
+       enum hl_sync_engine_type engine_type, u32 engine_id, char **buf,
+       size_t *size, size_t *offset)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       u32 num_queues = sds->props[SP_ENGINE_NUM_OF_QUEUES];
+       u32 num_fences = sds->props[SP_ENGINE_NUM_OF_FENCES];
+       u32 *statuses, *fences;
+       int rc = -ENOMEM;
+       u32 i;
+
+       statuses = kcalloc(num_queues, sizeof(*statuses), GFP_KERNEL);
+       if (!statuses)
+               goto out;
+
+       fences = kcalloc(num_fences * num_queues, sizeof(*fences), GFP_KERNEL);
+       if (!fences)
+               goto free_status;
+
+       /* There is one status register per queue, so the read loop must cover
+        * exactly the entries that were allocated and are inspected below.
+        */
+       for (i = 0; i < num_queues; ++i)
+               statuses[i] = RREG32(status_base_offset + i * sizeof(u32));
+
+       for (i = 0; i < num_fences * num_queues; ++i)
+               fences[i] = RREG32(base_offset + i * sizeof(u32));
+
+       /* The actual print */
+       for (i = 0; i < num_queues; ++i) {
+               u32 fence_id, fence_idx;
+               u64 fence_cnt, fence_rdata;
+               const char *engine_name;
+
+               if (!FIELD_GET(TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK,
+                       statuses[i]))
+                       continue;
+
+               fence_id =
+                       FIELD_GET(TPC0_QM_CP_STS_0_FENCE_ID_MASK, statuses[i]);
+
+               /* Never index past the counters that were read */
+               if (fence_id >= num_fences)
+                       continue;
+
+               /* Counters are laid out fence-major: all queues of fence 0,
+                * then all queues of fence 1, and so on.
+                */
+               fence_idx = i + fence_id * num_queues;
+               fence_cnt = base_offset + CFG_BASE + sizeof(u32) * fence_idx;
+               fence_rdata = fence_cnt - sds->props[SP_FENCE0_CNT_OFFSET] +
+                               sds->props[SP_FENCE0_RDATA_OFFSET];
+               engine_name = hl_sync_engine_to_string(engine_type);
+
+               /* The value printed is the counter that lives at fence_cnt,
+                * i.e. the same index the address was derived from.
+                */
+               rc = hl_snprintf_resize(
+                       buf, size, offset,
+                       "%s%u, stream %u: fence id %u cnt = 0x%llx (%s%u_QM.CP_FENCE%u_CNT_%u) rdata = 0x%llx (%s%u_QM.CP_FENCE%u_RDATA_%u) value = %u, cp_status = %u\n",
+                       engine_name, engine_id,
+                       i, fence_id,
+                       fence_cnt, engine_name, engine_id, fence_id, i,
+                       fence_rdata, engine_name, engine_id, fence_id, i,
+                       fences[fence_idx],
+                       statuses[i]);
+               if (rc)
+                       goto free_fences;
+       }
+
+       rc = 0;
+
+free_fences:
+       kfree(fences);
+free_status:
+       kfree(statuses);
+out:
+       return rc;
+}
+
+
+/* Gaudi implementations of the ASIC-specific state dump callbacks */
+static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = {
+       .gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map,
+       .print_fences_single_engine = gaudi_print_fences_single_engine,
+       .print_single_monitor = gaudi_print_single_monitor,
+       .monitor_valid = gaudi_monitor_valid,
+};
+
+/*
+ * gaudi_state_dump_init() - set up the device's state dump specs for Gaudi.
+ * @hdev: habanalabs device structure.
+ *
+ * Installs the Gaudi props table, sync manager names and callbacks, and
+ * hashes the sync object and monitor name tables by id.
+ */
+static void gaudi_state_dump_init(struct hl_device *hdev)
+{
+       struct hl_state_dump_specs *specs = &hdev->state_dump_specs;
+       int idx;
+
+       specs->funcs = gaudi_state_dump_funcs;
+       specs->sync_namager_names = gaudi_sync_manager_names;
+       specs->props = gaudi_state_dump_specs_props;
+
+       /* Monitor id -> name lookup */
+       for (idx = 0; idx < ARRAY_SIZE(gaudi_monitor_id_to_str); ++idx)
+               hash_add(specs->monitor_id_to_str_tb,
+                       &gaudi_monitor_id_to_str[idx].node,
+                       gaudi_monitor_id_to_str[idx].id);
+
+       /* Sync object id -> name lookup */
+       for (idx = 0; idx < ARRAY_SIZE(gaudi_so_id_to_str); ++idx)
+               hash_add(specs->so_id_to_str_tb,
+                       &gaudi_so_id_to_str[idx].node,
+                       gaudi_so_id_to_str[idx].id);
+}
+
+/*
+ * gaudi_get_stream_master_qid_arr() - get the stream master queue id table.
+ *
+ * Return: pointer to the first element of gaudi_stream_master.
+ */
+static u32 *gaudi_get_stream_master_qid_arr(void)
+{
+       return &gaudi_stream_master[0];
+}
+
 static const struct hl_asic_funcs gaudi_funcs = {
        .early_init = gaudi_early_init,
        .early_fini = gaudi_early_fini,
@@ -8989,7 +9411,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .halt_engines = gaudi_halt_engines,
        .suspend = gaudi_suspend,
        .resume = gaudi_resume,
-       .cb_mmap = gaudi_cb_mmap,
+       .mmap = gaudi_mmap,
        .ring_doorbell = gaudi_ring_doorbell,
        .pqe_write = gaudi_pqe_write,
        .asic_dma_alloc_coherent = gaudi_dma_alloc_coherent,
@@ -9062,7 +9484,11 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .enable_events_from_fw = gaudi_enable_events_from_fw,
        .map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
        .init_firmware_loader = gaudi_init_firmware_loader,
-       .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm
+       .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
+       .state_dump_init = gaudi_state_dump_init,
+       .get_sob_addr = gaudi_get_sob_addr,
+       .set_pci_memory_regions = gaudi_set_pci_memory_regions,
+       .get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr
 };
 
 /**