// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2016-2022 HabanaLabs, Ltd.
* All Rights Reserved.
*/
#include "gaudiP.h"
#include "../include/hw_ip/mmu/mmu_general.h"
#include "../include/hw_ip/mmu/mmu_v1_1.h"
#include "../include/gaudi/gaudi_masks.h"
#include "../include/gaudi/gaudi_fw_if.h"
#include "../include/gaudi/gaudi_reg_map.h"
#include "../include/gaudi/gaudi_async_ids_map_extended.h"
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/firmware.h>
#include <linux/hwmon.h>
#include <linux/iommu.h>
#include <linux/seq_file.h>
/*
* Gaudi security scheme:
*
* 1. Host is protected by:
* - Range registers
* - MMU
*
* 2. DDR is protected by:
* - Range registers (protect the first 512MB)
*
* 3. Configuration is protected by:
* - Range registers
* - Protection bits
*
* MMU is always enabled.
*
* QMAN DMA channels 0,1 (PCI DMA):
* - DMA is not secured.
* - PQ and CQ are secured.
* - CP is secured: The driver needs to parse the CB, but WREG must be allowed
* because of TDMA (tensor DMA). Hence, WREG is never secured.
*
* When the driver needs to use DMA it will check that Gaudi is idle, set DMA
* channel 0 to be secured, execute the DMA and change it back to not secured.
* Currently, the driver doesn't use the DMA while there are compute jobs
* running.
*
* The current use cases for the driver to use the DMA are:
* - Clear SRAM on context switch (happens on context switch when device is
* idle)
* - MMU page tables area clear (happens on init)
*
* QMAN DMA 2-7, TPC, MME, NIC:
* PQ is secured and is located on the Host (HBM CON TPC3 bug)
* CQ, CP and the engine are not secured
*
*/
/* Firmware image paths; each one is declared with MODULE_FIRMWARE() below */
#define GAUDI_BOOT_FIT_FILE "habanalabs/gaudi/gaudi-boot-fit.itb"
#define GAUDI_LINUX_FW_FILE "habanalabs/gaudi/gaudi-fit.itb"
#define GAUDI_TPC_FW_FILE "habanalabs/gaudi/gaudi_tpc.bin"
MODULE_FIRMWARE(GAUDI_BOOT_FIT_FILE);
MODULE_FIRMWARE(GAUDI_LINUX_FW_FILE);
MODULE_FIRMWARE(GAUDI_TPC_FW_FILE);
#define GAUDI_DMA_POOL_BLK_SIZE 0x100 /* 256 bytes */
/* Reset / queue-test timeouts and wait intervals */
#define GAUDI_RESET_TIMEOUT_MSEC 2000 /* 2000ms */
#define GAUDI_RESET_WAIT_MSEC 1 /* 1ms */
#define GAUDI_CPU_RESET_WAIT_MSEC 200 /* 200ms */
#define GAUDI_TEST_QUEUE_WAIT_USEC 100000 /* 100ms */
/*
 * PLDM variants of the timeouts (all larger than the regular values).
 * NOTE(review): presumably selected when hdev->pldm is set - confirm against
 * the users of these macros.
 */
#define GAUDI_PLDM_RESET_WAIT_MSEC 1000 /* 1s */
#define GAUDI_PLDM_HRESET_TIMEOUT_MSEC 20000 /* 20s */
#define GAUDI_PLDM_TEST_QUEUE_WAIT_USEC 1000000 /* 1s */
#define GAUDI_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100)
#define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
/* Firmware communication timeouts */
#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 4000000 /* 4s */
#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */
#define GAUDI_WAIT_FOR_BL_TIMEOUT_USEC 15000000 /* 15s */
#define GAUDI_QMAN0_FENCE_VAL 0x72E91AB9
#define GAUDI_MAX_STRING_LEN 20
/* Command buffer pool dimensions (copied into the ASIC properties) */
#define GAUDI_CB_POOL_CB_CNT 512
#define GAUDI_CB_POOL_CB_SIZE 0x20000 /* 128KB */
#define GAUDI_ALLOC_CPU_MEM_RETRY_CNT 3
/* Number of entries in the corresponding *_cause string tables below */
#define GAUDI_NUM_OF_TPC_INTR_CAUSE 20
#define GAUDI_NUM_OF_QM_ERR_CAUSE 16
#define GAUDI_NUM_OF_QM_ARB_ERR_CAUSE 3
#define GAUDI_ARB_WDT_TIMEOUT 0xEE6b27FF /* 8 seconds */
#define HBM_SCRUBBING_TIMEOUT_US 1000000 /* 1s */
/* Room for "0b", 32 binary digits and the terminating NUL */
#define BIN_REG_STRING_SIZE sizeof ("0b10101010101010101010101010101010" )
#define MONITOR_SOB_STRING_SIZE 256
/*
 * Queue IDs listed as stream masters: the four streams of DMA 0 followed by
 * the four streams of DMA 1.
 */
static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = {
	[0] = GAUDI_QUEUE_ID_DMA_0_0,
	[1] = GAUDI_QUEUE_ID_DMA_0_1,
	[2] = GAUDI_QUEUE_ID_DMA_0_2,
	[3] = GAUDI_QUEUE_ID_DMA_0_3,
	[4] = GAUDI_QUEUE_ID_DMA_1_0,
	[5] = GAUDI_QUEUE_ID_DMA_1_1,
	[6] = GAUDI_QUEUE_ID_DMA_1_2,
	[7] = GAUDI_QUEUE_ID_DMA_1_3,
};
/*
 * Maps each DMA channel (GAUDI_PCI_DMA_x / GAUDI_HBM_DMA_x) to a DMA engine
 * ID: the two PCI channels use engines 0-1, the six HBM channels use
 * engines 2-7.
 */
static const u8 gaudi_dma_assignment[GAUDI_DMA_MAX] = {
[GAUDI_PCI_DMA_1] = GAUDI_ENGINE_ID_DMA_0,
[GAUDI_PCI_DMA_2] = GAUDI_ENGINE_ID_DMA_1,
[GAUDI_HBM_DMA_1] = GAUDI_ENGINE_ID_DMA_2,
[GAUDI_HBM_DMA_2] = GAUDI_ENGINE_ID_DMA_3,
[GAUDI_HBM_DMA_3] = GAUDI_ENGINE_ID_DMA_4,
[GAUDI_HBM_DMA_4] = GAUDI_ENGINE_ID_DMA_5,
[GAUDI_HBM_DMA_5] = GAUDI_ENGINE_ID_DMA_6,
[GAUDI_HBM_DMA_6] = GAUDI_ENGINE_ID_DMA_7
};
/*
 * Queue ID associated with each completion queue index: completion queues
 * 0-3 map to the DMA 0 streams and 4-7 to the DMA 1 streams.
 */
static const u8 gaudi_cq_assignment[NUMBER_OF_CMPLT_QUEUES] = {
[0] = GAUDI_QUEUE_ID_DMA_0_0,
[1] = GAUDI_QUEUE_ID_DMA_0_1,
[2] = GAUDI_QUEUE_ID_DMA_0_2,
[3] = GAUDI_QUEUE_ID_DMA_0_3,
[4] = GAUDI_QUEUE_ID_DMA_1_0,
[5] = GAUDI_QUEUE_ID_DMA_1_1,
[6] = GAUDI_QUEUE_ID_DMA_1_2,
[7] = GAUDI_QUEUE_ID_DMA_1_3,
};
/*
 * Size in bytes of each packet type, indexed by packet ID. Packet IDs that
 * are not listed here are left at 0 by the designated initializer.
 */
static const u16 gaudi_packet_sizes[MAX_PACKET_ID] = {
[PACKET_WREG_32] = sizeof (struct packet_wreg32),
[PACKET_WREG_BULK] = sizeof (struct packet_wreg_bulk),
[PACKET_MSG_LONG] = sizeof (struct packet_msg_long),
[PACKET_MSG_SHORT] = sizeof (struct packet_msg_short),
[PACKET_CP_DMA] = sizeof (struct packet_cp_dma),
[PACKET_REPEAT] = sizeof (struct packet_repeat),
[PACKET_MSG_PROT] = sizeof (struct packet_msg_prot),
[PACKET_FENCE] = sizeof (struct packet_fence),
[PACKET_LIN_DMA] = sizeof (struct packet_lin_dma),
[PACKET_NOP] = sizeof (struct packet_nop),
[PACKET_STOP] = sizeof (struct packet_stop),
[PACKET_ARB_POINT] = sizeof (struct packet_arb_point),
[PACKET_WAIT] = sizeof (struct packet_wait),
[PACKET_LOAD_AND_EXE] = sizeof (struct packet_load_and_exe)
};
/*
 * validate_packet_id() - check whether a packet ID is a known packet type.
 * @id: packet ID to check.
 *
 * Return: true if @id is one of the packet IDs listed in the switch below,
 * false for any other value.
 */
static inline bool validate_packet_id(enum packet_id id)
{
	bool known = false;

	switch (id) {
	case PACKET_WREG_32:
	case PACKET_WREG_BULK:
	case PACKET_MSG_LONG:
	case PACKET_MSG_SHORT:
	case PACKET_CP_DMA:
	case PACKET_REPEAT:
	case PACKET_MSG_PROT:
	case PACKET_FENCE:
	case PACKET_LIN_DMA:
	case PACKET_NOP:
	case PACKET_STOP:
	case PACKET_ARB_POINT:
	case PACKET_WAIT:
	case PACKET_LOAD_AND_EXE:
		known = true;
		break;
	default:
		break;
	}

	return known;
}
/*
 * Names of the TPC interrupt causes.
 * NOTE(review): presumably indexed by the cause bit position - confirm
 * against the code that consumes this table.
 */
static const char * const
gaudi_tpc_interrupts_cause[GAUDI_NUM_OF_TPC_INTR_CAUSE] = {
	[0] = "tpc_address_exceed_slm",
	[1] = "tpc_div_by_0",
	[2] = "tpc_spu_mac_overflow",
	[3] = "tpc_spu_addsub_overflow",
	[4] = "tpc_spu_abs_overflow",
	[5] = "tpc_spu_fp_dst_nan_inf",
	[6] = "tpc_spu_fp_dst_denorm",
	[7] = "tpc_vpu_mac_overflow",
	[8] = "tpc_vpu_addsub_overflow",
	[9] = "tpc_vpu_abs_overflow",
	[10] = "tpc_vpu_fp_dst_nan_inf",
	[11] = "tpc_vpu_fp_dst_denorm",
	[12] = "tpc_assertions",
	[13] = "tpc_illegal_instruction",
	[14] = "tpc_pc_wrap_around",
	[15] = "tpc_qm_sw_err",
	[16] = "tpc_hbw_rresp_err",
	[17] = "tpc_hbw_bresp_err",
	[18] = "tpc_lbw_rresp_err",
	[19] = "tpc_lbw_bresp_err",
};
/*
 * Descriptions of the QMAN error causes.
 * NOTE(review): presumably indexed by the error cause bit position - confirm
 * against the code that consumes this table.
 */
static const char * const
gaudi_qman_error_cause[GAUDI_NUM_OF_QM_ERR_CAUSE] = {
	[0] = "PQ AXI HBW error",
	[1] = "CQ AXI HBW error",
	[2] = "CP AXI HBW error",
	[3] = "CP error due to undefined OPCODE",
	[4] = "CP encountered STOP OPCODE",
	[5] = "CP AXI LBW error",
	[6] = "CP WRREG32 or WRBULK returned error",
	[7] = "N/A",
	[8] = "FENCE 0 inc over max value and clipped",
	[9] = "FENCE 1 inc over max value and clipped",
	[10] = "FENCE 2 inc over max value and clipped",
	[11] = "FENCE 3 inc over max value and clipped",
	[12] = "FENCE 0 dec under min value and clipped",
	[13] = "FENCE 1 dec under min value and clipped",
	[14] = "FENCE 2 dec under min value and clipped",
	[15] = "FENCE 3 dec under min value and clipped",
};
/*
 * Descriptions of the QMAN arbiter error causes.
 * NOTE(review): presumably indexed by the error cause bit position - confirm
 * against the code that consumes this table.
 */
static const char * const
gaudi_qman_arb_error_cause[GAUDI_NUM_OF_QM_ARB_ERR_CAUSE] = {
	[0] = "Choice push while full error",
	[1] = "Choice Q watchdog error",
	[2] = "MSG AXI LBW returned with error",
};
/*
 * Queue type of every H/W queue. The table is positional; the comment on each
 * entry names the queue ID that the entry is meant to correspond to.
 * The eight DMA 0/1 stream queues are external, the CPU PQ is a CPU queue and
 * all remaining queues (DMA 2-7, MME, TPC, NIC) are internal.
 */
static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_0 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_1 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_2 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_0_3 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_0 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_1 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_2 */
QUEUE_TYPE_EXT, /* GAUDI_QUEUE_ID_DMA_1_3 */
QUEUE_TYPE_CPU, /* GAUDI_QUEUE_ID_CPU_PQ */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_2_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_3_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_4_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_5_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_6_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_DMA_7_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_0_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_MME_1_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_0_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_1_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_2_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_3_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_4_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_5_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_6_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_TPC_7_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_0_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_1_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_2_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_3_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_4_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_5_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_6_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_7_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_8_3 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_0 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_1 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_2 */
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */
};
/*
 * Names of the sync objects with IDs 0-26, as (id, name) pairs.
 * NOTE(review): presumably used to print readable names for these sync
 * objects - the consumer is not visible in this part of the file.
 */
static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = {
{ .id = 0, .name = "SYNC_OBJ_DMA_DOWN_FEEDBACK" },
{ .id = 1, .name = "SYNC_OBJ_DMA_UP_FEEDBACK" },
{ .id = 2, .name = "SYNC_OBJ_DMA_STATIC_DRAM_SRAM_FEEDBACK" },
{ .id = 3, .name = "SYNC_OBJ_DMA_SRAM_DRAM_FEEDBACK" },
{ .id = 4, .name = "SYNC_OBJ_FIRST_COMPUTE_FINISH" },
{ .id = 5, .name = "SYNC_OBJ_HOST_DRAM_DONE" },
{ .id = 6, .name = "SYNC_OBJ_DBG_CTR_DEPRECATED" },
{ .id = 7, .name = "SYNC_OBJ_DMA_ACTIVATIONS_DRAM_SRAM_FEEDBACK" },
{ .id = 8, .name = "SYNC_OBJ_ENGINE_SEM_MME_0" },
{ .id = 9, .name = "SYNC_OBJ_ENGINE_SEM_MME_1" },
{ .id = 10, .name = "SYNC_OBJ_ENGINE_SEM_TPC_0" },
{ .id = 11, .name = "SYNC_OBJ_ENGINE_SEM_TPC_1" },
{ .id = 12, .name = "SYNC_OBJ_ENGINE_SEM_TPC_2" },
{ .id = 13, .name = "SYNC_OBJ_ENGINE_SEM_TPC_3" },
{ .id = 14, .name = "SYNC_OBJ_ENGINE_SEM_TPC_4" },
{ .id = 15, .name = "SYNC_OBJ_ENGINE_SEM_TPC_5" },
{ .id = 16, .name = "SYNC_OBJ_ENGINE_SEM_TPC_6" },
{ .id = 17, .name = "SYNC_OBJ_ENGINE_SEM_TPC_7" },
{ .id = 18, .name = "SYNC_OBJ_ENGINE_SEM_DMA_1" },
{ .id = 19, .name = "SYNC_OBJ_ENGINE_SEM_DMA_2" },
{ .id = 20, .name = "SYNC_OBJ_ENGINE_SEM_DMA_3" },
{ .id = 21, .name = "SYNC_OBJ_ENGINE_SEM_DMA_4" },
{ .id = 22, .name = "SYNC_OBJ_ENGINE_SEM_DMA_5" },
{ .id = 23, .name = "SYNC_OBJ_ENGINE_SEM_DMA_6" },
{ .id = 24, .name = "SYNC_OBJ_ENGINE_SEM_DMA_7" },
{ .id = 25, .name = "SYNC_OBJ_DBG_CTR_0" },
{ .id = 26, .name = "SYNC_OBJ_DBG_CTR_1" },
};
/*
 * Names of the monitor objects with IDs 200-211, as (id, name) pairs.
 * Note that ID 202 has no entry in this table.
 */
static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = {
{ .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" },
{ .id = 201, .name = "MON_OBJ_DMA_UP_FEEDBACK_RESET" },
{ .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" },
{ .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" },
{ .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" },
{ .id = 206, .name = "MON_OBJ_TPC_2_CLK_GATE" },
{ .id = 207, .name = "MON_OBJ_TPC_3_CLK_GATE" },
{ .id = 208, .name = "MON_OBJ_TPC_4_CLK_GATE" },
{ .id = 209, .name = "MON_OBJ_TPC_5_CLK_GATE" },
{ .id = 210, .name = "MON_OBJ_TPC_6_CLK_GATE" },
{ .id = 211, .name = "MON_OBJ_TPC_7_CLK_GATE" },
};
/*
 * Gaudi values for the SP_* properties: register addresses, register offsets
 * and object/engine counts, indexed by the SP_* identifier.
 * NOTE(review): presumably consumed by the common state-dump code - the
 * consumer is not visible in this part of the file.
 */
static s64 gaudi_state_dump_specs_props[] = {
/* Sync objects */
[SP_SYNC_OBJ_BASE_ADDR] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0,
[SP_NEXT_SYNC_OBJ_ADDR] = NEXT_SYNC_OBJ_ADDR_INTERVAL,
[SP_SYNC_OBJ_AMOUNT] = NUM_OF_SOB_IN_BLOCK,
/* Monitors */
[SP_MON_OBJ_WR_ADDR_LOW] =
mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0,
[SP_MON_OBJ_WR_ADDR_HIGH] =
mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0,
[SP_MON_OBJ_WR_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0,
[SP_MON_OBJ_ARM_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0,
[SP_MON_OBJ_STATUS] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0,
[SP_MONITORS_AMOUNT] = NUM_OF_MONITORS_IN_BLOCK,
/* Per-engine-type register bases and the stride to the next engine */
[SP_TPC0_CMDQ] = mmTPC0_QM_GLBL_CFG0,
[SP_TPC0_CFG_SO] = mmTPC0_CFG_QM_SYNC_OBJECT_ADDR,
[SP_NEXT_TPC] = mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0,
[SP_MME_CMDQ] = mmMME0_QM_GLBL_CFG0,
[SP_MME_CFG_SO] = mmMME0_CTRL_ARCH_DESC_SYNC_OBJECT_ADDR_LOW_LOCAL,
/* The MME stride is taken between MME0 and MME2 (not MME1) */
[SP_NEXT_MME] = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0,
[SP_DMA_CMDQ] = mmDMA0_QM_GLBL_CFG0,
[SP_DMA_CFG_SO] = mmDMA0_CORE_WR_COMP_ADDR_LO,
[SP_DMA_QUEUES_OFFSET] = mmDMA1_QM_GLBL_CFG0 - mmDMA0_QM_GLBL_CFG0,
/* Engine and queue counts */
[SP_NUM_OF_MME_ENGINES] = NUM_OF_MME_ENGINES,
[SP_SUB_MME_ENG_NUM] = NUM_OF_MME_SUB_ENGINES,
[SP_NUM_OF_DMA_ENGINES] = NUM_OF_DMA_ENGINES,
[SP_NUM_OF_TPC_ENGINES] = NUM_OF_TPC_ENGINES,
[SP_ENGINE_NUM_OF_QUEUES] = NUM_OF_QUEUES,
[SP_ENGINE_NUM_OF_STREAMS] = NUM_OF_STREAMS,
[SP_ENGINE_NUM_OF_FENCES] = NUM_OF_FENCES,
/* Register offsets relative to a QMAN's GLBL_CFG0 (computed on DMA0) */
[SP_FENCE0_CNT_OFFSET] =
mmDMA0_QM_CP_FENCE0_CNT_0 - mmDMA0_QM_GLBL_CFG0,
[SP_FENCE0_RDATA_OFFSET] =
mmDMA0_QM_CP_FENCE0_RDATA_0 - mmDMA0_QM_GLBL_CFG0,
[SP_CP_STS_OFFSET] = mmDMA0_QM_CP_STS_0 - mmDMA0_QM_GLBL_CFG0,
[SP_NUM_CORES] = 1,
};
/*
 * Maps a queue ID to the ID of the engine that owns it. The four stream
 * queues (_0.._3) of an engine all map to that engine.
 *
 * Two entries deviate from the straightforward pattern:
 * - The CPU PQ maps to GAUDI_ENGINE_ID_SIZE.
 *   NOTE(review): presumably a "no engine" marker - confirm with the users.
 * - The MME_1 queues map to GAUDI_ENGINE_ID_MME_2 (not MME_1).
 */
static const int gaudi_queue_id_to_engine_id[] = {
[GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3] = GAUDI_ENGINE_ID_DMA_0,
[GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3] = GAUDI_ENGINE_ID_DMA_1,
[GAUDI_QUEUE_ID_CPU_PQ] = GAUDI_ENGINE_ID_SIZE,
[GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3] = GAUDI_ENGINE_ID_DMA_2,
[GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3] = GAUDI_ENGINE_ID_DMA_3,
[GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3] = GAUDI_ENGINE_ID_DMA_4,
[GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3] = GAUDI_ENGINE_ID_DMA_5,
[GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6,
[GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7,
[GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0,
[GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_2,
[GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0,
[GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1,
[GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2,
[GAUDI_QUEUE_ID_TPC_3_0...GAUDI_QUEUE_ID_TPC_3_3] = GAUDI_ENGINE_ID_TPC_3,
[GAUDI_QUEUE_ID_TPC_4_0...GAUDI_QUEUE_ID_TPC_4_3] = GAUDI_ENGINE_ID_TPC_4,
[GAUDI_QUEUE_ID_TPC_5_0...GAUDI_QUEUE_ID_TPC_5_3] = GAUDI_ENGINE_ID_TPC_5,
[GAUDI_QUEUE_ID_TPC_6_0...GAUDI_QUEUE_ID_TPC_6_3] = GAUDI_ENGINE_ID_TPC_6,
[GAUDI_QUEUE_ID_TPC_7_0...GAUDI_QUEUE_ID_TPC_7_3] = GAUDI_ENGINE_ID_TPC_7,
[GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3] = GAUDI_ENGINE_ID_NIC_0,
[GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3] = GAUDI_ENGINE_ID_NIC_1,
[GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3] = GAUDI_ENGINE_ID_NIC_2,
[GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3] = GAUDI_ENGINE_ID_NIC_3,
[GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3] = GAUDI_ENGINE_ID_NIC_4,
[GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3] = GAUDI_ENGINE_ID_NIC_5,
[GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3] = GAUDI_ENGINE_ID_NIC_6,
[GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3] = GAUDI_ENGINE_ID_NIC_7,
[GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3] = GAUDI_ENGINE_ID_NIC_8,
[GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3] = GAUDI_ENGINE_ID_NIC_9,
};
/*
 * NULL-terminated list of sync manager names.
 *
 * The entries are listed in the reverse of the h/w indexing order,
 * i.e. in the h/w SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
 */
static const char * const gaudi_sync_manager_names[] = {
	[0] = "SYNC_MGR_E_N",
	[1] = "SYNC_MGR_W_N",
	[2] = "SYNC_MGR_E_S",
	[3] = "SYNC_MGR_W_S",
	[4] = NULL,
};
/*
 * Parameters for extracting ECC information.
 * NOTE(review): no user of this struct is visible in this part of the file;
 * the field descriptions below are inferred from the names - confirm.
 */
struct ecc_info_extract_params {
u64 block_address; /* presumably the address of the block to query */
u32 num_memories; /* presumably the number of memories in that block */
bool derr; /* presumably selects double-error (vs. single-error) info */
};
/*
 * Forward declarations of static functions that are referenced before their
 * definitions (e.g. gaudi_send_job_on_qman0() and gaudi_run_tpc_kernel() are
 * called from _gaudi_init_tpc_mem()).
 */
static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid,
u64 phys_addr);
static int gaudi_send_job_on_qman0(struct hl_device *hdev,
struct hl_cs_job *job);
static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
u32 size, u64 val);
static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base,
u32 num_regs, u32 val);
static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
u32 tpc_id);
static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
static int gaudi_cpucp_info_get(struct hl_device *hdev);
static void gaudi_disable_clock_gating(struct hl_device *hdev);
static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid);
static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
u32 size, bool eb);
static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
struct hl_gen_wait_properties *prop);
/*
 * get_collective_mode() - get the collective mode of a queue.
 * @hdev: pointer to hl_device structure (not used by this function).
 * @queue_id: queue ID; used as an index into gaudi_queue_type[] without a
 *            range check.
 *
 * External queues are collective masters. The DMA 5 streams, the TPC 7
 * streams and all the NIC streams are collective slaves.
 *
 * Return: HL_COLLECTIVE_MASTER, HL_COLLECTIVE_SLAVE or
 * HL_COLLECTIVE_NOT_SUPPORTED for any other queue.
 */
static inline enum hl_collective_mode
get_collective_mode(struct hl_device *hdev, u32 queue_id)
{
	bool is_slave;

	if (gaudi_queue_type[queue_id] == QUEUE_TYPE_EXT)
		return HL_COLLECTIVE_MASTER;

	is_slave = (queue_id >= GAUDI_QUEUE_ID_DMA_5_0 &&
			queue_id <= GAUDI_QUEUE_ID_DMA_5_3) ||
		   (queue_id >= GAUDI_QUEUE_ID_TPC_7_0 &&
			queue_id <= GAUDI_QUEUE_ID_TPC_7_3) ||
		   (queue_id >= GAUDI_QUEUE_ID_NIC_0_0 &&
			queue_id <= GAUDI_QUEUE_ID_NIC_9_3);

	return is_slave ? HL_COLLECTIVE_SLAVE : HL_COLLECTIVE_NOT_SUPPORTED;
}
/*
 * set_default_power_values() - set the default max and DC power properties.
 * @hdev: pointer to hl_device structure.
 *
 * The values depend on the card type (PMC or not) and, for PMC cards, the DC
 * power also depends on whether firmware security is enabled.
 */
static inline void set_default_power_values(struct hl_device *hdev)
{
	struct asic_fixed_properties *props = &hdev->asic_prop;

	if (hdev->card_type != cpucp_card_type_pmc) {
		props->max_power_default = MAX_POWER_DEFAULT_PCI;
		props->dc_power_default = DC_POWER_DEFAULT_PCI;
		return;
	}

	props->max_power_default = MAX_POWER_DEFAULT_PMC;
	props->dc_power_default = props->fw_security_enabled ?
					DC_POWER_DEFAULT_PMC_SEC :
					DC_POWER_DEFAULT_PMC;
}
static int gaudi_set_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u32 num_sync_stream_queues = 0;
int i;
prop->max_queues = GAUDI_QUEUE_ID_SIZE;
prop->hw_queues_props = kcalloc(prop->max_queues,
sizeof (struct hw_queue_properties),
GFP_KERNEL);
if (!prop->hw_queues_props)
return -ENOMEM;
for (i = 0 ; i < prop->max_queues ; i++) {
if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].supports_sync_stream = 1;
prop->hw_queues_props[i].cb_alloc_flags =
CB_ALLOC_KERNEL;
num_sync_stream_queues++;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
prop->hw_queues_props[i].supports_sync_stream = 0;
prop->hw_queues_props[i].cb_alloc_flags =
CB_ALLOC_KERNEL;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) {
prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].supports_sync_stream = 0;
prop->hw_queues_props[i].cb_alloc_flags =
CB_ALLOC_USER;
}
prop->hw_queues_props[i].collective_mode =
get_collective_mode(hdev, i);
}
prop->cache_line_size = DEVICE_CACHE_LINE_SIZE;
prop->cfg_base_address = CFG_BASE;
prop->device_dma_offset_for_host_access = HOST_PHYS_BASE;
prop->host_base_address = HOST_PHYS_BASE;
prop->host_end_address = prop->host_base_address + HOST_PHYS_SIZE;
prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
prop->completion_mode = HL_COMPLETION_MODE_JOB;
prop->collective_first_sob = 0;
prop->collective_first_mon = 0;
/* 2 SOBs per internal queue stream are reserved for collective */
prop->sync_stream_first_sob =
ALIGN(NUMBER_OF_SOBS_IN_GRP, HL_MAX_SOBS_PER_MONITOR)
* QMAN_STREAMS * HL_RSVD_SOBS;
/* 1 monitor per internal queue stream are reserved for collective
* 2 monitors per external queue stream are reserved for collective
*/
prop->sync_stream_first_mon =
(NUMBER_OF_COLLECTIVE_QUEUES * QMAN_STREAMS) +
(NUMBER_OF_EXT_HW_QUEUES * 2);
prop->dram_base_address = DRAM_PHYS_BASE;
prop->dram_size = GAUDI_HBM_SIZE_32GB;
prop->dram_end_address = prop->dram_base_address + prop->dram_size;
prop->dram_user_base_address = DRAM_BASE_ADDR_USER;
prop->sram_base_address = SRAM_BASE_ADDR;
prop->sram_size = SRAM_SIZE;
prop->sram_end_address = prop->sram_base_address + prop->sram_size;
prop->sram_user_base_address =
prop->sram_base_address + SRAM_USER_BASE_OFFSET;
prop->mmu_cache_mng_addr = MMU_CACHE_MNG_ADDR;
prop->mmu_cache_mng_size = MMU_CACHE_MNG_SIZE;
prop->mmu_pgt_addr = MMU_PAGE_TABLES_ADDR;
if (hdev->pldm)
prop->mmu_pgt_size = 0x800000; /* 8MB */
else
prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
prop->mmu_pte_size = HL_PTE_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;
prop->device_mem_alloc_default_page_size = prop->dram_page_size;
prop->dram_supports_virtual_memory = false ;
prop->pmmu.hop_shifts[MMU_HOP0] = MMU_V1_1_HOP0_SHIFT;
prop->pmmu.hop_shifts[MMU_HOP1] = MMU_V1_1_HOP1_SHIFT;
prop->pmmu.hop_shifts[MMU_HOP2] = MMU_V1_1_HOP2_SHIFT;
prop->pmmu.hop_shifts[MMU_HOP3] = MMU_V1_1_HOP3_SHIFT;
prop->pmmu.hop_shifts[MMU_HOP4] = MMU_V1_1_HOP4_SHIFT;
prop->pmmu.hop_masks[MMU_HOP0] = MMU_V1_1_HOP0_MASK;
prop->pmmu.hop_masks[MMU_HOP1] = MMU_V1_1_HOP1_MASK;
prop->pmmu.hop_masks[MMU_HOP2] = MMU_V1_1_HOP2_MASK;
prop->pmmu.hop_masks[MMU_HOP3] = MMU_V1_1_HOP3_MASK;
prop->pmmu.hop_masks[MMU_HOP4] = MMU_V1_1_HOP4_MASK;
prop->pmmu.start_addr = VA_HOST_SPACE_START;
prop->pmmu.end_addr =
(VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
prop->pmmu.last_mask = LAST_MASK;
/* TODO: will be duplicated until implementing per-MMU props */
prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof (prop->pmmu));
prop->pmmu_huge.page_size = PAGE_SIZE_2MB;
/* shifts and masks are the same in PMMU and DMMU */
memcpy(&prop->dmmu, &prop->pmmu, sizeof (prop->pmmu));
prop->dmmu.start_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2);
prop->dmmu.end_addr = VA_HOST_SPACE_END;
prop->dmmu.page_size = PAGE_SIZE_2MB;
prop->dmmu.pgt_size = prop->mmu_pgt_size;
prop->cfg_size = CFG_SIZE;
prop->max_asid = MAX_ASID;
prop->num_of_events = GAUDI_EVENT_SIZE;
prop->max_num_of_engines = GAUDI_ENGINE_ID_SIZE;
prop->tpc_enabled_mask = TPC_ENABLED_MASK;
set_default_power_values(hdev);
prop->cb_pool_cb_cnt = GAUDI_CB_POOL_CB_CNT;
prop->cb_pool_cb_size = GAUDI_CB_POOL_CB_SIZE;
prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
strscpy_pad(prop->cpucp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
prop->first_available_user_sob[HL_GAUDI_WS_DCORE] =
prop->sync_stream_first_sob +
(num_sync_stream_queues * HL_RSVD_SOBS);
prop->first_available_user_mon[HL_GAUDI_WS_DCORE] =
prop->sync_stream_first_mon +
(num_sync_stream_queues * HL_RSVD_MONS);
prop->first_available_user_interrupt = USHRT_MAX;
prop->tpc_interrupt_id = USHRT_MAX;
/* single msi */
prop->eq_interrupt_id = 0;
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;
prop->fw_cpu_boot_dev_sts0_valid = false ;
prop->fw_cpu_boot_dev_sts1_valid = false ;
prop->hard_reset_done_by_fw = false ;
prop->gic_interrupts_enable = true ;
prop->server_type = HL_SERVER_TYPE_UNKNOWN;
prop->clk_pll_index = HL_GAUDI_MME_PLL;
prop->max_freq_value = GAUDI_MAX_CLK_FREQ;
prop->use_get_power_for_reset_history = true ;
prop->configurable_stop_on_err = true ;
prop->set_max_power_on_device_init = true ;
prop->dma_mask = 48;
prop->hbw_flush_reg = mmPCIE_WRAP_RR_ELBI_RD_SEC_REG_CTRL;
return 0;
}
/*
 * gaudi_pci_bars_map() - map the device's PCI BARs.
 * @hdev: pointer to hl_device structure.
 *
 * Maps the "SRAM", "CFG" and "HBM" BARs (only the last one is requested as
 * write-combined) and, on success, points hdev->rmmio at the configuration
 * space inside the CFG BAR.
 *
 * Return: 0 on success, otherwise the error returned by hl_pci_bars_map().
 */
static int gaudi_pci_bars_map(struct hl_device *hdev)
{
	static const char * const bar_names[] = {"SRAM", "CFG", "HBM"};
	bool write_combined[3] = {false, false, true};
	int rc = hl_pci_bars_map(hdev, bar_names, write_combined);

	if (!rc)
		hdev->rmmio = hdev->pcie_bar[CFG_BAR_ID] +
				(CFG_BASE - SPI_FLASH_BASE_ADDR);

	return rc;
}
/*
 * gaudi_set_hbm_bar_base() - point the HBM BAR at a new device address.
 * @hdev: pointer to hl_device structure.
 * @addr: device address the HBM BAR should point to.
 *
 * Nothing is done if the BAR is already known to point at @addr.
 *
 * Return: the address the BAR pointed to before the call (or @addr itself
 * when that is not tracked because the ASIC-specific structure is not set),
 * U64_MAX if the iATU is configured by the firmware or if setting the
 * inbound region failed.
 */
static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct hl_inbound_pci_region region;
	u64 prev_addr;

	if (gaudi && gaudi->hbm_bar_cur_addr == addr)
		return addr;

	if (hdev->asic_prop.iatu_done_by_fw)
		return U64_MAX;

	/* Inbound Region 2 - Bar 4 - Point to HBM */
	region.mode = PCI_BAR_MATCH_MODE;
	region.bar = HBM_BAR_ID;
	region.addr = addr;
	if (hl_pci_set_inbound_region(hdev, 2, &region))
		return U64_MAX;

	if (!gaudi)
		return addr;

	prev_addr = gaudi->hbm_bar_cur_addr;
	gaudi->hbm_bar_cur_addr = addr;

	return prev_addr;
}
static int gaudi_init_iatu(struct hl_device *hdev)
{
struct hl_inbound_pci_region inbound_region;
struct hl_outbound_pci_region outbound_region;
int rc;
if (hdev->asic_prop.iatu_done_by_fw)
return 0;
/* Inbound Region 0 - Bar 0 - Point to SRAM + CFG */
inbound_region.mode = PCI_BAR_MATCH_MODE;
inbound_region.bar = SRAM_BAR_ID;
inbound_region.addr = SRAM_BASE_ADDR;
rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region);
if (rc)
goto done;
/* Inbound Region 1 - Bar 2 - Point to SPI FLASH */
inbound_region.mode = PCI_BAR_MATCH_MODE;
inbound_region.bar = CFG_BAR_ID;
inbound_region.addr = SPI_FLASH_BASE_ADDR;
rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region);
if (rc)
goto done;
/* Inbound Region 2 - Bar 4 - Point to HBM */
inbound_region.mode = PCI_BAR_MATCH_MODE;
inbound_region.bar = HBM_BAR_ID;
inbound_region.addr = DRAM_PHYS_BASE;
rc = hl_pci_set_inbound_region(hdev, 2, &inbound_region);
if (rc)
goto done;
/* Outbound Region 0 - Point to Host */
outbound_region.addr = HOST_PHYS_BASE;
outbound_region.size = HOST_PHYS_SIZE;
rc = hl_pci_set_outbound_region(hdev, &outbound_region);
done:
return rc;
}
static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev)
{
return RREG32(mmHW_STATE);
}
/*
 * gaudi_early_init() - early initialization of the device.
 * @hdev: pointer to hl_device structure.
 *
 * Sets the fixed ASIC properties, validates the SRAM and CFG BAR sizes,
 * decides whether the iATU is configured by the firmware, initializes PCI,
 * reads the preboot status and resets the H/W if it is found in a dirty state.
 *
 * On failure, whatever was set up so far is undone: PCI is finalized (if it
 * was initialized) and the queue properties array is freed.
 *
 * Return: 0 on success, non-zero error code otherwise.
 */
static int gaudi_early_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pci_dev *pdev = hdev->pdev;
resource_size_t pci_bar_size;
u32 fw_boot_status;
int rc;
/* Allocates prop->hw_queues_props; freed at free_queue_props on failure */
rc = gaudi_set_fixed_properties(hdev);
if (rc) {
dev_err(hdev->dev, "Failed setting fixed properties\n" );
return rc;
}
/* Check BAR sizes */
pci_bar_size = pci_resource_len(pdev, SRAM_BAR_ID);
if (pci_bar_size != SRAM_BAR_SIZE) {
dev_err(hdev->dev, "Not " HL_NAME "? BAR %d size %pa, expecting %llu\n" ,
SRAM_BAR_ID, &pci_bar_size, SRAM_BAR_SIZE);
rc = -ENODEV;
goto free_queue_props;
}
pci_bar_size = pci_resource_len(pdev, CFG_BAR_ID);
if (pci_bar_size != CFG_BAR_SIZE) {
dev_err(hdev->dev, "Not " HL_NAME "? BAR %d size %pa, expecting %llu\n" ,
CFG_BAR_ID, &pci_bar_size, CFG_BAR_SIZE);
rc = -ENODEV;
goto free_queue_props;
}
/* The HBM BAR size is not validated, only recorded */
prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
hdev->dram_pci_bar_start = pci_resource_start(pdev, HBM_BAR_ID);
/* If FW security is enabled at this point it means no access to ELBI */
if (hdev->asic_prop.fw_security_enabled) {
hdev->asic_prop.iatu_done_by_fw = true ;
/*
* GIC-security-bit can ONLY be set by CPUCP, so in this stage
* decision can only be taken based on PCI ID security.
*/
hdev->asic_prop.gic_interrupts_enable = false ;
goto pci_init;
}
/* Security is disabled: read the boot status through ELBI */
rc = hl_pci_elbi_read(hdev, CFG_BASE + mmCPU_BOOT_DEV_STS0,
&fw_boot_status);
if (rc)
goto free_queue_props;
/* Check whether FW is configuring iATU */
if ((fw_boot_status & CPU_BOOT_DEV_STS0_ENABLED) &&
(fw_boot_status & CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN))
hdev->asic_prop.iatu_done_by_fw = true ;
pci_init:
rc = hl_pci_init(hdev);
if (rc)
goto free_queue_props;
/* Before continuing in the initialization, we need to read the preboot
* version to determine whether we run with a security-enabled firmware
*/
rc = hl_fw_read_preboot_status(hdev);
if (rc) {
if (hdev->reset_on_preboot_fail)
/* we are already on failure flow, so don't check if hw_fini fails. */
hdev->asic_funcs->hw_fini(hdev, true , false );
goto pci_fini;
}
if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n" );
rc = hdev->asic_funcs->hw_fini(hdev, true , false );
if (rc) {
dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n" , rc);
goto pci_fini;
}
}
return 0;
/* Error unwinding, in reverse order of the setup above */
pci_fini:
hl_pci_fini(hdev);
free_queue_props:
kfree(hdev->asic_prop.hw_queues_props);
return rc;
}
static int gaudi_early_fini(struct hl_device *hdev)
{
kfree(hdev->asic_prop.hw_queues_props);
hl_pci_fini(hdev);
return 0;
}
/**
 * gaudi_fetch_psoc_frequency - Fetch PSOC frequency values
 *
 * @hdev: pointer to hl_device structure
 *
 * When the Linux firmware is loaded and reports PLL info support, the
 * frequency is taken from the firmware (element 2 of the frequencies array
 * returned for HL_GAUDI_CPU_PLL). In that case nothing is updated if the CPU
 * queue is not initialized yet. Otherwise, the frequency is calculated from
 * the PSOC CPU PLL registers according to the divider select value; an
 * invalid select value is reported and results in a frequency of 0.
 *
 * Return: 0 on success, otherwise the error returned by the firmware PLL
 * info query.
 */
static int gaudi_fetch_psoc_frequency(struct hl_device *hdev)
{
	u32 nr = 0, nf = 0, od = 0, div_fctr = 0, pll_clk, div_sel;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u16 pll_freq_arr[HL_PLL_NUM_OUTPUTS], freq;
	int rc;

	if ((hdev->fw_components & FW_TYPE_LINUX) &&
			(prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PLL_INFO_EN)) {
		struct gaudi_device *gaudi = hdev->asic_specific;

		/* The firmware can't be queried before the CPU queue is up */
		if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
			return 0;

		rc = hl_fw_cpucp_pll_info_get(hdev, HL_GAUDI_CPU_PLL, pll_freq_arr);
		if (rc)
			return rc;

		freq = pll_freq_arr[2];
	} else {
		/* Backward compatibility */
		div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2);
		div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2);
		nr = RREG32(mmPSOC_CPU_PLL_NR);
		nf = RREG32(mmPSOC_CPU_PLL_NF);
		od = RREG32(mmPSOC_CPU_PLL_OD);

		if (div_sel == DIV_SEL_REF_CLK ||
				div_sel == DIV_SEL_DIVIDED_REF) {
			/* Output is derived from the reference clock */
			if (div_sel == DIV_SEL_REF_CLK)
				freq = PLL_REF_CLK;
			else
				freq = PLL_REF_CLK / (div_fctr + 1);
		} else if (div_sel == DIV_SEL_PLL_CLK ||
				div_sel == DIV_SEL_DIVIDED_PLL) {
			/* Output is derived from the PLL clock */
			pll_clk = PLL_REF_CLK * (nf + 1) /
					((nr + 1) * (od + 1));
			if (div_sel == DIV_SEL_PLL_CLK)
				freq = pll_clk;
			else
				freq = pll_clk / (div_fctr + 1);
		} else {
			/* Terminate the message with a newline, like all other prints */
			dev_warn(hdev->dev, "Received invalid div select value: %#x\n", div_sel);
			freq = 0;
		}
	}

	prop->psoc_timestamp_frequency = freq;
	prop->psoc_pci_pll_nr = nr;
	prop->psoc_pci_pll_nf = nf;
	prop->psoc_pci_pll_od = od;
	prop->psoc_pci_pll_div_factor = div_fctr;

	return 0;
}
/*
 * _gaudi_init_tpc_mem() - Copy the TPC kernel to SRAM and run it on all TPCs.
 * @hdev: Pointer to hl_device structure.
 * @tpc_kernel_src_addr: DMA address of the buffer holding the TPC kernel.
 * @tpc_kernel_size: Size in bytes of the TPC kernel.
 *
 * Builds a single LIN_DMA packet in a kernel CB that copies the kernel from
 * @tpc_kernel_src_addr to the user SRAM base address (rounded up to 8KB),
 * sends it as a job on queue GAUDI_QUEUE_ID_DMA_0_0 through
 * gaudi_send_job_on_qman0(), and then calls gaudi_run_tpc_kernel() for every
 * TPC engine, stopping at the first failure.
 *
 * Return: 0 for success, -EFAULT if the CB could not be created, -ENOMEM if
 * the job could not be allocated, otherwise the error of the failing step.
 */
static int _gaudi_init_tpc_mem(struct hl_device *hdev,
dma_addr_t tpc_kernel_src_addr, u32 tpc_kernel_size)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct packet_lin_dma *init_tpc_mem_pkt;
struct hl_cs_job *job;
struct hl_cb *cb;
u64 dst_addr;
u32 cb_size, ctl;
u8 tpc_id;
int rc;
cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false );
if (!cb)
return -EFAULT;
/* The CB holds exactly one LIN_DMA packet, built from a zeroed template */
init_tpc_mem_pkt = cb->kernel_address;
cb_size = sizeof (*init_tpc_mem_pkt);
memset(init_tpc_mem_pkt, 0, cb_size);
init_tpc_mem_pkt->tsize = cpu_to_le32(tpc_kernel_size);
ctl = FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA);
ctl |= FIELD_PREP(GAUDI_PKT_LIN_DMA_CTL_LIN_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
init_tpc_mem_pkt->ctl = cpu_to_le32(ctl);
init_tpc_mem_pkt->src_addr = cpu_to_le64(tpc_kernel_src_addr);
/* TPC_CMD is configured with I$ prefetch enabled, so address should be aligned to 8KB */
dst_addr = FIELD_PREP(GAUDI_PKT_LIN_DMA_DST_ADDR_MASK,
round_up(prop->sram_user_base_address, SZ_8K));
init_tpc_mem_pkt->dst_addr |= cpu_to_le64(dst_addr);
job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true );
if (!job) {
dev_err(hdev->dev, "Failed to allocate a new job\n" );
rc = -ENOMEM;
goto release_cb;
}
job->id = 0;
job->user_cb = cb;
/* Matched by the atomic_dec() under the free_job label */
atomic_inc(&job->user_cb->cs_cnt);
job->user_cb_size = cb_size;
job->hw_queue_id = GAUDI_QUEUE_ID_DMA_0_0;
/* No patching is done: the CB that was just built is submitted as is */
job->patched_cb = job->user_cb;
/* NOTE(review): the extra msg_prot packet is presumably appended by gaudi_send_job_on_qman0() - confirm */
job->job_cb_size = job->user_cb_size + sizeof (struct packet_msg_prot);
hl_debugfs_add_job(hdev, job);
rc = gaudi_send_job_on_qman0(hdev, job);
if (rc)
goto free_job;
/* The kernel is now at dst_addr; run it on each TPC, stop on first error */
for (tpc_id = 0 ; tpc_id < TPC_NUMBER_OF_ENGINES ; tpc_id++) {
rc = gaudi_run_tpc_kernel(hdev, dst_addr, tpc_id);
if (rc)
break ;
}
/* Reached on success as well: the job and CB are always released */
free_job:
hl_userptr_delete_list(hdev, &job->userptr_list);
hl_debugfs_remove_job(hdev, job);
kfree(job);
atomic_dec(&cb->cs_cnt);
release_cb:
hl_cb_put(cb);
hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);
return rc;
}
/*
 * gaudi_init_tpc_mem() - Initialize TPC memories.
 * @hdev: Pointer to hl_device structure.
 *
 * Loads the TPC kernel firmware file, copies it into a DMA coherent buffer
 * and hands that buffer to _gaudi_init_tpc_mem(). The firmware request is
 * retried (up to 5 more times, 50ms apart) while it keeps failing with -EINTR.
 *
 * Return: 0 for success, negative value for error.
 */
static int gaudi_init_tpc_mem(struct hl_device *hdev)
{
	const struct firmware *tpc_fw;
	dma_addr_t dma_addr;
	void *kernel_buf;
	size_t size;
	int retries = 5, rc;

	for (;;) {
		rc = request_firmware(&tpc_fw, GAUDI_TPC_FW_FILE, hdev->dev);
		if (rc != -EINTR || retries-- <= 0)
			break;

		msleep(50);
	}

	if (rc) {
		dev_err(hdev->dev, "Failed to load firmware file %s\n",
				GAUDI_TPC_FW_FILE);
		goto release_fw;
	}

	size = tpc_fw->size;
	kernel_buf = hl_asic_dma_alloc_coherent(hdev, size, &dma_addr, GFP_KERNEL | __GFP_ZERO);
	if (!kernel_buf) {
		dev_err(hdev->dev,
			"Failed to allocate %zu of dma memory for TPC kernel\n",
			size);
		rc = -ENOMEM;
		goto release_fw;
	}

	memcpy(kernel_buf, tpc_fw->data, size);

	rc = _gaudi_init_tpc_mem(hdev, dma_addr, size);

	/* The staging buffer is not needed once the copy/run step returned */
	hl_asic_dma_free_coherent(hdev, size, kernel_buf, dma_addr);

release_fw:
	release_firmware(tpc_fw);
	return rc;
}
/*
 * gaudi_collective_map_sobs() - Assign collective SOB ids to the slave queues.
 * @hdev: Pointer to hl_device structure.
 * @stream: Stream index whose queues are updated.
 *
 * Takes the base SOB of the stream's current SOB group and hands out one SOB
 * per NIC queue of that stream. The DMA5 and TPC7 queues of the stream both
 * get the SOB that follows the NIC ones.
 */
static void gaudi_collective_map_sobs(struct hl_device *hdev, u32 stream)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct gaudi_collective_properties *cprop = &gaudi->collective_props;
	u32 group_idx, base_sob, reduction_sob, nic, qid;

	group_idx = stream * HL_RSVD_SOBS + cprop->curr_sob_group_idx[stream];
	base_sob = cprop->hw_sob_group[group_idx].base_sob_id;

	/* Consecutive NIC engines are 4 queue ids apart for a given stream */
	for (nic = 0; nic < NIC_NUMBER_OF_ENGINES; nic++) {
		qid = GAUDI_QUEUE_ID_NIC_0_0 + stream + (4 * nic);
		hdev->kernel_queues[qid].sync_stream_prop.collective_sob_id =
				base_sob + nic;
	}

	/*
	 * DMA5 and TPC7 share the same SOB because only one of them takes
	 * part in the reduction process
	 */
	reduction_sob = base_sob + NIC_NUMBER_OF_ENGINES;

	qid = GAUDI_QUEUE_ID_DMA_5_0 + stream;
	hdev->kernel_queues[qid].sync_stream_prop.collective_sob_id = reduction_sob;

	qid = GAUDI_QUEUE_ID_TPC_7_0 + stream;
	hdev->kernel_queues[qid].sync_stream_prop.collective_sob_id = reduction_sob;
}
/*
 * gaudi_sob_group_hw_reset() - Zero the SOBs of a group and re-arm its kref.
 * @ref: The kref embedded in the gaudi_hw_sob_group to reset.
 *
 * Writes 0 to every SOB register of the group, starting at the group's base
 * SOB, and then re-initializes the kref.
 */
static void gaudi_sob_group_hw_reset(struct kref *ref)
{
	struct gaudi_hw_sob_group *grp =
			container_of(ref, struct gaudi_hw_sob_group, kref);
	/* Not referenced explicitly below; presumably WREG32() relies on it */
	struct hl_device *hdev = grp->hdev;
	int sob;

	for (sob = 0; sob < NUMBER_OF_SOBS_IN_GRP; sob++) {
		WREG32((mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
				(grp->base_sob_id * 4) + (sob * 4)), 0);
	}

	kref_init(&grp->kref);
}
/*
 * gaudi_sob_group_reset_error() - kref release callback that must never run.
 * @ref: The kref embedded in the affected gaudi_hw_sob_group.
 *
 * Only reports, at critical level, that the release was triggered.
 */
static void gaudi_sob_group_reset_error(struct kref *ref)
{
	struct gaudi_hw_sob_group *grp =
			container_of(ref, struct gaudi_hw_sob_group, kref);

	dev_crit(grp->hdev->dev,
		"SOB release shouldn't be called here, base_sob_id: %d\n",
		grp->base_sob_id);
}
/*
 * gaudi_collective_mstr_sob_mask_set() - Rebuild the master monitors SOB mask.
 * @gaudi: Pointer to the gaudi device structure.
 *
 * Clears the mask array, sets one bit per NIC whose capability bit is set,
 * and finally sets the bit that follows the last NIC, which stands for the
 * collective engine.
 */
static void gaudi_collective_mstr_sob_mask_set(struct gaudi_device *gaudi)
{
	struct gaudi_collective_properties *cprop = &gaudi->collective_props;
	int nic;

	memset(cprop->mstr_sob_mask, 0, sizeof(cprop->mstr_sob_mask));

	for (nic = 0; nic < NIC_NUMBER_OF_ENGINES; nic++) {
		if (!(gaudi->hw_cap_initialized & BIT(HW_CAP_NIC_SHIFT + nic)))
			continue;

		cprop->mstr_sob_mask[nic / HL_MAX_SOBS_PER_MONITOR] |=
				BIT(nic % HL_MAX_SOBS_PER_MONITOR);
	}

	/* Set collective engine bit (the slot right after the last NIC) */
	cprop->mstr_sob_mask[NIC_NUMBER_OF_ENGINES / HL_MAX_SOBS_PER_MONITOR] |=
			BIT(NIC_NUMBER_OF_ENGINES % HL_MAX_SOBS_PER_MONITOR);
}
static int gaudi_collective_init(struct hl_device *hdev)
{
u32 i, sob_id, reserved_sobs_per_group;
struct gaudi_collective_properties *prop;
struct gaudi_device *gaudi;
gaudi = hdev->asic_specific;
prop = &gaudi->collective_props;
sob_id = hdev->asic_prop.collective_first_sob;
/* First sob in group must be aligned to HL_MAX_SOBS_PER_MONITOR */
reserved_sobs_per_group =
ALIGN(NUMBER_OF_SOBS_IN_GRP, HL_MAX_SOBS_PER_MONITOR);
/* Init SOB groups */
for (i = 0 ; i < NUM_SOB_GROUPS; i++) {
prop->hw_sob_group[i].hdev = hdev;
prop->hw_sob_group[i].base_sob_id = sob_id;
sob_id += reserved_sobs_per_group;
gaudi_sob_group_hw_reset(&prop->hw_sob_group[i].kref);
}
for (i = 0 ; i < QMAN_STREAMS; i++) {
prop->next_sob_group_val[i] = 1;
prop->curr_sob_group_idx[i] = 0;
gaudi_collective_map_sobs(hdev, i);
}
gaudi_collective_mstr_sob_mask_set(gaudi);
return 0;
}
/*
 * gaudi_reset_sob_group() - Drop a reference on a SOB group.
 * @hdev: Pointer to hl_device structure.
 * @sob_group: Index of the SOB group.
 *
 * gaudi_sob_group_hw_reset() is the release callback, so the group is reset
 * when the last reference is dropped.
 */
static void gaudi_reset_sob_group(struct hl_device *hdev, u16 sob_group)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct gaudi_hw_sob_group *grp =
			&gaudi->collective_props.hw_sob_group[sob_group];

	kref_put(&grp->kref, gaudi_sob_group_hw_reset);
}
/*
 * gaudi_collective_master_init_job() - Fill the master job CB with wait CBs.
 * @hdev: Pointer to hl_device structure.
 * @job: The job submitted on the collective master queue.
 * @stream: Stream index, used to pick the SOB value to wait for.
 * @sob_group_offset: Index of the SOB group used by this wait.
 *
 * Generates two consecutive wait CBs into the job's patched CB, one per
 * master monitor. The second one covers the SOBs that start
 * HL_MAX_SOBS_PER_MONITOR after the group's base SOB. The job's queue id is
 * also recorded in the SOB group.
 */
static void gaudi_collective_master_init_job(struct hl_device *hdev,
		struct hl_cs_job *job, u32 stream, u32 sob_group_offset)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct gaudi_collective_properties *cprop = &gaudi->collective_props;
	struct gaudi_hw_sob_group *grp = &cprop->hw_sob_group[sob_group_offset];
	u32 queue_id = job->hw_queue_id;
	struct hl_sync_stream_properties *sprop =
			&hdev->kernel_queues[queue_id].sync_stream_prop;
	struct hl_gen_wait_properties wait_prop;
	u32 sob_base, mon_id, cb_size = 0;

	/* First monitor: SOBs starting at the group's base */
	sob_base = grp->base_sob_id;
	mon_id = sprop->collective_mstr_mon_id[0];

	grp->queue_id = queue_id;

	dev_dbg(hdev->dev,
		"Generate master wait CBs, sob %d (mask %#x), val:0x%x, mon %u, q %d\n",
		sob_base, cprop->mstr_sob_mask[0],
		cprop->next_sob_group_val[stream],
		mon_id, queue_id);

	wait_prop.data = (void *) job->patched_cb;
	wait_prop.sob_base = sob_base;
	wait_prop.sob_mask = cprop->mstr_sob_mask[0];
	wait_prop.sob_val = cprop->next_sob_group_val[stream];
	wait_prop.mon_id = mon_id;
	wait_prop.q_idx = queue_id;
	wait_prop.size = cb_size;
	cb_size += gaudi_gen_wait_cb(hdev, &wait_prop);

	/* Second monitor: only the fields that differ are updated */
	sob_base += HL_MAX_SOBS_PER_MONITOR;
	mon_id = sprop->collective_mstr_mon_id[1];

	dev_dbg(hdev->dev,
		"Generate master wait CBs, sob %d (mask %#x), val:0x%x, mon %u, q %d\n",
		sob_base, cprop->mstr_sob_mask[1],
		cprop->next_sob_group_val[stream],
		mon_id, queue_id);

	wait_prop.sob_base = sob_base;
	wait_prop.sob_mask = cprop->mstr_sob_mask[1];
	wait_prop.mon_id = mon_id;
	wait_prop.size = cb_size;
	cb_size += gaudi_gen_wait_cb(hdev, &wait_prop);
}
/*
 * gaudi_collective_slave_init_job() - Fill a slave job CB with wait + signal.
 * @hdev: Pointer to hl_device structure.
 * @job: The job submitted on a collective slave queue.
 * @cs_cmpl: Completion object carrying the SOB and value to wait for.
 *
 * Generates into the job's user CB a wait CB on the SOB taken from @cs_cmpl,
 * using the queue's slave monitor, followed by a signal CB on the queue's
 * collective SOB.
 */
static void gaudi_collective_slave_init_job(struct hl_device *hdev,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	u32 queue_id = job->hw_queue_id;
	struct hl_sync_stream_properties *sprop =
			&hdev->kernel_queues[queue_id].sync_stream_prop;
	struct hl_gen_wait_properties wait_prop;
	u32 cb_size = 0;

	if (job->cs->encaps_signals) {
		/*
		 * The encaps signals handle was stored earlier in the flow;
		 * take the SOB information from it
		 */
		hl_hw_queue_encaps_sig_set_sob_info(hdev, job->cs, job,
						cs_cmpl);

		dev_dbg(hdev->dev, "collective wait: Sequence %llu found, sob_id: %u, wait for sob_val: %u\n",
				job->cs->sequence,
				cs_cmpl->hw_sob->sob_id,
				cs_cmpl->sob_val);
	}

	dev_dbg(hdev->dev,
		"Generate slave wait CB, sob %d, val:%x, mon %d, q %d\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
		sprop->collective_slave_mon_id, queue_id);

	/* Add to wait CBs using slave monitor */
	wait_prop.data = (void *) job->user_cb;
	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
	wait_prop.sob_mask = 0x1;
	wait_prop.sob_val = cs_cmpl->sob_val;
	wait_prop.mon_id = sprop->collective_slave_mon_id;
	wait_prop.q_idx = queue_id;
	wait_prop.size = cb_size;
	cb_size += gaudi_gen_wait_cb(hdev, &wait_prop);

	dev_dbg(hdev->dev,
		"generate signal CB, sob_id: %d, sob val: 1, q_idx: %d\n",
		sprop->collective_sob_id, queue_id);

	/* The signal CB is placed right after the wait CB */
	cb_size += gaudi_gen_signal_cb(hdev, job->user_cb,
			sprop->collective_sob_id, cb_size, false);
}
/*
 * gaudi_collective_wait_init_cs() - Prepare all jobs of a collective wait CS.
 * @cs: The collective wait command submission.
 *
 * Picks the SOB to wait on (from the encaps signals handle or from the signal
 * CS), takes a reference on it unless the signal CS already completed, fills
 * the CB of every job in the CS (master or slave flavor, by the collective
 * mode of the job's queue), and advances the per-stream SOB group value,
 * moving to the next SOB group when the value reaches HL_MAX_SOB_VAL.
 * On success the reference on the signal fence is dropped.
 *
 * Return: 0 for success, -EINVAL if the signal CS has already completed.
 */
static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
{
struct hl_cs_compl *signal_cs_cmpl =
container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
struct hl_cs_compl *cs_cmpl =
container_of(cs->fence, struct hl_cs_compl, base_fence);
struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
struct gaudi_collective_properties *cprop;
u32 stream, queue_id, sob_group_offset;
struct gaudi_device *gaudi;
struct hl_device *hdev;
struct hl_cs_job *job;
struct hl_ctx *ctx;
ctx = cs->ctx;
hdev = ctx->hdev;
gaudi = hdev->asic_specific;
cprop = &gaudi->collective_props;
if (cs->encaps_signals) {
cs_cmpl->hw_sob = handle->hw_sob;
/* at this checkpoint we only need the hw_sob pointer
* for the completion check before start going over the jobs
* of the master/slaves, the sob_value will be taken later on
* in gaudi_collective_slave_init_job depends on each
* job wait offset value.
*/
cs_cmpl->sob_val = 0;
} else {
/* copy the SOB id and value of the signal CS */
cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
}
/* check again if the signal cs already completed.
* if yes then don't send any wait cs since the hw_sob
* could be in reset already. if signal is not completed
* then get refcount to hw_sob to prevent resetting the sob
* while wait cs is not submitted.
* note that this check is protected by two locks,
* hw queue lock and completion object lock,
* and the same completion object lock also protects
* the hw_sob reset handler function.
* The hw_queue lock prevent out of sync of hw_sob
* refcount value, changed by signal/wait flows.
*/
spin_lock(&signal_cs_cmpl->lock);
if (completion_done(&cs->signal_fence->completion)) {
spin_unlock(&signal_cs_cmpl->lock);
return -EINVAL;
}
/* Increment kref since all slave queues are now waiting on it */
kref_get(&cs_cmpl->hw_sob->kref);
spin_unlock(&signal_cs_cmpl->lock);
/* Calculate the stream from collective master queue (1st job) */
job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node);
stream = job->hw_queue_id % 4;
/* Same SOB group index formula as in gaudi_collective_map_sobs() */
sob_group_offset =
stream * HL_RSVD_SOBS + cprop->curr_sob_group_idx[stream];
/* Fill each job's CB according to the collective role of its queue */
list_for_each_entry(job, &cs->job_list, cs_node) {
queue_id = job->hw_queue_id;
if (hdev->kernel_queues[queue_id].collective_mode ==
HL_COLLECTIVE_MASTER)
gaudi_collective_master_init_job(hdev, job, stream,
sob_group_offset);
else
gaudi_collective_slave_init_job(hdev, job, cs_cmpl);
}
cs_cmpl->sob_group = sob_group_offset;
/* Handle sob group kref and wraparound */
kref_get(&cprop->hw_sob_group[sob_group_offset].kref);
cprop->next_sob_group_val[stream]++;
if (cprop->next_sob_group_val[stream] == HL_MAX_SOB_VAL) {
/*
* Decrement as we reached the max value.
* The release function won't be called here as we've
* just incremented the refcount.
*/
kref_put(&cprop->hw_sob_group[sob_group_offset].kref,
gaudi_sob_group_reset_error);
cprop->next_sob_group_val[stream] = 1;
/* only two SOBs are currently in use */
cprop->curr_sob_group_idx[stream] =
(cprop->curr_sob_group_idx[stream] + 1) &
(HL_RSVD_SOBS - 1);
/* Point the slave queues of this stream at the new group's SOBs */
gaudi_collective_map_sobs(hdev, stream);
dev_dbg(hdev->dev, "switched to SOB group %d, stream: %d\n" ,
cprop->curr_sob_group_idx[stream], stream);
}
/* Full barrier before the signal fence reference is dropped */
mb();
hl_fence_put(cs->signal_fence);
cs->signal_fence = NULL;
return 0;
}
/*
 * gaudi_get_patched_cb_extra_size() - Extra bytes a patched CB needs.
 * @user_cb_size: Size in bytes of the user CB.
 *
 * Two msg_prot packets are always added. If they do not fit between the end
 * of the user CB and the end of its device cache line, the gap up to that
 * cache line end is added as well.
 *
 * Return: number of bytes to add to @user_cb_size.
 */
static u32 gaudi_get_patched_cb_extra_size(u32 user_cb_size)
{
	u32 epilogue_size = sizeof(struct packet_msg_prot) * 2;
	u32 aligned_end = round_up(user_cb_size, DEVICE_CACHE_LINE_SIZE);

	if (user_cb_size + epilogue_size <= aligned_end)
		return epilogue_size;

	return aligned_end - user_cb_size + epilogue_size;
}
/*
 * gaudi_collective_wait_create_job() - Create one job of a collective wait CS.
 * @hdev: Pointer to hl_device structure.
 * @ctx: Context whose drop counters are updated on allocation failure.
 * @cs: The command submission the job is added to.
 * @mode: HL_COLLECTIVE_MASTER for the master job, anything else is a slave.
 * @queue_id: H/W queue the job is submitted on.
 * @wait_queue_id: Not used by this function.
 * @encaps_signal_offset: Wait offset stored in the job for encaps signals CS.
 *
 * Allocates the job and a kernel CB sized for the packets of the given mode,
 * links them, and appends the job to the CS job list.
 *
 * Return: 0 for success, -ENOMEM if the job allocation failed, -EFAULT if the
 * CB creation failed.
 */
static int gaudi_collective_wait_create_job(struct hl_device *hdev,
		struct hl_ctx *ctx, struct hl_cs *cs,
		enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id,
		u32 encaps_signal_offset)
{
	struct hl_cs_counters_atomic *agg_cntr = &hdev->aggregated_cs_counters;
	struct hw_queue_properties *qprop =
			&hdev->asic_prop.hw_queues_props[queue_id];
	bool is_master = (mode == HL_COLLECTIVE_MASTER);
	struct hl_cs_job *job;
	struct hl_cb *cb;
	u32 cb_size;

	if (is_master) {
		/* CB size of collective master queue contains
		 * 4 msg short packets for monitor 1 configuration
		 * 1 fence packet
		 * 4 msg short packets for monitor 2 configuration
		 * 1 fence packet
		 * 2 msg prot packets for completion and MSI
		 */
		cb_size = sizeof(struct packet_msg_short) * 8 +
				sizeof(struct packet_fence) * 2 +
				sizeof(struct packet_msg_prot) * 2;
	} else {
		/* CB size of collective slave queues contains
		 * 4 msg short packets for monitor configuration
		 * 1 fence packet
		 * 1 additional msg short packet for sob signal
		 */
		cb_size = sizeof(struct packet_msg_short) * 5 +
				sizeof(struct packet_fence);
	}

	job = hl_cs_allocate_job(hdev, qprop->type, true);
	if (!job) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&agg_cntr->out_of_mem_drop_cnt);
		dev_err(hdev->dev, "Failed to allocate a new job\n");
		return -ENOMEM;
	}

	/* Only the slave (non patched) CBs are allocated as internal mapped */
	cb = hl_cb_kernel_create(hdev, cb_size, !is_master);
	if (!cb) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&agg_cntr->out_of_mem_drop_cnt);
		kfree(job);
		return -EFAULT;
	}

	job->id = 0;
	job->cs = cs;
	job->hw_queue_id = queue_id;
	job->user_cb = cb;
	atomic_inc(&job->user_cb->cs_cnt);
	job->user_cb_size = cb_size;

	/* A collective wait CS is guaranteed to have a single chunk, so that
	 * chunk's encapsulated signal offset applies to all of its jobs.
	 */
	if (cs->encaps_signals)
		job->encaps_sig_wait_offset = encaps_signal_offset;

	/* No parsing is needed: for the master, the user CB is the patched CB */
	job->patched_cb = is_master ? job->user_cb : NULL;
	job->job_cb_size = job->user_cb_size;

	/*
	 * hl_cb_destroy() is called for two reasons: the CB is no longer
	 * needed in the CB idr, and its refcount, which was incremented
	 * inside hl_cb_kernel_create(), has to be decremented.
	 */
	hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);

	/* increment refcount as for external queues we get completion */
	if (qprop->type == QUEUE_TYPE_EXT)
		cs_get(cs);

	cs->jobs_in_queue_cnt[job->hw_queue_id]++;

	list_add_tail(&job->cs_node, &cs->job_list);

	hl_debugfs_add_job(hdev, job);

	return 0;
}
/*
 * gaudi_collective_wait_create_jobs() - Create all jobs of a collective wait.
 * @hdev: Pointer to hl_device structure.
 * @ctx: Context the CS belongs to.
 * @cs: The collective wait command submission.
 * @wait_queue_id: Queue of the master job; must be a collective master queue.
 * @collective_engine_id: Reduction engine, either DMA5 or TPC7.
 * @encaps_signal_offset: Wait offset passed on to every created job.
 *
 * Return: 0 for success, -EINVAL for an unsupported queue or engine, or the
 * error of the first job that failed to be created.
 */
static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
		struct hl_ctx *ctx, struct hl_cs *cs,
		u32 wait_queue_id, u32 collective_engine_id,
		u32 encaps_signal_offset)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct hw_queue_properties *master_prop;
	u32 stream, reduction_queue, next_nic_queue, slave_queue, total_jobs;
	u32 nic = 0;
	bool nic_enabled;
	int idx, rc;

	/* Verify wait queue id is configured as master */
	master_prop = &hdev->asic_prop.hw_queues_props[wait_queue_id];
	if (master_prop->collective_mode != HL_COLLECTIVE_MASTER) {
		dev_err(hdev->dev,
			"Queue %d is not configured as collective master\n",
			wait_queue_id);
		return -EINVAL;
	}

	/* Verify engine id is supported */
	if (collective_engine_id != GAUDI_ENGINE_ID_DMA_5 &&
			collective_engine_id != GAUDI_ENGINE_ID_TPC_7) {
		dev_err(hdev->dev,
			"Collective wait does not support engine %u\n",
			collective_engine_id);
		return -EINVAL;
	}

	stream = wait_queue_id % 4;

	reduction_queue = stream;
	if (collective_engine_id == GAUDI_ENGINE_ID_DMA_5)
		reduction_queue += GAUDI_QUEUE_ID_DMA_5_0;
	else
		reduction_queue += GAUDI_QUEUE_ID_TPC_7_0;

	total_jobs = NUMBER_OF_SOBS_IN_GRP + 1;
	next_nic_queue = GAUDI_QUEUE_ID_NIC_0_0 + stream;

	/* First job goes to the collective master queue, it will wait for
	 * the collective slave queues to finish execution.
	 * The synchronization is done using two monitors:
	 * First monitor for NICs 0-7, second monitor for NICs 8-9 and the
	 * reduction engine (DMA5/TPC7).
	 */
	rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
			HL_COLLECTIVE_MASTER, wait_queue_id,
			wait_queue_id, encaps_signal_offset);
	if (rc)
		return rc;

	/* Rest of the jobs goes to the collective slave queues which will
	 * all wait for the user to signal sob 'cs_cmpl->sob_val'.
	 */
	for (idx = 1; idx < total_jobs; idx++) {
		if (nic < NIC_NUMBER_OF_ENGINES) {
			nic_enabled = gaudi->hw_cap_initialized &
					BIT(HW_CAP_NIC_SHIFT + nic);

			/* Advance to the next NIC even when this one is skipped */
			slave_queue = next_nic_queue;
			next_nic_queue += 4;
			nic++;

			if (!nic_enabled)
				continue;
		} else {
			slave_queue = reduction_queue;
		}

		rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
				HL_COLLECTIVE_SLAVE, slave_queue,
				wait_queue_id, encaps_signal_offset);
		if (rc)
			return rc;
	}

	return rc;
}
/*
 * gaudi_late_init() - Late stage of the device initialization.
 * @hdev: Pointer to hl_device structure.
 *
 * Fetches the cpucp info, disables NIC0/NIC1 on PCI cards, enables PCI access
 * through the F/W, and then scrubs the device memory, fetches the PSOC
 * frequency, clears the MMU page tables range, initializes the TPC memories
 * and the collective properties, prepares the MMU for ASID 1 and sets the
 * PLL profile. If a step after the PCI access was enabled fails, PCI access
 * is disabled again before returning.
 *
 * Return: 0 for success, negative value for error.
 */
static int gaudi_late_init(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
int rc;
rc = gaudi->cpucp_info_get(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to get cpucp info\n" );
return rc;
}
/* On a PCI card the two lowest NIC ports (mask 0x3) are not available */
if ((hdev->card_type == cpucp_card_type_pci) &&
(hdev->nic_ports_mask & 0x3)) {
dev_info(hdev->dev,
"PCI card detected, only 8 ports are enabled\n" );
hdev->nic_ports_mask &= ~0x3;
/* Stop and disable unused NIC QMANs */
WREG32(mmNIC0_QM0_GLBL_CFG1, NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK |
NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK |
NIC0_QM0_GLBL_CFG1_CP_STOP_MASK);
WREG32(mmNIC0_QM1_GLBL_CFG1, NIC0_QM0_GLBL_CFG1_PQF_STOP_MASK |
NIC0_QM0_GLBL_CFG1_CQF_STOP_MASK |
NIC0_QM0_GLBL_CFG1_CP_STOP_MASK);
WREG32(mmNIC0_QM0_GLBL_CFG0, 0);
WREG32(mmNIC0_QM1_GLBL_CFG0, 0);
gaudi->hw_cap_initialized &= ~(HW_CAP_NIC0 | HW_CAP_NIC1);
}
/* From here on, a failure must go through disable_pci_access */
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0);
if (rc)
return rc;
/* Scrub both SRAM and DRAM */
rc = hdev->asic_funcs->scrub_device_mem(hdev);
if (rc)
goto disable_pci_access;
rc = gaudi_fetch_psoc_frequency(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to fetch psoc frequency\n" );
goto disable_pci_access;
}
rc = gaudi_mmu_clear_pgt_range(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to clear MMU page tables range\n" );
goto disable_pci_access;
}
rc = gaudi_init_tpc_mem(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to initialize TPC memories\n" );
goto disable_pci_access;
}
rc = gaudi_collective_init(hdev);
if (rc) {
dev_err(hdev->dev, "Failed to init collective\n" );
goto disable_pci_access;
}
/* We only support a single ASID for the user, so for the sake of optimization, just
* initialize the ASID one time during device initialization with the fixed value of 1
*/
gaudi_mmu_prepare(hdev, 1);
hl_fw_set_pll_profile(hdev);
return 0;
disable_pci_access:
/* Best effort: the return value of the disable message is not checked */
hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
return rc;
}
/*
 * gaudi_late_fini() - Late stage of the device teardown.
 * @hdev: Pointer to hl_device structure.
 *
 * Only delegates to hl_hwmon_release_resources().
 */
static void gaudi_late_fini(struct hl_device *hdev)
{
	hl_hwmon_release_resources(hdev);
}
/*
 * gaudi_alloc_cpu_accessible_dma_mem() - Allocate the CPU accessible memory.
 * @hdev: Pointer to hl_device structure.
 *
 * Allocates DMA coherent buffers, up to GAUDI_ALLOC_CPU_MEM_RETRY_CNT times,
 * until one is found whose first and last addresses share the same MSB
 * extension bits. That buffer is kept; every earlier, rejected buffer is
 * freed before returning.
 *
 * Return: 0 for success, -ENOMEM if an allocation failed, -EFAULT if no
 * suitable buffer was found within the retry budget.
 */
static int gaudi_alloc_cpu_accessible_dma_mem(struct hl_device *hdev)
{
	dma_addr_t dma_addr_arr[GAUDI_ALLOC_CPU_MEM_RETRY_CNT] = {}, last_byte;
	void *virt_addr_arr[GAUDI_ALLOC_CPU_MEM_RETRY_CNT] = {};
	int attempt, k, rc = 0;

	/*
	 * The device CPU uses 40-bit addresses and bit 39 must be '1' when it
	 * accesses the host. Bits 49:39 of the full host address are kept
	 * aside and later programmed into the H/W, which extends the address
	 * to 50 bits. A single H/W register holds these extension bits, so
	 * they have to be the same across the whole allocated range.
	 */
	for (attempt = 0; attempt < GAUDI_ALLOC_CPU_MEM_RETRY_CNT; attempt++) {
		virt_addr_arr[attempt] = hl_asic_dma_alloc_coherent(hdev,
						HL_CPU_ACCESSIBLE_MEM_SIZE,
						&dma_addr_arr[attempt],
						GFP_KERNEL | __GFP_ZERO);
		if (!virt_addr_arr[attempt]) {
			rc = -ENOMEM;
			goto free_rejected;
		}

		last_byte = dma_addr_arr[attempt] + HL_CPU_ACCESSIBLE_MEM_SIZE - 1;
		if (GAUDI_CPU_PCI_MSB_ADDR(dma_addr_arr[attempt]) ==
				GAUDI_CPU_PCI_MSB_ADDR(last_byte))
			break;
	}

	if (attempt == GAUDI_ALLOC_CPU_MEM_RETRY_CNT) {
		dev_err(hdev->dev,
			"MSB of CPU accessible DMA memory are not identical in all range\n");
		rc = -EFAULT;
		goto free_rejected;
	}

	hdev->cpu_accessible_dma_mem = virt_addr_arr[attempt];
	hdev->cpu_accessible_dma_address = dma_addr_arr[attempt];
	hdev->cpu_pci_msb_addr =
		GAUDI_CPU_PCI_MSB_ADDR(hdev->cpu_accessible_dma_address);

	if (!hdev->asic_prop.fw_security_enabled)
		GAUDI_PCI_TO_CPU_ADDR(hdev->cpu_accessible_dma_address);

	/* Reached on success too: only buffers before 'attempt' are released */
free_rejected:
	for (k = 0; k < attempt; k++)
		hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE,
						virt_addr_arr[k], dma_addr_arr[k]);

	return rc;
}
/*
 * gaudi_free_internal_qmans_pq_mem() - Free the PQ memory of internal QMANs.
 * @hdev: Pointer to hl_device structure.
 *
 * Walks all queue ids and frees the PQ buffer of every entry that has one.
 */
static void gaudi_free_internal_qmans_pq_mem(struct hl_device *hdev)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct gaudi_internal_qman_info *qman;
	u32 qid;

	for (qid = 0; qid < GAUDI_QUEUE_ID_SIZE; qid++) {
		qman = &gaudi->internal_qmans[qid];

		if (qman->pq_kernel_addr) {
			hl_asic_dma_free_coherent(hdev, qman->pq_size,
					qman->pq_kernel_addr, qman->pq_dma_addr);
		}
	}
}
/*
 * gaudi_alloc_internal_qmans_pq_mem() - Allocate PQ memory for internal QMANs.
 * @hdev: Pointer to hl_device structure.
 *
 * For every queue of type QUEUE_TYPE_INT, selects the PQ size by the engine
 * family the queue id belongs to (DMA 2-7, MME, TPC or NIC) and allocates a
 * zeroed DMA coherent buffer of that size. On any failure, everything that
 * was allocated so far is freed.
 *
 * Return: 0 for success, -EINVAL for an internal queue id outside the known
 * ranges, -ENOMEM if an allocation failed.
 */
static int gaudi_alloc_internal_qmans_pq_mem(struct hl_device *hdev)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	struct gaudi_internal_qman_info *q;
	int rc, i;

	for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) {
		if (gaudi_queue_type[i] != QUEUE_TYPE_INT)
			continue;

		q = &gaudi->internal_qmans[i];

		switch (i) {
		case GAUDI_QUEUE_ID_DMA_2_0 ... GAUDI_QUEUE_ID_DMA_7_3:
			q->pq_size = HBM_DMA_QMAN_SIZE_IN_BYTES;
			break;
		case GAUDI_QUEUE_ID_MME_0_0 ... GAUDI_QUEUE_ID_MME_1_3:
			q->pq_size = MME_QMAN_SIZE_IN_BYTES;
			break;
		case GAUDI_QUEUE_ID_TPC_0_0 ... GAUDI_QUEUE_ID_TPC_7_3:
			q->pq_size = TPC_QMAN_SIZE_IN_BYTES;
			break;
		case GAUDI_QUEUE_ID_NIC_0_0 ... GAUDI_QUEUE_ID_NIC_9_3:
			q->pq_size = NIC_QMAN_SIZE_IN_BYTES;
			break;
		default:
			dev_err(hdev->dev, "Bad internal queue index %d\n", i);
			rc = -EINVAL;
			goto free_internal_qmans_pq_mem;
		}

		q->pq_kernel_addr = hl_asic_dma_alloc_coherent(hdev, q->pq_size, &q->pq_dma_addr,
								GFP_KERNEL | __GFP_ZERO);
		if (!q->pq_kernel_addr) {
			rc = -ENOMEM;
			goto free_internal_qmans_pq_mem;
		}
	}

	return 0;

free_internal_qmans_pq_mem:
	/* The free routine skips entries that were never allocated */
	gaudi_free_internal_qmans_pq_mem(hdev);
	return rc;
}
static void gaudi_set_pci_memory_regions(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct pci_mem_region *region;
/* CFG */
region = &hdev->pci_mem_region[PCI_REGION_CFG];
region->region_base = CFG_BASE;
region->region_size = CFG_SIZE;
region->offset_in_bar = CFG_BASE - SPI_FLASH_BASE_ADDR;
region->bar_size = CFG_BAR_SIZE;
region->bar_id = CFG_BAR_ID;
region->used = 1;
/* SRAM */
region = &hdev->pci_mem_region[PCI_REGION_SRAM];
region->region_base = SRAM_BASE_ADDR;
region->region_size = SRAM_SIZE;
region->offset_in_bar = 0;
region->bar_size = SRAM_BAR_SIZE;
region->bar_id = SRAM_BAR_ID;
region->used = 1;
/* DRAM */
region = &hdev->pci_mem_region[PCI_REGION_DRAM];
region->region_base = DRAM_PHYS_BASE;
region->region_size = hdev->asic_prop.dram_size;
region->offset_in_bar = 0;
region->bar_size = prop->dram_pci_bar_size;
region->bar_id = HBM_BAR_ID;
region->used = 1;
/* SP SRAM */
region = &hdev->pci_mem_region[PCI_REGION_SP_SRAM];
region->region_base = PSOC_SCRATCHPAD_ADDR;
region->region_size = PSOC_SCRATCHPAD_SIZE;
region->offset_in_bar = PSOC_SCRATCHPAD_ADDR - SPI_FLASH_BASE_ADDR;
region->bar_size = CFG_BAR_SIZE;
region->bar_id = CFG_BAR_ID;
region->used = 1;
}
/*
 * gaudi_sw_init() - Allocate and initialize the S/W state of the device.
 * @hdev: Pointer to hl_device structure.
 *
 * Allocates the gaudi device structure and fills its events array from the
 * valid entries of the IRQ map table, creates the small-allocations DMA pool,
 * allocates the CPU accessible memory and wraps it in a gen pool, allocates
 * the internal QMANs PQ memory, and sets the feature flags, PCI memory
 * regions and stream master queue ids. On failure everything that was set up
 * so far is released in reverse order.
 *
 * Return: 0 for success, negative value for error.
 */
static int gaudi_sw_init(struct hl_device *hdev)
{
	struct gaudi_device *gaudi;
	u32 entry, nr_events = 0;
	int rc;

	/* Allocate device structure */
	gaudi = kzalloc(sizeof(*gaudi), GFP_KERNEL);
	if (!gaudi)
		return -ENOMEM;

	/* Collect the fc_id of every valid IRQ map entry */
	for (entry = 0; entry < ARRAY_SIZE(gaudi_irq_map_table); entry++) {
		if (!gaudi_irq_map_table[entry].valid)
			continue;

		if (nr_events == GAUDI_EVENT_SIZE) {
			dev_err(hdev->dev,
				"Event array exceeds the limit of %u events\n",
				GAUDI_EVENT_SIZE);
			rc = -EINVAL;
			goto err_free_gaudi;
		}

		gaudi->events[nr_events++] = gaudi_irq_map_table[entry].fc_id;
	}

	gaudi->cpucp_info_get = gaudi_cpucp_info_get;

	hdev->asic_specific = gaudi;

	/* Create DMA pool for small allocations */
	hdev->dma_pool = dma_pool_create(dev_name(hdev->dev),
			&hdev->pdev->dev, GAUDI_DMA_POOL_BLK_SIZE, 8, 0);
	if (!hdev->dma_pool) {
		dev_err(hdev->dev, "failed to create DMA pool\n");
		rc = -ENOMEM;
		goto err_free_gaudi;
	}

	rc = gaudi_alloc_cpu_accessible_dma_mem(hdev);
	if (rc)
		goto err_destroy_dma_pool;

	hdev->cpu_accessible_dma_pool = gen_pool_create(ilog2(32), -1);
	if (!hdev->cpu_accessible_dma_pool) {
		dev_err(hdev->dev,
			"Failed to create CPU accessible DMA pool\n");
		rc = -ENOMEM;
		goto err_free_cpu_mem;
	}

	rc = gen_pool_add(hdev->cpu_accessible_dma_pool,
				(uintptr_t) hdev->cpu_accessible_dma_mem,
				HL_CPU_ACCESSIBLE_MEM_SIZE, -1);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to add memory to CPU accessible DMA pool\n");
		rc = -EFAULT;
		goto err_destroy_cpu_pool;
	}

	rc = gaudi_alloc_internal_qmans_pq_mem(hdev);
	if (rc)
		goto err_destroy_cpu_pool;

	spin_lock_init(&gaudi->hw_queues_lock);

	hdev->supports_sync_stream = true;
	hdev->supports_coresight = true;
	hdev->supports_staged_submission = true;
	hdev->supports_wait_for_multi_cs = true;

	hdev->asic_funcs->set_pci_memory_regions(hdev);
	hdev->stream_master_qid_arr =
				hdev->asic_funcs->get_stream_master_qid_arr();
	hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;

	return 0;

err_destroy_cpu_pool:
	gen_pool_destroy(hdev->cpu_accessible_dma_pool);
err_free_cpu_mem:
	/* Convert the address back before it is handed to the free routine */
	if (!hdev->asic_prop.fw_security_enabled)
		GAUDI_CPU_TO_PCI_ADDR(hdev->cpu_accessible_dma_address,
					hdev->cpu_pci_msb_addr);
	hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE,
					hdev->cpu_accessible_dma_mem,
					hdev->cpu_accessible_dma_address);
err_destroy_dma_pool:
	dma_pool_destroy(hdev->dma_pool);
err_free_gaudi:
	kfree(gaudi);
	return rc;
}
/*
 * gaudi_sw_fini() - Release the S/W state of the device.
 * @hdev: Pointer to hl_device structure.
 *
 * Frees the internal QMANs PQ memory, the CPU accessible gen pool and its
 * backing DMA memory, the small-allocations DMA pool and finally the gaudi
 * device structure.
 *
 * Return: always 0.
 */
static int gaudi_sw_fini(struct hl_device *hdev)
{
	struct gaudi_device *gaudi = hdev->asic_specific;

	gaudi_free_internal_qmans_pq_mem(hdev);

	gen_pool_destroy(hdev->cpu_accessible_dma_pool);

	/* Convert the address back before it is handed to the free routine */
	if (!hdev->asic_prop.fw_security_enabled) {
		GAUDI_CPU_TO_PCI_ADDR(hdev->cpu_accessible_dma_address,
					hdev->cpu_pci_msb_addr);
	}

	hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE,
					hdev->cpu_accessible_dma_mem,
					hdev->cpu_accessible_dma_address);

	dma_pool_destroy(hdev->dma_pool);

	kfree(gaudi);

	return 0;
}
/*
 * gaudi_irq_handler_single() - Handler for the single MSI IRQ mode.
 * @irq: IRQ number.
 * @arg: Pointer to the hl_device structure.
 *
 * Unless the device is disabled, runs the CQ handler for every completion
 * queue and then the EQ handler.
 *
 * Return: always IRQ_HANDLED.
 */
static irqreturn_t gaudi_irq_handler_single(int irq, void *arg)
{
	struct hl_device *hdev = arg;
	int cq;

	if (!hdev->disabled) {
		for (cq = 0; cq < hdev->asic_prop.completion_queues_count; cq++)
			hl_irq_handler_cq(irq, &hdev->completion_queue[cq]);

		hl_irq_handler_eq(irq, &hdev->event_queue);
	}

	return IRQ_HANDLED;
}
/*
 * gaudi_pci_irq_vector() - Translate a logical interrupt index to a Linux IRQ.
 * @hdev: Pointer to hl_device structure.
 * @nr: Logical interrupt index.
 * @cpu_eq: True when the interrupt is the CPU event queue one.
 *
 * For backward compatibility, new MSI interrupts should be set after the
 * existing CPU and NIC interrupts. Hence an index below the event queue
 * index, or the CPU EQ itself, maps 1:1, while any other index is shifted
 * past the NIC interrupts.
 *
 * Return: the value returned by pci_irq_vector() for the chosen MSI vector.
 */
static int gaudi_pci_irq_vector(struct hl_device *hdev, unsigned int nr,
				bool cpu_eq)
{
	int msi_vec;

	if (cpu_eq && (nr != GAUDI_EVENT_QUEUE_MSI_IDX))
		dev_crit(hdev->dev, "CPU EQ must use IRQ %d\n",
				GAUDI_EVENT_QUEUE_MSI_IDX);

	if (cpu_eq || (nr < GAUDI_EVENT_QUEUE_MSI_IDX))
		msi_vec = nr;
	else
		msi_vec = nr + NIC_NUMBER_OF_ENGINES + 1;

	return pci_irq_vector(hdev->pdev, msi_vec);
}
/*
 * gaudi_enable_msi_single() - Request the IRQ used in single MSI mode.
 * @hdev: Pointer to hl_device structure.
 *
 * Return: the return value of request_irq().
 */
static int gaudi_enable_msi_single(struct hl_device *hdev)
{
	int irq, rc;

	dev_dbg(hdev->dev, "Working in single MSI IRQ mode\n");

	irq = gaudi_pci_irq_vector(hdev, 0, false);

	rc = request_irq(irq, gaudi_irq_handler_single, 0,
				"gaudi single msi", hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to request single MSI IRQ\n");
	}

	return rc;
}
/*
 * gaudi_enable_msi() - Enable MSI with a single vector.
 * @hdev: Pointer to hl_device structure.
 *
 * Does nothing if MSI is already marked as initialized. Otherwise allocates
 * exactly one MSI vector and requests its IRQ; the vector is released again
 * if the IRQ request fails.
 *
 * Return: 0 for success, negative value for error.
 */
static int gaudi_enable_msi(struct hl_device *hdev)
{
	struct gaudi_device *gaudi = hdev->asic_specific;
	int rc;

	if (gaudi->hw_cap_initialized & HW_CAP_MSI)
		return 0;

	rc = pci_alloc_irq_vectors(hdev->pdev, 1, 1, PCI_IRQ_MSI);
	if (rc < 0) {
		dev_err(hdev->dev, "MSI: Failed to enable support %d\n", rc);
		return rc;
	}

	rc = gaudi_enable_msi_single(hdev);
	if (rc) {
		pci_free_irq_vectors(hdev->pdev);
		return rc;
	}

	gaudi->hw_cap_initialized |= HW_CAP_MSI;

	return 0;
}
/*
 * gaudi_sync_irqs() - Wait for in-flight interrupt handlers to finish.
 * @hdev: Pointer to hl_device structure.
 *
 * Nothing is done when MSI is not marked as initialized.
 */
static void gaudi_sync_irqs(struct hl_device *hdev)
{
	struct gaudi_device *gaudi = hdev->asic_specific;

	/* Wait for all pending IRQs to be finished */
	if (gaudi->hw_cap_initialized & HW_CAP_MSI)
		synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false));
}
static void gaudi_disable_msi(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
if (!(gaudi->hw_cap_initialized & HW_CAP_MSI))
return ;
gaudi_sync_irqs(hdev);
free_irq(gaudi_pci_irq_vector(hdev, 0, false ), hdev);
pci_free_irq_vectors(hdev->pdev);
gaudi->hw_cap_initialized &= ~HW_CAP_MSI;
}
static void gaudi_init_scrambler_sram(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
if (hdev->asic_prop.fw_security_enabled)
return ;
if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
CPU_BOOT_DEV_STS0_SRAM_SCR_EN)
return ;
if (gaudi->hw_cap_initialized & HW_CAP_SRAM_SCRAMBLER)
return ;
WREG32(mmNIF_RTR_CTRL_0_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_1_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_2_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_3_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_4_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_5_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_6_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmNIF_RTR_CTRL_7_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmSIF_RTR_CTRL_0_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmSIF_RTR_CTRL_1_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmSIF_RTR_CTRL_2_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
WREG32(mmSIF_RTR_CTRL_3_SCRAM_SRAM_EN,
1 << IF_RTR_CTRL_SCRAM_SRAM_EN_VAL_SHIFT);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=92 H=86 G=88
¤ Dauer der Verarbeitung: 0.22 Sekunden
¤
*© Formatika GbR, Deutschland