162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci
362306a36Sopenharmony_ci/*
462306a36Sopenharmony_ci * Copyright 2016-2022 HabanaLabs, Ltd.
562306a36Sopenharmony_ci * All Rights Reserved.
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci#define pr_fmt(fmt)			"habanalabs: " fmt
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <uapi/drm/habanalabs_accel.h>
1162306a36Sopenharmony_ci#include "habanalabs.h"
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci#include <linux/pci.h>
1462306a36Sopenharmony_ci#include <linux/hwmon.h>
1562306a36Sopenharmony_ci#include <linux/vmalloc.h>
1662306a36Sopenharmony_ci
1762306a36Sopenharmony_ci#include <trace/events/habanalabs.h>
1862306a36Sopenharmony_ci
/* Reset delay, expressed in microseconds */
#define HL_RESET_DELAY_USEC			10000	/* 10ms */

/* Device-release watchdog timeout, expressed in seconds */
#define HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC	5

/* Selects which pair of ASIC DMA callbacks the common alloc/free helpers dispatch to */
enum dma_alloc_type {
	DMA_ALLOC_COHERENT = 0,	/* asic_dma_alloc_coherent() / asic_dma_free_coherent() */
	DMA_ALLOC_POOL = 1,	/* asic_dma_pool_zalloc() / asic_dma_pool_free() */
};

/* 64-bit default scrub pattern - NOTE(review): its users are not visible in this part of the file */
#define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_ci/*
3162306a36Sopenharmony_ci * hl_set_dram_bar- sets the bar to allow later access to address
3262306a36Sopenharmony_ci *
3362306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure.
3462306a36Sopenharmony_ci * @addr: the address the caller wants to access.
3562306a36Sopenharmony_ci * @region: the PCI region.
3662306a36Sopenharmony_ci * @new_bar_region_base: the new BAR region base address.
3762306a36Sopenharmony_ci *
3862306a36Sopenharmony_ci * @return: the old BAR base address on success, U64_MAX for failure.
3962306a36Sopenharmony_ci *	    The caller should set it back to the old address after use.
4062306a36Sopenharmony_ci *
4162306a36Sopenharmony_ci * In case the bar space does not cover the whole address space,
4262306a36Sopenharmony_ci * the bar base address should be set to allow access to a given address.
4362306a36Sopenharmony_ci * This function can be called also if the bar doesn't need to be set,
4462306a36Sopenharmony_ci * in that case it just won't change the base.
4562306a36Sopenharmony_ci */
4662306a36Sopenharmony_cistatic u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region,
4762306a36Sopenharmony_ci				u64 *new_bar_region_base)
4862306a36Sopenharmony_ci{
4962306a36Sopenharmony_ci	struct asic_fixed_properties *prop = &hdev->asic_prop;
5062306a36Sopenharmony_ci	u64 bar_base_addr, old_base;
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci	if (is_power_of_2(prop->dram_pci_bar_size))
5362306a36Sopenharmony_ci		bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
5462306a36Sopenharmony_ci	else
5562306a36Sopenharmony_ci		bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
5662306a36Sopenharmony_ci				prop->dram_pci_bar_size;
5762306a36Sopenharmony_ci
5862306a36Sopenharmony_ci	old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
5962306a36Sopenharmony_ci
6062306a36Sopenharmony_ci	/* in case of success we need to update the new BAR base */
6162306a36Sopenharmony_ci	if ((old_base != U64_MAX) && new_bar_region_base)
6262306a36Sopenharmony_ci		*new_bar_region_base = bar_base_addr;
6362306a36Sopenharmony_ci
6462306a36Sopenharmony_ci	return old_base;
6562306a36Sopenharmony_ci}
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ciint hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val,
6862306a36Sopenharmony_ci	enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar)
6962306a36Sopenharmony_ci{
7062306a36Sopenharmony_ci	struct pci_mem_region *region = &hdev->pci_mem_region[region_type];
7162306a36Sopenharmony_ci	u64 old_base = 0, rc, bar_region_base = region->region_base;
7262306a36Sopenharmony_ci	void __iomem *acc_addr;
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci	if (set_dram_bar) {
7562306a36Sopenharmony_ci		old_base = hl_set_dram_bar(hdev, addr, region, &bar_region_base);
7662306a36Sopenharmony_ci		if (old_base == U64_MAX)
7762306a36Sopenharmony_ci			return -EIO;
7862306a36Sopenharmony_ci	}
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_ci	acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
8162306a36Sopenharmony_ci			(addr - bar_region_base);
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci	switch (acc_type) {
8462306a36Sopenharmony_ci	case DEBUGFS_READ8:
8562306a36Sopenharmony_ci		*val = readb(acc_addr);
8662306a36Sopenharmony_ci		break;
8762306a36Sopenharmony_ci	case DEBUGFS_WRITE8:
8862306a36Sopenharmony_ci		writeb(*val, acc_addr);
8962306a36Sopenharmony_ci		break;
9062306a36Sopenharmony_ci	case DEBUGFS_READ32:
9162306a36Sopenharmony_ci		*val = readl(acc_addr);
9262306a36Sopenharmony_ci		break;
9362306a36Sopenharmony_ci	case DEBUGFS_WRITE32:
9462306a36Sopenharmony_ci		writel(*val, acc_addr);
9562306a36Sopenharmony_ci		break;
9662306a36Sopenharmony_ci	case DEBUGFS_READ64:
9762306a36Sopenharmony_ci		*val = readq(acc_addr);
9862306a36Sopenharmony_ci		break;
9962306a36Sopenharmony_ci	case DEBUGFS_WRITE64:
10062306a36Sopenharmony_ci		writeq(*val, acc_addr);
10162306a36Sopenharmony_ci		break;
10262306a36Sopenharmony_ci	}
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci	if (set_dram_bar) {
10562306a36Sopenharmony_ci		rc = hl_set_dram_bar(hdev, old_base, region, NULL);
10662306a36Sopenharmony_ci		if (rc == U64_MAX)
10762306a36Sopenharmony_ci			return -EIO;
10862306a36Sopenharmony_ci	}
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	return 0;
11162306a36Sopenharmony_ci}
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_cistatic void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle,
11462306a36Sopenharmony_ci					gfp_t flag, enum dma_alloc_type alloc_type,
11562306a36Sopenharmony_ci					const char *caller)
11662306a36Sopenharmony_ci{
11762306a36Sopenharmony_ci	void *ptr = NULL;
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci	switch (alloc_type) {
12062306a36Sopenharmony_ci	case DMA_ALLOC_COHERENT:
12162306a36Sopenharmony_ci		ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
12262306a36Sopenharmony_ci		break;
12362306a36Sopenharmony_ci	case DMA_ALLOC_POOL:
12462306a36Sopenharmony_ci		ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
12562306a36Sopenharmony_ci		break;
12662306a36Sopenharmony_ci	}
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci	if (trace_habanalabs_dma_alloc_enabled() && !ZERO_OR_NULL_PTR(ptr))
12962306a36Sopenharmony_ci		trace_habanalabs_dma_alloc(hdev->dev, (u64) (uintptr_t) ptr, *dma_handle, size,
13062306a36Sopenharmony_ci						caller);
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci	return ptr;
13362306a36Sopenharmony_ci}
13462306a36Sopenharmony_ci
13562306a36Sopenharmony_cistatic void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *cpu_addr,
13662306a36Sopenharmony_ci					dma_addr_t dma_handle, enum dma_alloc_type alloc_type,
13762306a36Sopenharmony_ci					const char *caller)
13862306a36Sopenharmony_ci{
13962306a36Sopenharmony_ci	/* this is needed to avoid warning on using freed pointer */
14062306a36Sopenharmony_ci	u64 store_cpu_addr = (u64) (uintptr_t) cpu_addr;
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci	switch (alloc_type) {
14362306a36Sopenharmony_ci	case DMA_ALLOC_COHERENT:
14462306a36Sopenharmony_ci		hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
14562306a36Sopenharmony_ci		break;
14662306a36Sopenharmony_ci	case DMA_ALLOC_POOL:
14762306a36Sopenharmony_ci		hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
14862306a36Sopenharmony_ci		break;
14962306a36Sopenharmony_ci	}
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	trace_habanalabs_dma_free(hdev->dev, store_cpu_addr, dma_handle, size, caller);
15262306a36Sopenharmony_ci}
15362306a36Sopenharmony_ci
15462306a36Sopenharmony_civoid *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle,
15562306a36Sopenharmony_ci					gfp_t flag, const char *caller)
15662306a36Sopenharmony_ci{
15762306a36Sopenharmony_ci	return hl_dma_alloc_common(hdev, size, dma_handle, flag, DMA_ALLOC_COHERENT, caller);
15862306a36Sopenharmony_ci}
15962306a36Sopenharmony_ci
16062306a36Sopenharmony_civoid hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr,
16162306a36Sopenharmony_ci					dma_addr_t dma_handle, const char *caller)
16262306a36Sopenharmony_ci{
16362306a36Sopenharmony_ci	hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller);
16462306a36Sopenharmony_ci}
16562306a36Sopenharmony_ci
16662306a36Sopenharmony_civoid *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags,
16762306a36Sopenharmony_ci					dma_addr_t *dma_handle, const char *caller)
16862306a36Sopenharmony_ci{
16962306a36Sopenharmony_ci	return hl_dma_alloc_common(hdev, size, dma_handle, mem_flags, DMA_ALLOC_POOL, caller);
17062306a36Sopenharmony_ci}
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_civoid hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr,
17362306a36Sopenharmony_ci					const char *caller)
17462306a36Sopenharmony_ci{
17562306a36Sopenharmony_ci	hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller);
17662306a36Sopenharmony_ci}
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_civoid *hl_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle)
17962306a36Sopenharmony_ci{
18062306a36Sopenharmony_ci	return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
18162306a36Sopenharmony_ci}
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_civoid hl_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr)
18462306a36Sopenharmony_ci{
18562306a36Sopenharmony_ci	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
18662306a36Sopenharmony_ci}
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ciint hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
18962306a36Sopenharmony_ci{
19062306a36Sopenharmony_ci	struct asic_fixed_properties *prop = &hdev->asic_prop;
19162306a36Sopenharmony_ci	struct scatterlist *sg;
19262306a36Sopenharmony_ci	int rc, i;
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_ci	rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0);
19562306a36Sopenharmony_ci	if (rc)
19662306a36Sopenharmony_ci		return rc;
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci	/* Shift to the device's base physical address of host memory if necessary */
19962306a36Sopenharmony_ci	if (prop->device_dma_offset_for_host_access)
20062306a36Sopenharmony_ci		for_each_sgtable_dma_sg(sgt, sg, i)
20162306a36Sopenharmony_ci			sg->dma_address += prop->device_dma_offset_for_host_access;
20262306a36Sopenharmony_ci
20362306a36Sopenharmony_ci	return 0;
20462306a36Sopenharmony_ci}
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_civoid hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
20762306a36Sopenharmony_ci{
20862306a36Sopenharmony_ci	struct asic_fixed_properties *prop = &hdev->asic_prop;
20962306a36Sopenharmony_ci	struct scatterlist *sg;
21062306a36Sopenharmony_ci	int i;
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	/* Cancel the device's base physical address of host memory if necessary */
21362306a36Sopenharmony_ci	if (prop->device_dma_offset_for_host_access)
21462306a36Sopenharmony_ci		for_each_sgtable_dma_sg(sgt, sg, i)
21562306a36Sopenharmony_ci			sg->dma_address -= prop->device_dma_offset_for_host_access;
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0);
21862306a36Sopenharmony_ci}
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci/*
22162306a36Sopenharmony_ci * hl_access_cfg_region - access the config region
22262306a36Sopenharmony_ci *
22362306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
22462306a36Sopenharmony_ci * @addr: the address to access
22562306a36Sopenharmony_ci * @val: the value to write from or read to
22662306a36Sopenharmony_ci * @acc_type: the type of access (read/write 64/32)
22762306a36Sopenharmony_ci */
22862306a36Sopenharmony_ciint hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val,
22962306a36Sopenharmony_ci	enum debugfs_access_type acc_type)
23062306a36Sopenharmony_ci{
23162306a36Sopenharmony_ci	struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
23262306a36Sopenharmony_ci	u32 val_h, val_l;
23362306a36Sopenharmony_ci
23462306a36Sopenharmony_ci	if (!IS_ALIGNED(addr, sizeof(u32))) {
23562306a36Sopenharmony_ci		dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32));
23662306a36Sopenharmony_ci		return -EINVAL;
23762306a36Sopenharmony_ci	}
23862306a36Sopenharmony_ci
23962306a36Sopenharmony_ci	switch (acc_type) {
24062306a36Sopenharmony_ci	case DEBUGFS_READ32:
24162306a36Sopenharmony_ci		*val = RREG32(addr - cfg_region->region_base);
24262306a36Sopenharmony_ci		break;
24362306a36Sopenharmony_ci	case DEBUGFS_WRITE32:
24462306a36Sopenharmony_ci		WREG32(addr - cfg_region->region_base, *val);
24562306a36Sopenharmony_ci		break;
24662306a36Sopenharmony_ci	case DEBUGFS_READ64:
24762306a36Sopenharmony_ci		val_l = RREG32(addr - cfg_region->region_base);
24862306a36Sopenharmony_ci		val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base);
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci		*val = (((u64) val_h) << 32) | val_l;
25162306a36Sopenharmony_ci		break;
25262306a36Sopenharmony_ci	case DEBUGFS_WRITE64:
25362306a36Sopenharmony_ci		WREG32(addr - cfg_region->region_base, lower_32_bits(*val));
25462306a36Sopenharmony_ci		WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val));
25562306a36Sopenharmony_ci		break;
25662306a36Sopenharmony_ci	default:
25762306a36Sopenharmony_ci		dev_err(hdev->dev, "access type %d is not supported\n", acc_type);
25862306a36Sopenharmony_ci		return -EOPNOTSUPP;
25962306a36Sopenharmony_ci	}
26062306a36Sopenharmony_ci
26162306a36Sopenharmony_ci	return 0;
26262306a36Sopenharmony_ci}
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci/*
26562306a36Sopenharmony_ci * hl_access_dev_mem - access device memory
26662306a36Sopenharmony_ci *
26762306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
26862306a36Sopenharmony_ci * @region_type: the type of the region the address belongs to
26962306a36Sopenharmony_ci * @addr: the address to access
27062306a36Sopenharmony_ci * @val: the value to write from or read to
27162306a36Sopenharmony_ci * @acc_type: the type of access (r/w, 32/64)
27262306a36Sopenharmony_ci */
27362306a36Sopenharmony_ciint hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type,
27462306a36Sopenharmony_ci			u64 addr, u64 *val, enum debugfs_access_type acc_type)
27562306a36Sopenharmony_ci{
27662306a36Sopenharmony_ci	switch (region_type) {
27762306a36Sopenharmony_ci	case PCI_REGION_CFG:
27862306a36Sopenharmony_ci		return hl_access_cfg_region(hdev, addr, val, acc_type);
27962306a36Sopenharmony_ci	case PCI_REGION_SRAM:
28062306a36Sopenharmony_ci	case PCI_REGION_DRAM:
28162306a36Sopenharmony_ci		return hl_access_sram_dram_region(hdev, addr, val, acc_type,
28262306a36Sopenharmony_ci				region_type, (region_type == PCI_REGION_DRAM));
28362306a36Sopenharmony_ci	default:
28462306a36Sopenharmony_ci		return -EFAULT;
28562306a36Sopenharmony_ci	}
28662306a36Sopenharmony_ci
28762306a36Sopenharmony_ci	return 0;
28862306a36Sopenharmony_ci}
28962306a36Sopenharmony_ci
29062306a36Sopenharmony_civoid hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...)
29162306a36Sopenharmony_ci{
29262306a36Sopenharmony_ci	va_list args;
29362306a36Sopenharmony_ci	int str_size;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	va_start(args, fmt);
29662306a36Sopenharmony_ci	/* Calculate formatted string length. Assuming each string is null terminated, hence
29762306a36Sopenharmony_ci	 * increment result by 1
29862306a36Sopenharmony_ci	 */
29962306a36Sopenharmony_ci	str_size = vsnprintf(NULL, 0, fmt, args) + 1;
30062306a36Sopenharmony_ci	va_end(args);
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	if ((e->actual_size + str_size) < e->allocated_buf_size) {
30362306a36Sopenharmony_ci		va_start(args, fmt);
30462306a36Sopenharmony_ci		vsnprintf(e->buf + e->actual_size, str_size, fmt, args);
30562306a36Sopenharmony_ci		va_end(args);
30662306a36Sopenharmony_ci	}
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci	/* Need to update the size even when not updating destination buffer to get the exact size
30962306a36Sopenharmony_ci	 * of all input strings
31062306a36Sopenharmony_ci	 */
31162306a36Sopenharmony_ci	e->actual_size += str_size;
31262306a36Sopenharmony_ci}
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_cienum hl_device_status hl_device_status(struct hl_device *hdev)
31562306a36Sopenharmony_ci{
31662306a36Sopenharmony_ci	enum hl_device_status status;
31762306a36Sopenharmony_ci
31862306a36Sopenharmony_ci	if (hdev->reset_info.in_reset) {
31962306a36Sopenharmony_ci		if (hdev->reset_info.in_compute_reset)
32062306a36Sopenharmony_ci			status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE;
32162306a36Sopenharmony_ci		else
32262306a36Sopenharmony_ci			status = HL_DEVICE_STATUS_IN_RESET;
32362306a36Sopenharmony_ci	} else if (hdev->reset_info.needs_reset) {
32462306a36Sopenharmony_ci		status = HL_DEVICE_STATUS_NEEDS_RESET;
32562306a36Sopenharmony_ci	} else if (hdev->disabled) {
32662306a36Sopenharmony_ci		status = HL_DEVICE_STATUS_MALFUNCTION;
32762306a36Sopenharmony_ci	} else if (!hdev->init_done) {
32862306a36Sopenharmony_ci		status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
32962306a36Sopenharmony_ci	} else {
33062306a36Sopenharmony_ci		status = HL_DEVICE_STATUS_OPERATIONAL;
33162306a36Sopenharmony_ci	}
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_ci	return status;
33462306a36Sopenharmony_ci}
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_cibool hl_device_operational(struct hl_device *hdev,
33762306a36Sopenharmony_ci		enum hl_device_status *status)
33862306a36Sopenharmony_ci{
33962306a36Sopenharmony_ci	enum hl_device_status current_status;
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	current_status = hl_device_status(hdev);
34262306a36Sopenharmony_ci	if (status)
34362306a36Sopenharmony_ci		*status = current_status;
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci	switch (current_status) {
34662306a36Sopenharmony_ci	case HL_DEVICE_STATUS_IN_RESET:
34762306a36Sopenharmony_ci	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
34862306a36Sopenharmony_ci	case HL_DEVICE_STATUS_MALFUNCTION:
34962306a36Sopenharmony_ci	case HL_DEVICE_STATUS_NEEDS_RESET:
35062306a36Sopenharmony_ci		return false;
35162306a36Sopenharmony_ci	case HL_DEVICE_STATUS_OPERATIONAL:
35262306a36Sopenharmony_ci	case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
35362306a36Sopenharmony_ci	default:
35462306a36Sopenharmony_ci		return true;
35562306a36Sopenharmony_ci	}
35662306a36Sopenharmony_ci}
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_cibool hl_ctrl_device_operational(struct hl_device *hdev,
35962306a36Sopenharmony_ci		enum hl_device_status *status)
36062306a36Sopenharmony_ci{
36162306a36Sopenharmony_ci	enum hl_device_status current_status;
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci	current_status = hl_device_status(hdev);
36462306a36Sopenharmony_ci	if (status)
36562306a36Sopenharmony_ci		*status = current_status;
36662306a36Sopenharmony_ci
36762306a36Sopenharmony_ci	switch (current_status) {
36862306a36Sopenharmony_ci	case HL_DEVICE_STATUS_MALFUNCTION:
36962306a36Sopenharmony_ci		return false;
37062306a36Sopenharmony_ci	case HL_DEVICE_STATUS_IN_RESET:
37162306a36Sopenharmony_ci	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
37262306a36Sopenharmony_ci	case HL_DEVICE_STATUS_NEEDS_RESET:
37362306a36Sopenharmony_ci	case HL_DEVICE_STATUS_OPERATIONAL:
37462306a36Sopenharmony_ci	case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
37562306a36Sopenharmony_ci	default:
37662306a36Sopenharmony_ci		return true;
37762306a36Sopenharmony_ci	}
37862306a36Sopenharmony_ci}
37962306a36Sopenharmony_ci
38062306a36Sopenharmony_cistatic void print_idle_status_mask(struct hl_device *hdev, const char *message,
38162306a36Sopenharmony_ci					u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
38262306a36Sopenharmony_ci{
38362306a36Sopenharmony_ci	if (idle_mask[3])
38462306a36Sopenharmony_ci		dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx_%016llx)\n",
38562306a36Sopenharmony_ci			message, idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
38662306a36Sopenharmony_ci	else if (idle_mask[2])
38762306a36Sopenharmony_ci		dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx)\n",
38862306a36Sopenharmony_ci			message, idle_mask[2], idle_mask[1], idle_mask[0]);
38962306a36Sopenharmony_ci	else if (idle_mask[1])
39062306a36Sopenharmony_ci		dev_err(hdev->dev, "%s (mask %#llx_%016llx)\n",
39162306a36Sopenharmony_ci			message, idle_mask[1], idle_mask[0]);
39262306a36Sopenharmony_ci	else
39362306a36Sopenharmony_ci		dev_err(hdev->dev, "%s (mask %#llx)\n", message, idle_mask[0]);
39462306a36Sopenharmony_ci}
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_cistatic void hpriv_release(struct kref *ref)
39762306a36Sopenharmony_ci{
39862306a36Sopenharmony_ci	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
39962306a36Sopenharmony_ci	bool reset_device, device_is_idle = true;
40062306a36Sopenharmony_ci	struct hl_fpriv *hpriv;
40162306a36Sopenharmony_ci	struct hl_device *hdev;
40262306a36Sopenharmony_ci
40362306a36Sopenharmony_ci	hpriv = container_of(ref, struct hl_fpriv, refcount);
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci	hdev = hpriv->hdev;
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci	hdev->asic_funcs->send_device_activity(hdev, false);
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	put_pid(hpriv->taskpid);
41062306a36Sopenharmony_ci
41162306a36Sopenharmony_ci	hl_debugfs_remove_file(hpriv);
41262306a36Sopenharmony_ci
41362306a36Sopenharmony_ci	mutex_destroy(&hpriv->ctx_lock);
41462306a36Sopenharmony_ci	mutex_destroy(&hpriv->restore_phase_mutex);
41562306a36Sopenharmony_ci
41662306a36Sopenharmony_ci	/* There should be no memory buffers at this point and handles IDR can be destroyed */
41762306a36Sopenharmony_ci	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci	/* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
42062306a36Sopenharmony_ci	 * reset that waits for device release.
42162306a36Sopenharmony_ci	 */
42262306a36Sopenharmony_ci	reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active;
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	/* Check the device idle status and reset if not idle.
42562306a36Sopenharmony_ci	 * Skip it if already in reset, or if device is going to be reset in any case.
42662306a36Sopenharmony_ci	 */
42762306a36Sopenharmony_ci	if (!hdev->reset_info.in_reset && !reset_device && hdev->pdev && !hdev->pldm)
42862306a36Sopenharmony_ci		device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
42962306a36Sopenharmony_ci							HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
43062306a36Sopenharmony_ci	if (!device_is_idle) {
43162306a36Sopenharmony_ci		print_idle_status_mask(hdev, "device is not idle after user context is closed",
43262306a36Sopenharmony_ci					idle_mask);
43362306a36Sopenharmony_ci		reset_device = true;
43462306a36Sopenharmony_ci	}
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci	/* We need to remove the user from the list to make sure the reset process won't
43762306a36Sopenharmony_ci	 * try to kill the user process. Because, if we got here, it means there are no
43862306a36Sopenharmony_ci	 * more driver/device resources that the user process is occupying so there is
43962306a36Sopenharmony_ci	 * no need to kill it
44062306a36Sopenharmony_ci	 *
44162306a36Sopenharmony_ci	 * However, we can't set the compute_ctx to NULL at this stage. This is to prevent
44262306a36Sopenharmony_ci	 * a race between the release and opening the device again. We don't want to let
44362306a36Sopenharmony_ci	 * a user open the device while there a reset is about to happen.
44462306a36Sopenharmony_ci	 */
44562306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
44662306a36Sopenharmony_ci	list_del(&hpriv->dev_node);
44762306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
44862306a36Sopenharmony_ci
44962306a36Sopenharmony_ci	if (reset_device) {
45062306a36Sopenharmony_ci		hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE);
45162306a36Sopenharmony_ci	} else {
45262306a36Sopenharmony_ci		/* Scrubbing is handled within hl_device_reset(), so here need to do it directly */
45362306a36Sopenharmony_ci		int rc = hdev->asic_funcs->scrub_device_mem(hdev);
45462306a36Sopenharmony_ci
45562306a36Sopenharmony_ci		if (rc)
45662306a36Sopenharmony_ci			dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc);
45762306a36Sopenharmony_ci	}
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci	/* Now we can mark the compute_ctx as not active. Even if a reset is running in a different
46062306a36Sopenharmony_ci	 * thread, we don't care because the in_reset is marked so if a user will try to open
46162306a36Sopenharmony_ci	 * the device it will fail on that, even if compute_ctx is false.
46262306a36Sopenharmony_ci	 */
46362306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
46462306a36Sopenharmony_ci	hdev->is_compute_ctx_active = false;
46562306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
46662306a36Sopenharmony_ci
46762306a36Sopenharmony_ci	hdev->compute_ctx_in_release = 0;
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	/* release the eventfd */
47062306a36Sopenharmony_ci	if (hpriv->notifier_event.eventfd)
47162306a36Sopenharmony_ci		eventfd_ctx_put(hpriv->notifier_event.eventfd);
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci	mutex_destroy(&hpriv->notifier_event.lock);
47462306a36Sopenharmony_ci
47562306a36Sopenharmony_ci	kfree(hpriv);
47662306a36Sopenharmony_ci}
47762306a36Sopenharmony_ci
47862306a36Sopenharmony_civoid hl_hpriv_get(struct hl_fpriv *hpriv)
47962306a36Sopenharmony_ci{
48062306a36Sopenharmony_ci	kref_get(&hpriv->refcount);
48162306a36Sopenharmony_ci}
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ciint hl_hpriv_put(struct hl_fpriv *hpriv)
48462306a36Sopenharmony_ci{
48562306a36Sopenharmony_ci	return kref_put(&hpriv->refcount, hpriv_release);
48662306a36Sopenharmony_ci}
48762306a36Sopenharmony_ci
48862306a36Sopenharmony_cistatic void print_device_in_use_info(struct hl_device *hdev, const char *message)
48962306a36Sopenharmony_ci{
49062306a36Sopenharmony_ci	u32 active_cs_num, dmabuf_export_cnt;
49162306a36Sopenharmony_ci	bool unknown_reason = true;
49262306a36Sopenharmony_ci	char buf[128];
49362306a36Sopenharmony_ci	size_t size;
49462306a36Sopenharmony_ci	int offset;
49562306a36Sopenharmony_ci
49662306a36Sopenharmony_ci	size = sizeof(buf);
49762306a36Sopenharmony_ci	offset = 0;
49862306a36Sopenharmony_ci
49962306a36Sopenharmony_ci	active_cs_num = hl_get_active_cs_num(hdev);
50062306a36Sopenharmony_ci	if (active_cs_num) {
50162306a36Sopenharmony_ci		unknown_reason = false;
50262306a36Sopenharmony_ci		offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num);
50362306a36Sopenharmony_ci	}
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt);
50662306a36Sopenharmony_ci	if (dmabuf_export_cnt) {
50762306a36Sopenharmony_ci		unknown_reason = false;
50862306a36Sopenharmony_ci		offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]",
50962306a36Sopenharmony_ci					dmabuf_export_cnt);
51062306a36Sopenharmony_ci	}
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci	if (unknown_reason)
51362306a36Sopenharmony_ci		scnprintf(buf + offset, size - offset, " [unknown reason]");
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ci	dev_notice(hdev->dev, "%s%s\n", message, buf);
51662306a36Sopenharmony_ci}
51762306a36Sopenharmony_ci
51862306a36Sopenharmony_ci/*
51962306a36Sopenharmony_ci * hl_device_release - release function for habanalabs device
52062306a36Sopenharmony_ci *
52162306a36Sopenharmony_ci * @inode: pointer to inode structure
52262306a36Sopenharmony_ci * @filp: pointer to file structure
52362306a36Sopenharmony_ci *
52462306a36Sopenharmony_ci * Called when process closes an habanalabs device
52562306a36Sopenharmony_ci */
52662306a36Sopenharmony_cistatic int hl_device_release(struct inode *inode, struct file *filp)
52762306a36Sopenharmony_ci{
52862306a36Sopenharmony_ci	struct hl_fpriv *hpriv = filp->private_data;
52962306a36Sopenharmony_ci	struct hl_device *hdev = hpriv->hdev;
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci	filp->private_data = NULL;
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ci	if (!hdev) {
53462306a36Sopenharmony_ci		pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n");
53562306a36Sopenharmony_ci		put_pid(hpriv->taskpid);
53662306a36Sopenharmony_ci		return 0;
53762306a36Sopenharmony_ci	}
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_ci	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
54062306a36Sopenharmony_ci
54162306a36Sopenharmony_ci	/* Memory buffers might be still in use at this point and thus the handles IDR destruction
54262306a36Sopenharmony_ci	 * is postponed to hpriv_release().
54362306a36Sopenharmony_ci	 */
54462306a36Sopenharmony_ci	hl_mem_mgr_fini(&hpriv->mem_mgr);
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci	hdev->compute_ctx_in_release = 1;
54762306a36Sopenharmony_ci
54862306a36Sopenharmony_ci	if (!hl_hpriv_put(hpriv)) {
54962306a36Sopenharmony_ci		print_device_in_use_info(hdev, "User process closed FD but device still in use");
55062306a36Sopenharmony_ci		hl_device_reset(hdev, HL_DRV_RESET_HARD);
55162306a36Sopenharmony_ci	}
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci	hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
55462306a36Sopenharmony_ci
55562306a36Sopenharmony_ci	return 0;
55662306a36Sopenharmony_ci}
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_cistatic int hl_device_release_ctrl(struct inode *inode, struct file *filp)
55962306a36Sopenharmony_ci{
56062306a36Sopenharmony_ci	struct hl_fpriv *hpriv = filp->private_data;
56162306a36Sopenharmony_ci	struct hl_device *hdev = hpriv->hdev;
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_ci	filp->private_data = NULL;
56462306a36Sopenharmony_ci
56562306a36Sopenharmony_ci	if (!hdev) {
56662306a36Sopenharmony_ci		pr_err("Closing FD after device was removed\n");
56762306a36Sopenharmony_ci		goto out;
56862306a36Sopenharmony_ci	}
56962306a36Sopenharmony_ci
57062306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_ctrl_list_lock);
57162306a36Sopenharmony_ci	list_del(&hpriv->dev_node);
57262306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
57362306a36Sopenharmony_ciout:
57462306a36Sopenharmony_ci	/* release the eventfd */
57562306a36Sopenharmony_ci	if (hpriv->notifier_event.eventfd)
57662306a36Sopenharmony_ci		eventfd_ctx_put(hpriv->notifier_event.eventfd);
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci	mutex_destroy(&hpriv->notifier_event.lock);
57962306a36Sopenharmony_ci	put_pid(hpriv->taskpid);
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	kfree(hpriv);
58262306a36Sopenharmony_ci
58362306a36Sopenharmony_ci	return 0;
58462306a36Sopenharmony_ci}
58562306a36Sopenharmony_ci
58662306a36Sopenharmony_ci/*
58762306a36Sopenharmony_ci * hl_mmap - mmap function for habanalabs device
58862306a36Sopenharmony_ci *
58962306a36Sopenharmony_ci * @*filp: pointer to file structure
59062306a36Sopenharmony_ci * @*vma: pointer to vm_area_struct of the process
59162306a36Sopenharmony_ci *
59262306a36Sopenharmony_ci * Called when process does an mmap on habanalabs device. Call the relevant mmap
59362306a36Sopenharmony_ci * function at the end of the common code.
59462306a36Sopenharmony_ci */
59562306a36Sopenharmony_cistatic int hl_mmap(struct file *filp, struct vm_area_struct *vma)
59662306a36Sopenharmony_ci{
59762306a36Sopenharmony_ci	struct hl_fpriv *hpriv = filp->private_data;
59862306a36Sopenharmony_ci	struct hl_device *hdev = hpriv->hdev;
59962306a36Sopenharmony_ci	unsigned long vm_pgoff;
60062306a36Sopenharmony_ci
60162306a36Sopenharmony_ci	if (!hdev) {
60262306a36Sopenharmony_ci		pr_err_ratelimited("Trying to mmap after device was removed! Please close FD\n");
60362306a36Sopenharmony_ci		return -ENODEV;
60462306a36Sopenharmony_ci	}
60562306a36Sopenharmony_ci
60662306a36Sopenharmony_ci	vm_pgoff = vma->vm_pgoff;
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
60962306a36Sopenharmony_ci	case HL_MMAP_TYPE_BLOCK:
61062306a36Sopenharmony_ci		vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
61162306a36Sopenharmony_ci		return hl_hw_block_mmap(hpriv, vma);
61262306a36Sopenharmony_ci
61362306a36Sopenharmony_ci	case HL_MMAP_TYPE_CB:
61462306a36Sopenharmony_ci	case HL_MMAP_TYPE_TS_BUFF:
61562306a36Sopenharmony_ci		return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL);
61662306a36Sopenharmony_ci	}
61762306a36Sopenharmony_ci	return -EINVAL;
61862306a36Sopenharmony_ci}
61962306a36Sopenharmony_ci
62062306a36Sopenharmony_cistatic const struct file_operations hl_ops = {
62162306a36Sopenharmony_ci	.owner = THIS_MODULE,
62262306a36Sopenharmony_ci	.open = hl_device_open,
62362306a36Sopenharmony_ci	.release = hl_device_release,
62462306a36Sopenharmony_ci	.mmap = hl_mmap,
62562306a36Sopenharmony_ci	.unlocked_ioctl = hl_ioctl,
62662306a36Sopenharmony_ci	.compat_ioctl = hl_ioctl
62762306a36Sopenharmony_ci};
62862306a36Sopenharmony_ci
62962306a36Sopenharmony_cistatic const struct file_operations hl_ctrl_ops = {
63062306a36Sopenharmony_ci	.owner = THIS_MODULE,
63162306a36Sopenharmony_ci	.open = hl_device_open_ctrl,
63262306a36Sopenharmony_ci	.release = hl_device_release_ctrl,
63362306a36Sopenharmony_ci	.unlocked_ioctl = hl_ioctl_control,
63462306a36Sopenharmony_ci	.compat_ioctl = hl_ioctl_control
63562306a36Sopenharmony_ci};
63662306a36Sopenharmony_ci
/*
 * device_release_func - release callback installed by device_init_cdev()
 *
 * @dev: the device object to free
 *
 * The device object is allocated with kzalloc() in device_init_cdev(), so
 * releasing it is a plain kfree().
 */
static void device_release_func(struct device *dev)
{
	kfree(dev);
}
64162306a36Sopenharmony_ci
64262306a36Sopenharmony_ci/*
64362306a36Sopenharmony_ci * device_init_cdev - Initialize cdev and device for habanalabs device
64462306a36Sopenharmony_ci *
64562306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
64662306a36Sopenharmony_ci * @class: pointer to the class object of the device
64762306a36Sopenharmony_ci * @minor: minor number of the specific device
64862306a36Sopenharmony_ci * @fpos: file operations to install for this device
64962306a36Sopenharmony_ci * @name: name of the device as it will appear in the filesystem
65062306a36Sopenharmony_ci * @cdev: pointer to the char device object that will be initialized
65162306a36Sopenharmony_ci * @dev: pointer to the device object that will be initialized
65262306a36Sopenharmony_ci *
65362306a36Sopenharmony_ci * Initialize a cdev and a Linux device for habanalabs's device.
65462306a36Sopenharmony_ci */
65562306a36Sopenharmony_cistatic int device_init_cdev(struct hl_device *hdev, struct class *class,
65662306a36Sopenharmony_ci				int minor, const struct file_operations *fops,
65762306a36Sopenharmony_ci				char *name, struct cdev *cdev,
65862306a36Sopenharmony_ci				struct device **dev)
65962306a36Sopenharmony_ci{
66062306a36Sopenharmony_ci	cdev_init(cdev, fops);
66162306a36Sopenharmony_ci	cdev->owner = THIS_MODULE;
66262306a36Sopenharmony_ci
66362306a36Sopenharmony_ci	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
66462306a36Sopenharmony_ci	if (!*dev)
66562306a36Sopenharmony_ci		return -ENOMEM;
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_ci	device_initialize(*dev);
66862306a36Sopenharmony_ci	(*dev)->devt = MKDEV(hdev->major, minor);
66962306a36Sopenharmony_ci	(*dev)->class = class;
67062306a36Sopenharmony_ci	(*dev)->release = device_release_func;
67162306a36Sopenharmony_ci	dev_set_drvdata(*dev, hdev);
67262306a36Sopenharmony_ci	dev_set_name(*dev, "%s", name);
67362306a36Sopenharmony_ci
67462306a36Sopenharmony_ci	return 0;
67562306a36Sopenharmony_ci}
67662306a36Sopenharmony_ci
67762306a36Sopenharmony_cistatic int cdev_sysfs_debugfs_add(struct hl_device *hdev)
67862306a36Sopenharmony_ci{
67962306a36Sopenharmony_ci	int rc;
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ci	rc = cdev_device_add(&hdev->cdev, hdev->dev);
68262306a36Sopenharmony_ci	if (rc) {
68362306a36Sopenharmony_ci		dev_err(hdev->dev,
68462306a36Sopenharmony_ci			"failed to add a char device to the system\n");
68562306a36Sopenharmony_ci		return rc;
68662306a36Sopenharmony_ci	}
68762306a36Sopenharmony_ci
68862306a36Sopenharmony_ci	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
68962306a36Sopenharmony_ci	if (rc) {
69062306a36Sopenharmony_ci		dev_err(hdev->dev,
69162306a36Sopenharmony_ci			"failed to add a control char device to the system\n");
69262306a36Sopenharmony_ci		goto delete_cdev_device;
69362306a36Sopenharmony_ci	}
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_ci	/* hl_sysfs_init() must be done after adding the device to the system */
69662306a36Sopenharmony_ci	rc = hl_sysfs_init(hdev);
69762306a36Sopenharmony_ci	if (rc) {
69862306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize sysfs\n");
69962306a36Sopenharmony_ci		goto delete_ctrl_cdev_device;
70062306a36Sopenharmony_ci	}
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci	hl_debugfs_add_device(hdev);
70362306a36Sopenharmony_ci
70462306a36Sopenharmony_ci	hdev->cdev_sysfs_debugfs_created = true;
70562306a36Sopenharmony_ci
70662306a36Sopenharmony_ci	return 0;
70762306a36Sopenharmony_ci
70862306a36Sopenharmony_cidelete_ctrl_cdev_device:
70962306a36Sopenharmony_ci	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
71062306a36Sopenharmony_cidelete_cdev_device:
71162306a36Sopenharmony_ci	cdev_device_del(&hdev->cdev, hdev->dev);
71262306a36Sopenharmony_ci	return rc;
71362306a36Sopenharmony_ci}
71462306a36Sopenharmony_ci
71562306a36Sopenharmony_cistatic void cdev_sysfs_debugfs_remove(struct hl_device *hdev)
71662306a36Sopenharmony_ci{
71762306a36Sopenharmony_ci	if (!hdev->cdev_sysfs_debugfs_created)
71862306a36Sopenharmony_ci		goto put_devices;
71962306a36Sopenharmony_ci
72062306a36Sopenharmony_ci	hl_debugfs_remove_device(hdev);
72162306a36Sopenharmony_ci	hl_sysfs_fini(hdev);
72262306a36Sopenharmony_ci	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
72362306a36Sopenharmony_ci	cdev_device_del(&hdev->cdev, hdev->dev);
72462306a36Sopenharmony_ci
72562306a36Sopenharmony_ciput_devices:
72662306a36Sopenharmony_ci	put_device(hdev->dev);
72762306a36Sopenharmony_ci	put_device(hdev->dev_ctrl);
72862306a36Sopenharmony_ci}
72962306a36Sopenharmony_ci
73062306a36Sopenharmony_cistatic void device_hard_reset_pending(struct work_struct *work)
73162306a36Sopenharmony_ci{
73262306a36Sopenharmony_ci	struct hl_device_reset_work *device_reset_work =
73362306a36Sopenharmony_ci		container_of(work, struct hl_device_reset_work, reset_work.work);
73462306a36Sopenharmony_ci	struct hl_device *hdev = device_reset_work->hdev;
73562306a36Sopenharmony_ci	u32 flags;
73662306a36Sopenharmony_ci	int rc;
73762306a36Sopenharmony_ci
73862306a36Sopenharmony_ci	flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR;
73962306a36Sopenharmony_ci
74062306a36Sopenharmony_ci	rc = hl_device_reset(hdev, flags);
74162306a36Sopenharmony_ci
74262306a36Sopenharmony_ci	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
74362306a36Sopenharmony_ci		struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
74462306a36Sopenharmony_ci
74562306a36Sopenharmony_ci		if (ctx) {
74662306a36Sopenharmony_ci			/* The read refcount value should subtracted by one, because the read is
74762306a36Sopenharmony_ci			 * protected with hl_get_compute_ctx().
74862306a36Sopenharmony_ci			 */
74962306a36Sopenharmony_ci			dev_info(hdev->dev,
75062306a36Sopenharmony_ci				"Could not reset device (compute_ctx refcount %u). will try again in %u seconds",
75162306a36Sopenharmony_ci				kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC);
75262306a36Sopenharmony_ci			hl_ctx_put(ctx);
75362306a36Sopenharmony_ci		} else {
75462306a36Sopenharmony_ci			dev_info(hdev->dev, "Could not reset device. will try again in %u seconds",
75562306a36Sopenharmony_ci				HL_PENDING_RESET_PER_SEC);
75662306a36Sopenharmony_ci		}
75762306a36Sopenharmony_ci
75862306a36Sopenharmony_ci		queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work,
75962306a36Sopenharmony_ci					msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
76062306a36Sopenharmony_ci	}
76162306a36Sopenharmony_ci}
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_cistatic void device_release_watchdog_func(struct work_struct *work)
76462306a36Sopenharmony_ci{
76562306a36Sopenharmony_ci	struct hl_device_reset_work *watchdog_work =
76662306a36Sopenharmony_ci			container_of(work, struct hl_device_reset_work, reset_work.work);
76762306a36Sopenharmony_ci	struct hl_device *hdev = watchdog_work->hdev;
76862306a36Sopenharmony_ci	u32 flags;
76962306a36Sopenharmony_ci
77062306a36Sopenharmony_ci	dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n");
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR;
77362306a36Sopenharmony_ci
77462306a36Sopenharmony_ci	hl_device_reset(hdev, flags);
77562306a36Sopenharmony_ci}
77662306a36Sopenharmony_ci
77762306a36Sopenharmony_ci/*
77862306a36Sopenharmony_ci * device_early_init - do some early initialization for the habanalabs device
77962306a36Sopenharmony_ci *
78062306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
78162306a36Sopenharmony_ci *
78262306a36Sopenharmony_ci * Install the relevant function pointers and call the early_init function,
78362306a36Sopenharmony_ci * if such a function exists
78462306a36Sopenharmony_ci */
78562306a36Sopenharmony_cistatic int device_early_init(struct hl_device *hdev)
78662306a36Sopenharmony_ci{
78762306a36Sopenharmony_ci	int i, rc;
78862306a36Sopenharmony_ci	char workq_name[32];
78962306a36Sopenharmony_ci
79062306a36Sopenharmony_ci	switch (hdev->asic_type) {
79162306a36Sopenharmony_ci	case ASIC_GOYA:
79262306a36Sopenharmony_ci		goya_set_asic_funcs(hdev);
79362306a36Sopenharmony_ci		strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
79462306a36Sopenharmony_ci		break;
79562306a36Sopenharmony_ci	case ASIC_GAUDI:
79662306a36Sopenharmony_ci		gaudi_set_asic_funcs(hdev);
79762306a36Sopenharmony_ci		strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name));
79862306a36Sopenharmony_ci		break;
79962306a36Sopenharmony_ci	case ASIC_GAUDI_SEC:
80062306a36Sopenharmony_ci		gaudi_set_asic_funcs(hdev);
80162306a36Sopenharmony_ci		strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name));
80262306a36Sopenharmony_ci		break;
80362306a36Sopenharmony_ci	case ASIC_GAUDI2:
80462306a36Sopenharmony_ci		gaudi2_set_asic_funcs(hdev);
80562306a36Sopenharmony_ci		strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
80662306a36Sopenharmony_ci		break;
80762306a36Sopenharmony_ci	case ASIC_GAUDI2B:
80862306a36Sopenharmony_ci		gaudi2_set_asic_funcs(hdev);
80962306a36Sopenharmony_ci		strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
81062306a36Sopenharmony_ci		break;
81162306a36Sopenharmony_ci	case ASIC_GAUDI2C:
81262306a36Sopenharmony_ci		gaudi2_set_asic_funcs(hdev);
81362306a36Sopenharmony_ci		strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name));
81462306a36Sopenharmony_ci		break;
81562306a36Sopenharmony_ci	default:
81662306a36Sopenharmony_ci		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
81762306a36Sopenharmony_ci			hdev->asic_type);
81862306a36Sopenharmony_ci		return -EINVAL;
81962306a36Sopenharmony_ci	}
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci	rc = hdev->asic_funcs->early_init(hdev);
82262306a36Sopenharmony_ci	if (rc)
82362306a36Sopenharmony_ci		return rc;
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_ci	rc = hl_asid_init(hdev);
82662306a36Sopenharmony_ci	if (rc)
82762306a36Sopenharmony_ci		goto early_fini;
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci	if (hdev->asic_prop.completion_queues_count) {
83062306a36Sopenharmony_ci		hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
83162306a36Sopenharmony_ci				sizeof(struct workqueue_struct *),
83262306a36Sopenharmony_ci				GFP_KERNEL);
83362306a36Sopenharmony_ci		if (!hdev->cq_wq) {
83462306a36Sopenharmony_ci			rc = -ENOMEM;
83562306a36Sopenharmony_ci			goto asid_fini;
83662306a36Sopenharmony_ci		}
83762306a36Sopenharmony_ci	}
83862306a36Sopenharmony_ci
83962306a36Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
84062306a36Sopenharmony_ci		snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
84162306a36Sopenharmony_ci		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
84262306a36Sopenharmony_ci		if (hdev->cq_wq[i] == NULL) {
84362306a36Sopenharmony_ci			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
84462306a36Sopenharmony_ci			rc = -ENOMEM;
84562306a36Sopenharmony_ci			goto free_cq_wq;
84662306a36Sopenharmony_ci		}
84762306a36Sopenharmony_ci	}
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ci	snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
85062306a36Sopenharmony_ci	hdev->eq_wq = create_singlethread_workqueue(workq_name);
85162306a36Sopenharmony_ci	if (hdev->eq_wq == NULL) {
85262306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
85362306a36Sopenharmony_ci		rc = -ENOMEM;
85462306a36Sopenharmony_ci		goto free_cq_wq;
85562306a36Sopenharmony_ci	}
85662306a36Sopenharmony_ci
85762306a36Sopenharmony_ci	snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
85862306a36Sopenharmony_ci	hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
85962306a36Sopenharmony_ci	if (!hdev->cs_cmplt_wq) {
86062306a36Sopenharmony_ci		dev_err(hdev->dev,
86162306a36Sopenharmony_ci			"Failed to allocate CS completions workqueue\n");
86262306a36Sopenharmony_ci		rc = -ENOMEM;
86362306a36Sopenharmony_ci		goto free_eq_wq;
86462306a36Sopenharmony_ci	}
86562306a36Sopenharmony_ci
86662306a36Sopenharmony_ci	snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
86762306a36Sopenharmony_ci	hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
86862306a36Sopenharmony_ci	if (!hdev->ts_free_obj_wq) {
86962306a36Sopenharmony_ci		dev_err(hdev->dev,
87062306a36Sopenharmony_ci			"Failed to allocate Timestamp registration free workqueue\n");
87162306a36Sopenharmony_ci		rc = -ENOMEM;
87262306a36Sopenharmony_ci		goto free_cs_cmplt_wq;
87362306a36Sopenharmony_ci	}
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci	snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
87662306a36Sopenharmony_ci	hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
87762306a36Sopenharmony_ci	if (!hdev->prefetch_wq) {
87862306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
87962306a36Sopenharmony_ci		rc = -ENOMEM;
88062306a36Sopenharmony_ci		goto free_ts_free_wq;
88162306a36Sopenharmony_ci	}
88262306a36Sopenharmony_ci
88362306a36Sopenharmony_ci	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
88462306a36Sopenharmony_ci	if (!hdev->hl_chip_info) {
88562306a36Sopenharmony_ci		rc = -ENOMEM;
88662306a36Sopenharmony_ci		goto free_prefetch_wq;
88762306a36Sopenharmony_ci	}
88862306a36Sopenharmony_ci
88962306a36Sopenharmony_ci	rc = hl_mmu_if_set_funcs(hdev);
89062306a36Sopenharmony_ci	if (rc)
89162306a36Sopenharmony_ci		goto free_chip_info;
89262306a36Sopenharmony_ci
89362306a36Sopenharmony_ci	hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
89462306a36Sopenharmony_ci
89562306a36Sopenharmony_ci	snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
89662306a36Sopenharmony_ci	hdev->reset_wq = create_singlethread_workqueue(workq_name);
89762306a36Sopenharmony_ci	if (!hdev->reset_wq) {
89862306a36Sopenharmony_ci		rc = -ENOMEM;
89962306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to create device reset WQ\n");
90062306a36Sopenharmony_ci		goto free_cb_mgr;
90162306a36Sopenharmony_ci	}
90262306a36Sopenharmony_ci
90362306a36Sopenharmony_ci	INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending);
90462306a36Sopenharmony_ci	hdev->device_reset_work.hdev = hdev;
90562306a36Sopenharmony_ci	hdev->device_fini_pending = 0;
90662306a36Sopenharmony_ci
90762306a36Sopenharmony_ci	INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work,
90862306a36Sopenharmony_ci				device_release_watchdog_func);
90962306a36Sopenharmony_ci	hdev->device_release_watchdog_work.hdev = hdev;
91062306a36Sopenharmony_ci
91162306a36Sopenharmony_ci	mutex_init(&hdev->send_cpu_message_lock);
91262306a36Sopenharmony_ci	mutex_init(&hdev->debug_lock);
91362306a36Sopenharmony_ci	INIT_LIST_HEAD(&hdev->cs_mirror_list);
91462306a36Sopenharmony_ci	spin_lock_init(&hdev->cs_mirror_lock);
91562306a36Sopenharmony_ci	spin_lock_init(&hdev->reset_info.lock);
91662306a36Sopenharmony_ci	INIT_LIST_HEAD(&hdev->fpriv_list);
91762306a36Sopenharmony_ci	INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
91862306a36Sopenharmony_ci	mutex_init(&hdev->fpriv_list_lock);
91962306a36Sopenharmony_ci	mutex_init(&hdev->fpriv_ctrl_list_lock);
92062306a36Sopenharmony_ci	mutex_init(&hdev->clk_throttling.lock);
92162306a36Sopenharmony_ci
92262306a36Sopenharmony_ci	return 0;
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_cifree_cb_mgr:
92562306a36Sopenharmony_ci	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
92662306a36Sopenharmony_ci	hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
92762306a36Sopenharmony_cifree_chip_info:
92862306a36Sopenharmony_ci	kfree(hdev->hl_chip_info);
92962306a36Sopenharmony_cifree_prefetch_wq:
93062306a36Sopenharmony_ci	destroy_workqueue(hdev->prefetch_wq);
93162306a36Sopenharmony_cifree_ts_free_wq:
93262306a36Sopenharmony_ci	destroy_workqueue(hdev->ts_free_obj_wq);
93362306a36Sopenharmony_cifree_cs_cmplt_wq:
93462306a36Sopenharmony_ci	destroy_workqueue(hdev->cs_cmplt_wq);
93562306a36Sopenharmony_cifree_eq_wq:
93662306a36Sopenharmony_ci	destroy_workqueue(hdev->eq_wq);
93762306a36Sopenharmony_cifree_cq_wq:
93862306a36Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
93962306a36Sopenharmony_ci		if (hdev->cq_wq[i])
94062306a36Sopenharmony_ci			destroy_workqueue(hdev->cq_wq[i]);
94162306a36Sopenharmony_ci	kfree(hdev->cq_wq);
94262306a36Sopenharmony_ciasid_fini:
94362306a36Sopenharmony_ci	hl_asid_fini(hdev);
94462306a36Sopenharmony_ciearly_fini:
94562306a36Sopenharmony_ci	if (hdev->asic_funcs->early_fini)
94662306a36Sopenharmony_ci		hdev->asic_funcs->early_fini(hdev);
94762306a36Sopenharmony_ci
94862306a36Sopenharmony_ci	return rc;
94962306a36Sopenharmony_ci}
95062306a36Sopenharmony_ci
95162306a36Sopenharmony_ci/*
95262306a36Sopenharmony_ci * device_early_fini - finalize all that was done in device_early_init
95362306a36Sopenharmony_ci *
95462306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
95562306a36Sopenharmony_ci *
95662306a36Sopenharmony_ci */
95762306a36Sopenharmony_cistatic void device_early_fini(struct hl_device *hdev)
95862306a36Sopenharmony_ci{
95962306a36Sopenharmony_ci	int i;
96062306a36Sopenharmony_ci
96162306a36Sopenharmony_ci	mutex_destroy(&hdev->debug_lock);
96262306a36Sopenharmony_ci	mutex_destroy(&hdev->send_cpu_message_lock);
96362306a36Sopenharmony_ci
96462306a36Sopenharmony_ci	mutex_destroy(&hdev->fpriv_list_lock);
96562306a36Sopenharmony_ci	mutex_destroy(&hdev->fpriv_ctrl_list_lock);
96662306a36Sopenharmony_ci
96762306a36Sopenharmony_ci	mutex_destroy(&hdev->clk_throttling.lock);
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
97062306a36Sopenharmony_ci	hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
97162306a36Sopenharmony_ci
97262306a36Sopenharmony_ci	kfree(hdev->hl_chip_info);
97362306a36Sopenharmony_ci
97462306a36Sopenharmony_ci	destroy_workqueue(hdev->prefetch_wq);
97562306a36Sopenharmony_ci	destroy_workqueue(hdev->ts_free_obj_wq);
97662306a36Sopenharmony_ci	destroy_workqueue(hdev->cs_cmplt_wq);
97762306a36Sopenharmony_ci	destroy_workqueue(hdev->eq_wq);
97862306a36Sopenharmony_ci	destroy_workqueue(hdev->reset_wq);
97962306a36Sopenharmony_ci
98062306a36Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
98162306a36Sopenharmony_ci		destroy_workqueue(hdev->cq_wq[i]);
98262306a36Sopenharmony_ci	kfree(hdev->cq_wq);
98362306a36Sopenharmony_ci
98462306a36Sopenharmony_ci	hl_asid_fini(hdev);
98562306a36Sopenharmony_ci
98662306a36Sopenharmony_ci	if (hdev->asic_funcs->early_fini)
98762306a36Sopenharmony_ci		hdev->asic_funcs->early_fini(hdev);
98862306a36Sopenharmony_ci}
98962306a36Sopenharmony_ci
99062306a36Sopenharmony_cistatic bool is_pci_link_healthy(struct hl_device *hdev)
99162306a36Sopenharmony_ci{
99262306a36Sopenharmony_ci	u16 vendor_id;
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci	if (!hdev->pdev)
99562306a36Sopenharmony_ci		return false;
99662306a36Sopenharmony_ci
99762306a36Sopenharmony_ci	pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_ci	return (vendor_id == PCI_VENDOR_ID_HABANALABS);
100062306a36Sopenharmony_ci}
100162306a36Sopenharmony_ci
100262306a36Sopenharmony_cistatic void hl_device_heartbeat(struct work_struct *work)
100362306a36Sopenharmony_ci{
100462306a36Sopenharmony_ci	struct hl_device *hdev = container_of(work, struct hl_device,
100562306a36Sopenharmony_ci						work_heartbeat.work);
100662306a36Sopenharmony_ci	struct hl_info_fw_err_info info = {0};
100762306a36Sopenharmony_ci	u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci	if (!hl_device_operational(hdev, NULL))
101062306a36Sopenharmony_ci		goto reschedule;
101162306a36Sopenharmony_ci
101262306a36Sopenharmony_ci	if (!hdev->asic_funcs->send_heartbeat(hdev))
101362306a36Sopenharmony_ci		goto reschedule;
101462306a36Sopenharmony_ci
101562306a36Sopenharmony_ci	if (hl_device_operational(hdev, NULL))
101662306a36Sopenharmony_ci		dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
101762306a36Sopenharmony_ci			is_pci_link_healthy(hdev) ? "healthy" : "broken");
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci	info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
102062306a36Sopenharmony_ci	info.event_mask = &event_mask;
102162306a36Sopenharmony_ci	hl_handle_fw_err(hdev, &info);
102262306a36Sopenharmony_ci	hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask);
102362306a36Sopenharmony_ci
102462306a36Sopenharmony_ci	return;
102562306a36Sopenharmony_ci
102662306a36Sopenharmony_cireschedule:
102762306a36Sopenharmony_ci	/*
102862306a36Sopenharmony_ci	 * prev_reset_trigger tracks consecutive fatal h/w errors until first
102962306a36Sopenharmony_ci	 * heartbeat immediately post reset.
103062306a36Sopenharmony_ci	 * If control reached here, then at least one heartbeat work has been
103162306a36Sopenharmony_ci	 * scheduled since last reset/init cycle.
103262306a36Sopenharmony_ci	 * So if the device is not already in reset cycle, reset the flag
103362306a36Sopenharmony_ci	 * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
103462306a36Sopenharmony_ci	 * status for at least one heartbeat. From this point driver restarts
103562306a36Sopenharmony_ci	 * tracking future consecutive fatal errors.
103662306a36Sopenharmony_ci	 */
103762306a36Sopenharmony_ci	if (!hdev->reset_info.in_reset)
103862306a36Sopenharmony_ci		hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
103962306a36Sopenharmony_ci
104062306a36Sopenharmony_ci	schedule_delayed_work(&hdev->work_heartbeat,
104162306a36Sopenharmony_ci			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
104262306a36Sopenharmony_ci}
104362306a36Sopenharmony_ci
104462306a36Sopenharmony_ci/*
104562306a36Sopenharmony_ci * device_late_init - do late stuff initialization for the habanalabs device
104662306a36Sopenharmony_ci *
104762306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
104862306a36Sopenharmony_ci *
104962306a36Sopenharmony_ci * Do stuff that either needs the device H/W queues to be active or needs
105062306a36Sopenharmony_ci * to happen after all the rest of the initialization is finished
105162306a36Sopenharmony_ci */
105262306a36Sopenharmony_cistatic int device_late_init(struct hl_device *hdev)
105362306a36Sopenharmony_ci{
105462306a36Sopenharmony_ci	int rc;
105562306a36Sopenharmony_ci
105662306a36Sopenharmony_ci	if (hdev->asic_funcs->late_init) {
105762306a36Sopenharmony_ci		rc = hdev->asic_funcs->late_init(hdev);
105862306a36Sopenharmony_ci		if (rc) {
105962306a36Sopenharmony_ci			dev_err(hdev->dev,
106062306a36Sopenharmony_ci				"failed late initialization for the H/W\n");
106162306a36Sopenharmony_ci			return rc;
106262306a36Sopenharmony_ci		}
106362306a36Sopenharmony_ci	}
106462306a36Sopenharmony_ci
106562306a36Sopenharmony_ci	hdev->high_pll = hdev->asic_prop.high_pll;
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci	if (hdev->heartbeat) {
106862306a36Sopenharmony_ci		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
106962306a36Sopenharmony_ci		schedule_delayed_work(&hdev->work_heartbeat,
107062306a36Sopenharmony_ci				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
107162306a36Sopenharmony_ci	}
107262306a36Sopenharmony_ci
107362306a36Sopenharmony_ci	hdev->late_init_done = true;
107462306a36Sopenharmony_ci
107562306a36Sopenharmony_ci	return 0;
107662306a36Sopenharmony_ci}
107762306a36Sopenharmony_ci
107862306a36Sopenharmony_ci/*
107962306a36Sopenharmony_ci * device_late_fini - finalize all that was done in device_late_init
108062306a36Sopenharmony_ci *
108162306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
108262306a36Sopenharmony_ci *
108362306a36Sopenharmony_ci */
108462306a36Sopenharmony_cistatic void device_late_fini(struct hl_device *hdev)
108562306a36Sopenharmony_ci{
108662306a36Sopenharmony_ci	if (!hdev->late_init_done)
108762306a36Sopenharmony_ci		return;
108862306a36Sopenharmony_ci
108962306a36Sopenharmony_ci	if (hdev->heartbeat)
109062306a36Sopenharmony_ci		cancel_delayed_work_sync(&hdev->work_heartbeat);
109162306a36Sopenharmony_ci
109262306a36Sopenharmony_ci	if (hdev->asic_funcs->late_fini)
109362306a36Sopenharmony_ci		hdev->asic_funcs->late_fini(hdev);
109462306a36Sopenharmony_ci
109562306a36Sopenharmony_ci	hdev->late_init_done = false;
109662306a36Sopenharmony_ci}
109762306a36Sopenharmony_ci
109862306a36Sopenharmony_ciint hl_device_utilization(struct hl_device *hdev, u32 *utilization)
109962306a36Sopenharmony_ci{
110062306a36Sopenharmony_ci	u64 max_power, curr_power, dc_power, dividend, divisor;
110162306a36Sopenharmony_ci	int rc;
110262306a36Sopenharmony_ci
110362306a36Sopenharmony_ci	max_power = hdev->max_power;
110462306a36Sopenharmony_ci	dc_power = hdev->asic_prop.dc_power_default;
110562306a36Sopenharmony_ci	divisor = max_power - dc_power;
110662306a36Sopenharmony_ci	if (!divisor) {
110762306a36Sopenharmony_ci		dev_warn(hdev->dev, "device utilization is not supported\n");
110862306a36Sopenharmony_ci		return -EOPNOTSUPP;
110962306a36Sopenharmony_ci	}
111062306a36Sopenharmony_ci	rc = hl_fw_cpucp_power_get(hdev, &curr_power);
111162306a36Sopenharmony_ci
111262306a36Sopenharmony_ci	if (rc)
111362306a36Sopenharmony_ci		return rc;
111462306a36Sopenharmony_ci
111562306a36Sopenharmony_ci	curr_power = clamp(curr_power, dc_power, max_power);
111662306a36Sopenharmony_ci
111762306a36Sopenharmony_ci	dividend = (curr_power - dc_power) * 100;
111862306a36Sopenharmony_ci	*utilization = (u32) div_u64(dividend, divisor);
111962306a36Sopenharmony_ci
112062306a36Sopenharmony_ci	return 0;
112162306a36Sopenharmony_ci}
112262306a36Sopenharmony_ci
112362306a36Sopenharmony_ciint hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable)
112462306a36Sopenharmony_ci{
112562306a36Sopenharmony_ci	int rc = 0;
112662306a36Sopenharmony_ci
112762306a36Sopenharmony_ci	mutex_lock(&hdev->debug_lock);
112862306a36Sopenharmony_ci
112962306a36Sopenharmony_ci	if (!enable) {
113062306a36Sopenharmony_ci		if (!hdev->in_debug) {
113162306a36Sopenharmony_ci			dev_err(hdev->dev,
113262306a36Sopenharmony_ci				"Failed to disable debug mode because device was not in debug mode\n");
113362306a36Sopenharmony_ci			rc = -EFAULT;
113462306a36Sopenharmony_ci			goto out;
113562306a36Sopenharmony_ci		}
113662306a36Sopenharmony_ci
113762306a36Sopenharmony_ci		if (!hdev->reset_info.hard_reset_pending)
113862306a36Sopenharmony_ci			hdev->asic_funcs->halt_coresight(hdev, ctx);
113962306a36Sopenharmony_ci
114062306a36Sopenharmony_ci		hdev->in_debug = 0;
114162306a36Sopenharmony_ci
114262306a36Sopenharmony_ci		goto out;
114362306a36Sopenharmony_ci	}
114462306a36Sopenharmony_ci
114562306a36Sopenharmony_ci	if (hdev->in_debug) {
114662306a36Sopenharmony_ci		dev_err(hdev->dev,
114762306a36Sopenharmony_ci			"Failed to enable debug mode because device is already in debug mode\n");
114862306a36Sopenharmony_ci		rc = -EFAULT;
114962306a36Sopenharmony_ci		goto out;
115062306a36Sopenharmony_ci	}
115162306a36Sopenharmony_ci
115262306a36Sopenharmony_ci	hdev->in_debug = 1;
115362306a36Sopenharmony_ci
115462306a36Sopenharmony_ciout:
115562306a36Sopenharmony_ci	mutex_unlock(&hdev->debug_lock);
115662306a36Sopenharmony_ci
115762306a36Sopenharmony_ci	return rc;
115862306a36Sopenharmony_ci}
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_cistatic void take_release_locks(struct hl_device *hdev)
116162306a36Sopenharmony_ci{
116262306a36Sopenharmony_ci	/* Flush anyone that is inside the critical section of enqueue
116362306a36Sopenharmony_ci	 * jobs to the H/W
116462306a36Sopenharmony_ci	 */
116562306a36Sopenharmony_ci	hdev->asic_funcs->hw_queues_lock(hdev);
116662306a36Sopenharmony_ci	hdev->asic_funcs->hw_queues_unlock(hdev);
116762306a36Sopenharmony_ci
116862306a36Sopenharmony_ci	/* Flush processes that are sending message to CPU */
116962306a36Sopenharmony_ci	mutex_lock(&hdev->send_cpu_message_lock);
117062306a36Sopenharmony_ci	mutex_unlock(&hdev->send_cpu_message_lock);
117162306a36Sopenharmony_ci
117262306a36Sopenharmony_ci	/* Flush anyone that is inside device open */
117362306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
117462306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
117562306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_ctrl_list_lock);
117662306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
117762306a36Sopenharmony_ci}
117862306a36Sopenharmony_ci
/*
 * hl_abort_waiting_for_completions - abort CS waiters and pending interrupts
 *
 * @hdev: pointer to habanalabs device structure
 */
static void hl_abort_waiting_for_completions(struct hl_device *hdev)
{
	hl_abort_waiting_for_cs_completions(hdev);

	/* Every pending user interrupt holds a reference to a user context,
	 * so all of them are released here.
	 */
	hl_release_pending_user_interrupts(hdev);
}
118862306a36Sopenharmony_ci
118962306a36Sopenharmony_cistatic void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset,
119062306a36Sopenharmony_ci				bool skip_wq_flush)
119162306a36Sopenharmony_ci{
119262306a36Sopenharmony_ci	if (hard_reset)
119362306a36Sopenharmony_ci		device_late_fini(hdev);
119462306a36Sopenharmony_ci
119562306a36Sopenharmony_ci	/*
119662306a36Sopenharmony_ci	 * Halt the engines and disable interrupts so we won't get any more
119762306a36Sopenharmony_ci	 * completions from H/W and we won't have any accesses from the
119862306a36Sopenharmony_ci	 * H/W to the host machine
119962306a36Sopenharmony_ci	 */
120062306a36Sopenharmony_ci	hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
120162306a36Sopenharmony_ci
120262306a36Sopenharmony_ci	/* Go over all the queues, release all CS and their jobs */
120362306a36Sopenharmony_ci	hl_cs_rollback_all(hdev, skip_wq_flush);
120462306a36Sopenharmony_ci
120562306a36Sopenharmony_ci	/* flush the MMU prefetch workqueue */
120662306a36Sopenharmony_ci	flush_workqueue(hdev->prefetch_wq);
120762306a36Sopenharmony_ci
120862306a36Sopenharmony_ci	hl_abort_waiting_for_completions(hdev);
120962306a36Sopenharmony_ci}
121062306a36Sopenharmony_ci
121162306a36Sopenharmony_ci/*
121262306a36Sopenharmony_ci * hl_device_suspend - initiate device suspend
121362306a36Sopenharmony_ci *
121462306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
121562306a36Sopenharmony_ci *
121662306a36Sopenharmony_ci * Puts the hw in the suspend state (all asics).
121762306a36Sopenharmony_ci * Returns 0 for success or an error on failure.
121862306a36Sopenharmony_ci * Called at driver suspend.
121962306a36Sopenharmony_ci */
122062306a36Sopenharmony_ciint hl_device_suspend(struct hl_device *hdev)
122162306a36Sopenharmony_ci{
122262306a36Sopenharmony_ci	int rc;
122362306a36Sopenharmony_ci
122462306a36Sopenharmony_ci	pci_save_state(hdev->pdev);
122562306a36Sopenharmony_ci
122662306a36Sopenharmony_ci	/* Block future CS/VM/JOB completion operations */
122762306a36Sopenharmony_ci	spin_lock(&hdev->reset_info.lock);
122862306a36Sopenharmony_ci	if (hdev->reset_info.in_reset) {
122962306a36Sopenharmony_ci		spin_unlock(&hdev->reset_info.lock);
123062306a36Sopenharmony_ci		dev_err(hdev->dev, "Can't suspend while in reset\n");
123162306a36Sopenharmony_ci		return -EIO;
123262306a36Sopenharmony_ci	}
123362306a36Sopenharmony_ci	hdev->reset_info.in_reset = 1;
123462306a36Sopenharmony_ci	spin_unlock(&hdev->reset_info.lock);
123562306a36Sopenharmony_ci
123662306a36Sopenharmony_ci	/* This blocks all other stuff that is not blocked by in_reset */
123762306a36Sopenharmony_ci	hdev->disabled = true;
123862306a36Sopenharmony_ci
123962306a36Sopenharmony_ci	take_release_locks(hdev);
124062306a36Sopenharmony_ci
124162306a36Sopenharmony_ci	rc = hdev->asic_funcs->suspend(hdev);
124262306a36Sopenharmony_ci	if (rc)
124362306a36Sopenharmony_ci		dev_err(hdev->dev,
124462306a36Sopenharmony_ci			"Failed to disable PCI access of device CPU\n");
124562306a36Sopenharmony_ci
124662306a36Sopenharmony_ci	/* Shut down the device */
124762306a36Sopenharmony_ci	pci_disable_device(hdev->pdev);
124862306a36Sopenharmony_ci	pci_set_power_state(hdev->pdev, PCI_D3hot);
124962306a36Sopenharmony_ci
125062306a36Sopenharmony_ci	return 0;
125162306a36Sopenharmony_ci}
125262306a36Sopenharmony_ci
125362306a36Sopenharmony_ci/*
125462306a36Sopenharmony_ci * hl_device_resume - initiate device resume
125562306a36Sopenharmony_ci *
125662306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
125762306a36Sopenharmony_ci *
125862306a36Sopenharmony_ci * Bring the hw back to operating state (all asics).
125962306a36Sopenharmony_ci * Returns 0 for success or an error on failure.
126062306a36Sopenharmony_ci * Called at driver resume.
126162306a36Sopenharmony_ci */
126262306a36Sopenharmony_ciint hl_device_resume(struct hl_device *hdev)
126362306a36Sopenharmony_ci{
126462306a36Sopenharmony_ci	int rc;
126562306a36Sopenharmony_ci
126662306a36Sopenharmony_ci	pci_set_power_state(hdev->pdev, PCI_D0);
126762306a36Sopenharmony_ci	pci_restore_state(hdev->pdev);
126862306a36Sopenharmony_ci	rc = pci_enable_device_mem(hdev->pdev);
126962306a36Sopenharmony_ci	if (rc) {
127062306a36Sopenharmony_ci		dev_err(hdev->dev,
127162306a36Sopenharmony_ci			"Failed to enable PCI device in resume\n");
127262306a36Sopenharmony_ci		return rc;
127362306a36Sopenharmony_ci	}
127462306a36Sopenharmony_ci
127562306a36Sopenharmony_ci	pci_set_master(hdev->pdev);
127662306a36Sopenharmony_ci
127762306a36Sopenharmony_ci	rc = hdev->asic_funcs->resume(hdev);
127862306a36Sopenharmony_ci	if (rc) {
127962306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to resume device after suspend\n");
128062306a36Sopenharmony_ci		goto disable_device;
128162306a36Sopenharmony_ci	}
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci
128462306a36Sopenharmony_ci	/* 'in_reset' was set to true during suspend, now we must clear it in order
128562306a36Sopenharmony_ci	 * for hard reset to be performed
128662306a36Sopenharmony_ci	 */
128762306a36Sopenharmony_ci	spin_lock(&hdev->reset_info.lock);
128862306a36Sopenharmony_ci	hdev->reset_info.in_reset = 0;
128962306a36Sopenharmony_ci	spin_unlock(&hdev->reset_info.lock);
129062306a36Sopenharmony_ci
129162306a36Sopenharmony_ci	rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
129262306a36Sopenharmony_ci	if (rc) {
129362306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to reset device during resume\n");
129462306a36Sopenharmony_ci		goto disable_device;
129562306a36Sopenharmony_ci	}
129662306a36Sopenharmony_ci
129762306a36Sopenharmony_ci	return 0;
129862306a36Sopenharmony_ci
129962306a36Sopenharmony_cidisable_device:
130062306a36Sopenharmony_ci	pci_disable_device(hdev->pdev);
130162306a36Sopenharmony_ci
130262306a36Sopenharmony_ci	return rc;
130362306a36Sopenharmony_ci}
130462306a36Sopenharmony_ci
130562306a36Sopenharmony_cistatic int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev)
130662306a36Sopenharmony_ci{
130762306a36Sopenharmony_ci	struct task_struct *task = NULL;
130862306a36Sopenharmony_ci	struct list_head *fd_list;
130962306a36Sopenharmony_ci	struct hl_fpriv	*hpriv;
131062306a36Sopenharmony_ci	struct mutex *fd_lock;
131162306a36Sopenharmony_ci	u32 pending_cnt;
131262306a36Sopenharmony_ci
131362306a36Sopenharmony_ci	fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
131462306a36Sopenharmony_ci	fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
131562306a36Sopenharmony_ci
131662306a36Sopenharmony_ci	/* Giving time for user to close FD, and for processes that are inside
131762306a36Sopenharmony_ci	 * hl_device_open to finish
131862306a36Sopenharmony_ci	 */
131962306a36Sopenharmony_ci	if (!list_empty(fd_list))
132062306a36Sopenharmony_ci		ssleep(1);
132162306a36Sopenharmony_ci
132262306a36Sopenharmony_ci	if (timeout) {
132362306a36Sopenharmony_ci		pending_cnt = timeout;
132462306a36Sopenharmony_ci	} else {
132562306a36Sopenharmony_ci		if (hdev->process_kill_trial_cnt) {
132662306a36Sopenharmony_ci			/* Processes have been already killed */
132762306a36Sopenharmony_ci			pending_cnt = 1;
132862306a36Sopenharmony_ci			goto wait_for_processes;
132962306a36Sopenharmony_ci		} else {
133062306a36Sopenharmony_ci			/* Wait a small period after process kill */
133162306a36Sopenharmony_ci			pending_cnt = HL_PENDING_RESET_PER_SEC;
133262306a36Sopenharmony_ci		}
133362306a36Sopenharmony_ci	}
133462306a36Sopenharmony_ci
133562306a36Sopenharmony_ci	mutex_lock(fd_lock);
133662306a36Sopenharmony_ci
133762306a36Sopenharmony_ci	/* This section must be protected because we are dereferencing
133862306a36Sopenharmony_ci	 * pointers that are freed if the process exits
133962306a36Sopenharmony_ci	 */
134062306a36Sopenharmony_ci	list_for_each_entry(hpriv, fd_list, dev_node) {
134162306a36Sopenharmony_ci		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
134262306a36Sopenharmony_ci		if (task) {
134362306a36Sopenharmony_ci			dev_info(hdev->dev, "Killing user process pid=%d\n",
134462306a36Sopenharmony_ci				task_pid_nr(task));
134562306a36Sopenharmony_ci			send_sig(SIGKILL, task, 1);
134662306a36Sopenharmony_ci			usleep_range(1000, 10000);
134762306a36Sopenharmony_ci
134862306a36Sopenharmony_ci			put_task_struct(task);
134962306a36Sopenharmony_ci		} else {
135062306a36Sopenharmony_ci			/*
135162306a36Sopenharmony_ci			 * If we got here, it means that process was killed from outside the driver
135262306a36Sopenharmony_ci			 * right after it started looping on fd_list and before get_pid_task, thus
135362306a36Sopenharmony_ci			 * we don't need to kill it.
135462306a36Sopenharmony_ci			 */
135562306a36Sopenharmony_ci			dev_dbg(hdev->dev,
135662306a36Sopenharmony_ci				"Can't get task struct for user process, assuming process was killed from outside the driver\n");
135762306a36Sopenharmony_ci		}
135862306a36Sopenharmony_ci	}
135962306a36Sopenharmony_ci
136062306a36Sopenharmony_ci	mutex_unlock(fd_lock);
136162306a36Sopenharmony_ci
136262306a36Sopenharmony_ci	/*
136362306a36Sopenharmony_ci	 * We killed the open users, but that doesn't mean they are closed.
136462306a36Sopenharmony_ci	 * It could be that they are running a long cleanup phase in the driver
136562306a36Sopenharmony_ci	 * e.g. MMU unmappings, or running other long teardown flow even before
136662306a36Sopenharmony_ci	 * our cleanup.
136762306a36Sopenharmony_ci	 * Therefore we need to wait again to make sure they are closed before
136862306a36Sopenharmony_ci	 * continuing with the reset.
136962306a36Sopenharmony_ci	 */
137062306a36Sopenharmony_ci
137162306a36Sopenharmony_ciwait_for_processes:
137262306a36Sopenharmony_ci	while ((!list_empty(fd_list)) && (pending_cnt)) {
137362306a36Sopenharmony_ci		dev_dbg(hdev->dev,
137462306a36Sopenharmony_ci			"Waiting for all unmap operations to finish before hard reset\n");
137562306a36Sopenharmony_ci
137662306a36Sopenharmony_ci		pending_cnt--;
137762306a36Sopenharmony_ci
137862306a36Sopenharmony_ci		ssleep(1);
137962306a36Sopenharmony_ci	}
138062306a36Sopenharmony_ci
138162306a36Sopenharmony_ci	/* All processes exited successfully */
138262306a36Sopenharmony_ci	if (list_empty(fd_list))
138362306a36Sopenharmony_ci		return 0;
138462306a36Sopenharmony_ci
138562306a36Sopenharmony_ci	/* Give up waiting for processes to exit */
138662306a36Sopenharmony_ci	if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
138762306a36Sopenharmony_ci		return -ETIME;
138862306a36Sopenharmony_ci
138962306a36Sopenharmony_ci	hdev->process_kill_trial_cnt++;
139062306a36Sopenharmony_ci
139162306a36Sopenharmony_ci	return -EBUSY;
139262306a36Sopenharmony_ci}
139362306a36Sopenharmony_ci
139462306a36Sopenharmony_cistatic void device_disable_open_processes(struct hl_device *hdev, bool control_dev)
139562306a36Sopenharmony_ci{
139662306a36Sopenharmony_ci	struct list_head *fd_list;
139762306a36Sopenharmony_ci	struct hl_fpriv *hpriv;
139862306a36Sopenharmony_ci	struct mutex *fd_lock;
139962306a36Sopenharmony_ci
140062306a36Sopenharmony_ci	fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
140162306a36Sopenharmony_ci	fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
140262306a36Sopenharmony_ci
140362306a36Sopenharmony_ci	mutex_lock(fd_lock);
140462306a36Sopenharmony_ci	list_for_each_entry(hpriv, fd_list, dev_node)
140562306a36Sopenharmony_ci		hpriv->hdev = NULL;
140662306a36Sopenharmony_ci	mutex_unlock(fd_lock);
140762306a36Sopenharmony_ci}
140862306a36Sopenharmony_ci
140962306a36Sopenharmony_cistatic void send_disable_pci_access(struct hl_device *hdev, u32 flags)
141062306a36Sopenharmony_ci{
141162306a36Sopenharmony_ci	/* If reset is due to heartbeat, device CPU is no responsive in
141262306a36Sopenharmony_ci	 * which case no point sending PCI disable message to it.
141362306a36Sopenharmony_ci	 */
141462306a36Sopenharmony_ci	if ((flags & HL_DRV_RESET_HARD) &&
141562306a36Sopenharmony_ci			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
141662306a36Sopenharmony_ci		/* Disable PCI access from device F/W so he won't send
141762306a36Sopenharmony_ci		 * us additional interrupts. We disable MSI/MSI-X at
141862306a36Sopenharmony_ci		 * the halt_engines function and we can't have the F/W
141962306a36Sopenharmony_ci		 * sending us interrupts after that. We need to disable
142062306a36Sopenharmony_ci		 * the access here because if the device is marked
142162306a36Sopenharmony_ci		 * disable, the message won't be send. Also, in case
142262306a36Sopenharmony_ci		 * of heartbeat, the device CPU is marked as disable
142362306a36Sopenharmony_ci		 * so this message won't be sent
142462306a36Sopenharmony_ci		 */
142562306a36Sopenharmony_ci		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
142662306a36Sopenharmony_ci			dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
142762306a36Sopenharmony_ci			return;
142862306a36Sopenharmony_ci		}
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_ci		/* verify that last EQs are handled before disabled is set */
143162306a36Sopenharmony_ci		if (hdev->cpu_queues_enable)
143262306a36Sopenharmony_ci			synchronize_irq(pci_irq_vector(hdev->pdev,
143362306a36Sopenharmony_ci					hdev->asic_prop.eq_interrupt_id));
143462306a36Sopenharmony_ci	}
143562306a36Sopenharmony_ci}
143662306a36Sopenharmony_ci
143762306a36Sopenharmony_cistatic void handle_reset_trigger(struct hl_device *hdev, u32 flags)
143862306a36Sopenharmony_ci{
143962306a36Sopenharmony_ci	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
144062306a36Sopenharmony_ci
144162306a36Sopenharmony_ci	/* No consecutive mechanism when user context exists */
144262306a36Sopenharmony_ci	if (hdev->is_compute_ctx_active)
144362306a36Sopenharmony_ci		return;
144462306a36Sopenharmony_ci
144562306a36Sopenharmony_ci	/*
144662306a36Sopenharmony_ci	 * 'reset cause' is being updated here, because getting here
144762306a36Sopenharmony_ci	 * means that it's the 1st time and the last time we're here
144862306a36Sopenharmony_ci	 * ('in_reset' makes sure of it). This makes sure that
144962306a36Sopenharmony_ci	 * 'reset_cause' will continue holding its 1st recorded reason!
145062306a36Sopenharmony_ci	 */
145162306a36Sopenharmony_ci	if (flags & HL_DRV_RESET_HEARTBEAT) {
145262306a36Sopenharmony_ci		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
145362306a36Sopenharmony_ci		cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
145462306a36Sopenharmony_ci	} else if (flags & HL_DRV_RESET_TDR) {
145562306a36Sopenharmony_ci		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
145662306a36Sopenharmony_ci		cur_reset_trigger = HL_DRV_RESET_TDR;
145762306a36Sopenharmony_ci	} else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
145862306a36Sopenharmony_ci		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
145962306a36Sopenharmony_ci		cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
146062306a36Sopenharmony_ci	} else {
146162306a36Sopenharmony_ci		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
146262306a36Sopenharmony_ci	}
146362306a36Sopenharmony_ci
146462306a36Sopenharmony_ci	/*
146562306a36Sopenharmony_ci	 * If reset cause is same twice, then reset_trigger_repeated
146662306a36Sopenharmony_ci	 * is set and if this reset is due to a fatal FW error
146762306a36Sopenharmony_ci	 * device is set to an unstable state.
146862306a36Sopenharmony_ci	 */
146962306a36Sopenharmony_ci	if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
147062306a36Sopenharmony_ci		hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
147162306a36Sopenharmony_ci		hdev->reset_info.reset_trigger_repeated = 0;
147262306a36Sopenharmony_ci	} else {
147362306a36Sopenharmony_ci		hdev->reset_info.reset_trigger_repeated = 1;
147462306a36Sopenharmony_ci	}
147562306a36Sopenharmony_ci}
147662306a36Sopenharmony_ci
147762306a36Sopenharmony_ci/*
147862306a36Sopenharmony_ci * hl_device_reset - reset the device
147962306a36Sopenharmony_ci *
148062306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
148162306a36Sopenharmony_ci * @flags: reset flags.
148262306a36Sopenharmony_ci *
148362306a36Sopenharmony_ci * Block future CS and wait for pending CS to be enqueued
148462306a36Sopenharmony_ci * Call ASIC H/W fini
148562306a36Sopenharmony_ci * Flush all completions
148662306a36Sopenharmony_ci * Re-initialize all internal data structures
148762306a36Sopenharmony_ci * Call ASIC H/W init, late_init
148862306a36Sopenharmony_ci * Test queues
148962306a36Sopenharmony_ci * Enable device
149062306a36Sopenharmony_ci *
149162306a36Sopenharmony_ci * Returns 0 for success or an error on failure.
149262306a36Sopenharmony_ci */
149362306a36Sopenharmony_ciint hl_device_reset(struct hl_device *hdev, u32 flags)
149462306a36Sopenharmony_ci{
149562306a36Sopenharmony_ci	bool hard_reset, from_hard_reset_thread, fw_reset, reset_upon_device_release,
149662306a36Sopenharmony_ci		schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
149762306a36Sopenharmony_ci	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
149862306a36Sopenharmony_ci	struct hl_ctx *ctx;
149962306a36Sopenharmony_ci	int i, rc, hw_fini_rc;
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_ci	if (!hdev->init_done) {
150262306a36Sopenharmony_ci		dev_err(hdev->dev, "Can't reset before initialization is done\n");
150362306a36Sopenharmony_ci		return 0;
150462306a36Sopenharmony_ci	}
150562306a36Sopenharmony_ci
150662306a36Sopenharmony_ci	hard_reset = !!(flags & HL_DRV_RESET_HARD);
150762306a36Sopenharmony_ci	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
150862306a36Sopenharmony_ci	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
150962306a36Sopenharmony_ci	from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE);
151062306a36Sopenharmony_ci	delay_reset = !!(flags & HL_DRV_RESET_DELAY);
151162306a36Sopenharmony_ci	from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
151262306a36Sopenharmony_ci	reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_ci	if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
151562306a36Sopenharmony_ci		dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
151662306a36Sopenharmony_ci		return 0;
151762306a36Sopenharmony_ci	}
151862306a36Sopenharmony_ci
151962306a36Sopenharmony_ci	if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
152062306a36Sopenharmony_ci		dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
152162306a36Sopenharmony_ci		hard_reset = true;
152262306a36Sopenharmony_ci	}
152362306a36Sopenharmony_ci
152462306a36Sopenharmony_ci	if (reset_upon_device_release) {
152562306a36Sopenharmony_ci		if (hard_reset) {
152662306a36Sopenharmony_ci			dev_crit(hdev->dev,
152762306a36Sopenharmony_ci				"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
152862306a36Sopenharmony_ci			return -EINVAL;
152962306a36Sopenharmony_ci		}
153062306a36Sopenharmony_ci
153162306a36Sopenharmony_ci		goto do_reset;
153262306a36Sopenharmony_ci	}
153362306a36Sopenharmony_ci
153462306a36Sopenharmony_ci	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
153562306a36Sopenharmony_ci		dev_dbg(hdev->dev,
153662306a36Sopenharmony_ci			"asic doesn't allow inference soft reset - do hard-reset instead\n");
153762306a36Sopenharmony_ci		hard_reset = true;
153862306a36Sopenharmony_ci	}
153962306a36Sopenharmony_ci
154062306a36Sopenharmony_cido_reset:
154162306a36Sopenharmony_ci	/* Re-entry of reset thread */
154262306a36Sopenharmony_ci	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
154362306a36Sopenharmony_ci		goto kill_processes;
154462306a36Sopenharmony_ci
154562306a36Sopenharmony_ci	/*
154662306a36Sopenharmony_ci	 * Prevent concurrency in this function - only one reset should be
154762306a36Sopenharmony_ci	 * done at any given time. We need to perform this only if we didn't
154862306a36Sopenharmony_ci	 * get here from a dedicated hard reset thread.
154962306a36Sopenharmony_ci	 */
155062306a36Sopenharmony_ci	if (!from_hard_reset_thread) {
155162306a36Sopenharmony_ci		/* Block future CS/VM/JOB completion operations */
155262306a36Sopenharmony_ci		spin_lock(&hdev->reset_info.lock);
155362306a36Sopenharmony_ci		if (hdev->reset_info.in_reset) {
155462306a36Sopenharmony_ci			/* We allow scheduling of a hard reset only during a compute reset */
155562306a36Sopenharmony_ci			if (hard_reset && hdev->reset_info.in_compute_reset)
155662306a36Sopenharmony_ci				hdev->reset_info.hard_reset_schedule_flags = flags;
155762306a36Sopenharmony_ci			spin_unlock(&hdev->reset_info.lock);
155862306a36Sopenharmony_ci			return 0;
155962306a36Sopenharmony_ci		}
156062306a36Sopenharmony_ci
156162306a36Sopenharmony_ci		/* This still allows the completion of some KDMA ops
156262306a36Sopenharmony_ci		 * Update this before in_reset because in_compute_reset implies we are in reset
156362306a36Sopenharmony_ci		 */
156462306a36Sopenharmony_ci		hdev->reset_info.in_compute_reset = !hard_reset;
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci		hdev->reset_info.in_reset = 1;
156762306a36Sopenharmony_ci
156862306a36Sopenharmony_ci		spin_unlock(&hdev->reset_info.lock);
156962306a36Sopenharmony_ci
157062306a36Sopenharmony_ci		/* Cancel the device release watchdog work if required.
157162306a36Sopenharmony_ci		 * In case of reset-upon-device-release while the release watchdog work is
157262306a36Sopenharmony_ci		 * scheduled due to a hard-reset, do hard-reset instead of compute-reset.
157362306a36Sopenharmony_ci		 */
157462306a36Sopenharmony_ci		if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
157562306a36Sopenharmony_ci			struct hl_device_reset_work *watchdog_work =
157662306a36Sopenharmony_ci					&hdev->device_release_watchdog_work;
157762306a36Sopenharmony_ci
157862306a36Sopenharmony_ci			hdev->reset_info.watchdog_active = 0;
157962306a36Sopenharmony_ci			if (!from_watchdog_thread)
158062306a36Sopenharmony_ci				cancel_delayed_work_sync(&watchdog_work->reset_work);
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci			if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
158362306a36Sopenharmony_ci				hdev->reset_info.in_compute_reset = 0;
158462306a36Sopenharmony_ci				flags |= HL_DRV_RESET_HARD;
158562306a36Sopenharmony_ci				flags &= ~HL_DRV_RESET_DEV_RELEASE;
158662306a36Sopenharmony_ci				hard_reset = true;
158762306a36Sopenharmony_ci			}
158862306a36Sopenharmony_ci		}
158962306a36Sopenharmony_ci
159062306a36Sopenharmony_ci		if (delay_reset)
159162306a36Sopenharmony_ci			usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
159262306a36Sopenharmony_ci
159362306a36Sopenharmony_ciescalate_reset_flow:
159462306a36Sopenharmony_ci		handle_reset_trigger(hdev, flags);
159562306a36Sopenharmony_ci		send_disable_pci_access(hdev, flags);
159662306a36Sopenharmony_ci
159762306a36Sopenharmony_ci		/* This also blocks future CS/VM/JOB completion operations */
159862306a36Sopenharmony_ci		hdev->disabled = true;
159962306a36Sopenharmony_ci
160062306a36Sopenharmony_ci		take_release_locks(hdev);
160162306a36Sopenharmony_ci
160262306a36Sopenharmony_ci		if (hard_reset)
160362306a36Sopenharmony_ci			dev_info(hdev->dev, "Going to reset device\n");
160462306a36Sopenharmony_ci		else if (reset_upon_device_release)
160562306a36Sopenharmony_ci			dev_dbg(hdev->dev, "Going to reset device after release by user\n");
160662306a36Sopenharmony_ci		else
160762306a36Sopenharmony_ci			dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
160862306a36Sopenharmony_ci	}
160962306a36Sopenharmony_ci
161062306a36Sopenharmony_ci	if ((hard_reset) && (!from_hard_reset_thread)) {
161162306a36Sopenharmony_ci		hdev->reset_info.hard_reset_pending = true;
161262306a36Sopenharmony_ci
161362306a36Sopenharmony_ci		hdev->process_kill_trial_cnt = 0;
161462306a36Sopenharmony_ci
161562306a36Sopenharmony_ci		hdev->device_reset_work.flags = flags;
161662306a36Sopenharmony_ci
161762306a36Sopenharmony_ci		/*
161862306a36Sopenharmony_ci		 * Because the reset function can't run from heartbeat work,
161962306a36Sopenharmony_ci		 * we need to call the reset function from a dedicated work.
162062306a36Sopenharmony_ci		 */
162162306a36Sopenharmony_ci		queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0);
162262306a36Sopenharmony_ci
162362306a36Sopenharmony_ci		return 0;
162462306a36Sopenharmony_ci	}
162562306a36Sopenharmony_ci
162662306a36Sopenharmony_ci	cleanup_resources(hdev, hard_reset, fw_reset, from_dev_release);
162762306a36Sopenharmony_ci
162862306a36Sopenharmony_cikill_processes:
162962306a36Sopenharmony_ci	if (hard_reset) {
163062306a36Sopenharmony_ci		/* Kill processes here after CS rollback. This is because the
163162306a36Sopenharmony_ci		 * process can't really exit until all its CSs are done, which
163262306a36Sopenharmony_ci		 * is what we do in cs rollback
163362306a36Sopenharmony_ci		 */
163462306a36Sopenharmony_ci		rc = device_kill_open_processes(hdev, 0, false);
163562306a36Sopenharmony_ci
163662306a36Sopenharmony_ci		if (rc == -EBUSY) {
163762306a36Sopenharmony_ci			if (hdev->device_fini_pending) {
163862306a36Sopenharmony_ci				dev_crit(hdev->dev,
163962306a36Sopenharmony_ci					"%s Failed to kill all open processes, stopping hard reset\n",
164062306a36Sopenharmony_ci					dev_name(&(hdev)->pdev->dev));
164162306a36Sopenharmony_ci				goto out_err;
164262306a36Sopenharmony_ci			}
164362306a36Sopenharmony_ci
164462306a36Sopenharmony_ci			/* signal reset thread to reschedule */
164562306a36Sopenharmony_ci			return rc;
164662306a36Sopenharmony_ci		}
164762306a36Sopenharmony_ci
164862306a36Sopenharmony_ci		if (rc) {
164962306a36Sopenharmony_ci			dev_crit(hdev->dev,
165062306a36Sopenharmony_ci				"%s Failed to kill all open processes, stopping hard reset\n",
165162306a36Sopenharmony_ci				dev_name(&(hdev)->pdev->dev));
165262306a36Sopenharmony_ci			goto out_err;
165362306a36Sopenharmony_ci		}
165462306a36Sopenharmony_ci
165562306a36Sopenharmony_ci		/* Flush the Event queue workers to make sure no other thread is
165662306a36Sopenharmony_ci		 * reading or writing to registers during the reset
165762306a36Sopenharmony_ci		 */
165862306a36Sopenharmony_ci		flush_workqueue(hdev->eq_wq);
165962306a36Sopenharmony_ci	}
166062306a36Sopenharmony_ci
166162306a36Sopenharmony_ci	/* Reset the H/W. It will be in idle state after this returns */
166262306a36Sopenharmony_ci	hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
166362306a36Sopenharmony_ci
166462306a36Sopenharmony_ci	if (hard_reset) {
166562306a36Sopenharmony_ci		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
166662306a36Sopenharmony_ci
166762306a36Sopenharmony_ci		/* Release kernel context */
166862306a36Sopenharmony_ci		if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
166962306a36Sopenharmony_ci			hdev->kernel_ctx = NULL;
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci		hl_vm_fini(hdev);
167262306a36Sopenharmony_ci		hl_mmu_fini(hdev);
167362306a36Sopenharmony_ci		hl_eq_reset(hdev, &hdev->event_queue);
167462306a36Sopenharmony_ci	}
167562306a36Sopenharmony_ci
167662306a36Sopenharmony_ci	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
167762306a36Sopenharmony_ci	hl_hw_queue_reset(hdev, hard_reset);
167862306a36Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
167962306a36Sopenharmony_ci		hl_cq_reset(hdev, &hdev->completion_queue[i]);
168062306a36Sopenharmony_ci
168162306a36Sopenharmony_ci	/* Make sure the context switch phase will run again */
168262306a36Sopenharmony_ci	ctx = hl_get_compute_ctx(hdev);
168362306a36Sopenharmony_ci	if (ctx) {
168462306a36Sopenharmony_ci		atomic_set(&ctx->thread_ctx_switch_token, 1);
168562306a36Sopenharmony_ci		ctx->thread_ctx_switch_wait_token = 0;
168662306a36Sopenharmony_ci		hl_ctx_put(ctx);
168762306a36Sopenharmony_ci	}
168862306a36Sopenharmony_ci
168962306a36Sopenharmony_ci	if (hw_fini_rc) {
169062306a36Sopenharmony_ci		rc = hw_fini_rc;
169162306a36Sopenharmony_ci		goto out_err;
169262306a36Sopenharmony_ci	}
169362306a36Sopenharmony_ci	/* Finished tear-down, starting to re-initialize */
169462306a36Sopenharmony_ci
169562306a36Sopenharmony_ci	if (hard_reset) {
169662306a36Sopenharmony_ci		hdev->device_cpu_disabled = false;
169762306a36Sopenharmony_ci		hdev->reset_info.hard_reset_pending = false;
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_ci		if (hdev->reset_info.reset_trigger_repeated &&
170062306a36Sopenharmony_ci				(hdev->reset_info.prev_reset_trigger ==
170162306a36Sopenharmony_ci						HL_DRV_RESET_FW_FATAL_ERR)) {
170262306a36Sopenharmony_ci			/* if there 2 back to back resets from FW,
170362306a36Sopenharmony_ci			 * ensure driver puts the driver in a unusable state
170462306a36Sopenharmony_ci			 */
170562306a36Sopenharmony_ci			dev_crit(hdev->dev,
170662306a36Sopenharmony_ci				"%s Consecutive FW fatal errors received, stopping hard reset\n",
170762306a36Sopenharmony_ci				dev_name(&(hdev)->pdev->dev));
170862306a36Sopenharmony_ci			rc = -EIO;
170962306a36Sopenharmony_ci			goto out_err;
171062306a36Sopenharmony_ci		}
171162306a36Sopenharmony_ci
171262306a36Sopenharmony_ci		if (hdev->kernel_ctx) {
171362306a36Sopenharmony_ci			dev_crit(hdev->dev,
171462306a36Sopenharmony_ci				"%s kernel ctx was alive during hard reset, something is terribly wrong\n",
171562306a36Sopenharmony_ci				dev_name(&(hdev)->pdev->dev));
171662306a36Sopenharmony_ci			rc = -EBUSY;
171762306a36Sopenharmony_ci			goto out_err;
171862306a36Sopenharmony_ci		}
171962306a36Sopenharmony_ci
172062306a36Sopenharmony_ci		rc = hl_mmu_init(hdev);
172162306a36Sopenharmony_ci		if (rc) {
172262306a36Sopenharmony_ci			dev_err(hdev->dev,
172362306a36Sopenharmony_ci				"Failed to initialize MMU S/W after hard reset\n");
172462306a36Sopenharmony_ci			goto out_err;
172562306a36Sopenharmony_ci		}
172662306a36Sopenharmony_ci
172762306a36Sopenharmony_ci		/* Allocate the kernel context */
172862306a36Sopenharmony_ci		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
172962306a36Sopenharmony_ci						GFP_KERNEL);
173062306a36Sopenharmony_ci		if (!hdev->kernel_ctx) {
173162306a36Sopenharmony_ci			rc = -ENOMEM;
173262306a36Sopenharmony_ci			hl_mmu_fini(hdev);
173362306a36Sopenharmony_ci			goto out_err;
173462306a36Sopenharmony_ci		}
173562306a36Sopenharmony_ci
173662306a36Sopenharmony_ci		hdev->is_compute_ctx_active = false;
173762306a36Sopenharmony_ci
173862306a36Sopenharmony_ci		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
173962306a36Sopenharmony_ci		if (rc) {
174062306a36Sopenharmony_ci			dev_err(hdev->dev,
174162306a36Sopenharmony_ci				"failed to init kernel ctx in hard reset\n");
174262306a36Sopenharmony_ci			kfree(hdev->kernel_ctx);
174362306a36Sopenharmony_ci			hdev->kernel_ctx = NULL;
174462306a36Sopenharmony_ci			hl_mmu_fini(hdev);
174562306a36Sopenharmony_ci			goto out_err;
174662306a36Sopenharmony_ci		}
174762306a36Sopenharmony_ci	}
174862306a36Sopenharmony_ci
174962306a36Sopenharmony_ci	/* Device is now enabled as part of the initialization requires
175062306a36Sopenharmony_ci	 * communication with the device firmware to get information that
175162306a36Sopenharmony_ci	 * is required for the initialization itself
175262306a36Sopenharmony_ci	 */
175362306a36Sopenharmony_ci	hdev->disabled = false;
175462306a36Sopenharmony_ci
175562306a36Sopenharmony_ci	/* F/W security enabled indication might be updated after hard-reset */
175662306a36Sopenharmony_ci	if (hard_reset) {
175762306a36Sopenharmony_ci		rc = hl_fw_read_preboot_status(hdev);
175862306a36Sopenharmony_ci		if (rc)
175962306a36Sopenharmony_ci			goto out_err;
176062306a36Sopenharmony_ci	}
176162306a36Sopenharmony_ci
176262306a36Sopenharmony_ci	rc = hdev->asic_funcs->hw_init(hdev);
176362306a36Sopenharmony_ci	if (rc) {
176462306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
176562306a36Sopenharmony_ci		goto out_err;
176662306a36Sopenharmony_ci	}
176762306a36Sopenharmony_ci
176862306a36Sopenharmony_ci	/* If device is not idle fail the reset process */
176962306a36Sopenharmony_ci	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
177062306a36Sopenharmony_ci						HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
177162306a36Sopenharmony_ci		print_idle_status_mask(hdev, "device is not idle after reset", idle_mask);
177262306a36Sopenharmony_ci		rc = -EIO;
177362306a36Sopenharmony_ci		goto out_err;
177462306a36Sopenharmony_ci	}
177562306a36Sopenharmony_ci
177662306a36Sopenharmony_ci	/* Check that the communication with the device is working */
177762306a36Sopenharmony_ci	rc = hdev->asic_funcs->test_queues(hdev);
177862306a36Sopenharmony_ci	if (rc) {
177962306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
178062306a36Sopenharmony_ci		goto out_err;
178162306a36Sopenharmony_ci	}
178262306a36Sopenharmony_ci
178362306a36Sopenharmony_ci	if (hard_reset) {
178462306a36Sopenharmony_ci		rc = device_late_init(hdev);
178562306a36Sopenharmony_ci		if (rc) {
178662306a36Sopenharmony_ci			dev_err(hdev->dev, "Failed late init after hard reset\n");
178762306a36Sopenharmony_ci			goto out_err;
178862306a36Sopenharmony_ci		}
178962306a36Sopenharmony_ci
179062306a36Sopenharmony_ci		rc = hl_vm_init(hdev);
179162306a36Sopenharmony_ci		if (rc) {
179262306a36Sopenharmony_ci			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
179362306a36Sopenharmony_ci			goto out_err;
179462306a36Sopenharmony_ci		}
179562306a36Sopenharmony_ci
179662306a36Sopenharmony_ci		if (!hdev->asic_prop.fw_security_enabled)
179762306a36Sopenharmony_ci			hl_fw_set_max_power(hdev);
179862306a36Sopenharmony_ci	} else {
179962306a36Sopenharmony_ci		rc = hdev->asic_funcs->compute_reset_late_init(hdev);
180062306a36Sopenharmony_ci		if (rc) {
180162306a36Sopenharmony_ci			if (reset_upon_device_release)
180262306a36Sopenharmony_ci				dev_err(hdev->dev,
180362306a36Sopenharmony_ci					"Failed late init in reset after device release\n");
180462306a36Sopenharmony_ci			else
180562306a36Sopenharmony_ci				dev_err(hdev->dev, "Failed late init after compute reset\n");
180662306a36Sopenharmony_ci			goto out_err;
180762306a36Sopenharmony_ci		}
180862306a36Sopenharmony_ci	}
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_ci	rc = hdev->asic_funcs->scrub_device_mem(hdev);
181162306a36Sopenharmony_ci	if (rc) {
181262306a36Sopenharmony_ci		dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc);
181362306a36Sopenharmony_ci		goto out_err;
181462306a36Sopenharmony_ci	}
181562306a36Sopenharmony_ci
181662306a36Sopenharmony_ci	spin_lock(&hdev->reset_info.lock);
181762306a36Sopenharmony_ci	hdev->reset_info.in_compute_reset = 0;
181862306a36Sopenharmony_ci
181962306a36Sopenharmony_ci	/* Schedule hard reset only if requested and if not already in hard reset.
182062306a36Sopenharmony_ci	 * We keep 'in_reset' enabled, so no other reset can go in during the hard
182162306a36Sopenharmony_ci	 * reset schedule
182262306a36Sopenharmony_ci	 */
182362306a36Sopenharmony_ci	if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
182462306a36Sopenharmony_ci		schedule_hard_reset = true;
182562306a36Sopenharmony_ci	else
182662306a36Sopenharmony_ci		hdev->reset_info.in_reset = 0;
182762306a36Sopenharmony_ci
182862306a36Sopenharmony_ci	spin_unlock(&hdev->reset_info.lock);
182962306a36Sopenharmony_ci
183062306a36Sopenharmony_ci	hdev->reset_info.needs_reset = false;
183162306a36Sopenharmony_ci
183262306a36Sopenharmony_ci	if (hard_reset)
183362306a36Sopenharmony_ci		dev_info(hdev->dev,
183462306a36Sopenharmony_ci			 "Successfully finished resetting the %s device\n",
183562306a36Sopenharmony_ci			 dev_name(&(hdev)->pdev->dev));
183662306a36Sopenharmony_ci	else
183762306a36Sopenharmony_ci		dev_dbg(hdev->dev,
183862306a36Sopenharmony_ci			"Successfully finished resetting the %s device\n",
183962306a36Sopenharmony_ci			dev_name(&(hdev)->pdev->dev));
184062306a36Sopenharmony_ci
184162306a36Sopenharmony_ci	if (hard_reset) {
184262306a36Sopenharmony_ci		hdev->reset_info.hard_reset_cnt++;
184362306a36Sopenharmony_ci
184462306a36Sopenharmony_ci		/* After reset is done, we are ready to receive events from
184562306a36Sopenharmony_ci		 * the F/W. We can't do it before because we will ignore events
184662306a36Sopenharmony_ci		 * and if those events are fatal, we won't know about it and
184762306a36Sopenharmony_ci		 * the device will be operational although it shouldn't be
184862306a36Sopenharmony_ci		 */
184962306a36Sopenharmony_ci		hdev->asic_funcs->enable_events_from_fw(hdev);
185062306a36Sopenharmony_ci	} else {
185162306a36Sopenharmony_ci		if (!reset_upon_device_release)
185262306a36Sopenharmony_ci			hdev->reset_info.compute_reset_cnt++;
185362306a36Sopenharmony_ci
185462306a36Sopenharmony_ci		if (schedule_hard_reset) {
185562306a36Sopenharmony_ci			dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
185662306a36Sopenharmony_ci			flags = hdev->reset_info.hard_reset_schedule_flags;
185762306a36Sopenharmony_ci			hdev->reset_info.hard_reset_schedule_flags = 0;
185862306a36Sopenharmony_ci			hard_reset = true;
185962306a36Sopenharmony_ci			goto escalate_reset_flow;
186062306a36Sopenharmony_ci		}
186162306a36Sopenharmony_ci	}
186262306a36Sopenharmony_ci
186362306a36Sopenharmony_ci	return 0;
186462306a36Sopenharmony_ci
186562306a36Sopenharmony_ciout_err:
186662306a36Sopenharmony_ci	hdev->disabled = true;
186762306a36Sopenharmony_ci
186862306a36Sopenharmony_ci	spin_lock(&hdev->reset_info.lock);
186962306a36Sopenharmony_ci	hdev->reset_info.in_compute_reset = 0;
187062306a36Sopenharmony_ci
187162306a36Sopenharmony_ci	if (hard_reset) {
187262306a36Sopenharmony_ci		dev_err(hdev->dev,
187362306a36Sopenharmony_ci			"%s Failed to reset! Device is NOT usable\n",
187462306a36Sopenharmony_ci			dev_name(&(hdev)->pdev->dev));
187562306a36Sopenharmony_ci		hdev->reset_info.hard_reset_cnt++;
187662306a36Sopenharmony_ci	} else {
187762306a36Sopenharmony_ci		if (reset_upon_device_release) {
187862306a36Sopenharmony_ci			dev_err(hdev->dev, "Failed to reset device after user release\n");
187962306a36Sopenharmony_ci			flags &= ~HL_DRV_RESET_DEV_RELEASE;
188062306a36Sopenharmony_ci		} else {
188162306a36Sopenharmony_ci			dev_err(hdev->dev, "Failed to do compute reset\n");
188262306a36Sopenharmony_ci			hdev->reset_info.compute_reset_cnt++;
188362306a36Sopenharmony_ci		}
188462306a36Sopenharmony_ci
188562306a36Sopenharmony_ci		spin_unlock(&hdev->reset_info.lock);
188662306a36Sopenharmony_ci		flags |= HL_DRV_RESET_HARD;
188762306a36Sopenharmony_ci		hard_reset = true;
188862306a36Sopenharmony_ci		goto escalate_reset_flow;
188962306a36Sopenharmony_ci	}
189062306a36Sopenharmony_ci
189162306a36Sopenharmony_ci	hdev->reset_info.in_reset = 0;
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci	spin_unlock(&hdev->reset_info.lock);
189462306a36Sopenharmony_ci
189562306a36Sopenharmony_ci	return rc;
189662306a36Sopenharmony_ci}
189762306a36Sopenharmony_ci
189862306a36Sopenharmony_ci/*
189962306a36Sopenharmony_ci * hl_device_cond_reset() - conditionally reset the device.
190062306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure.
190162306a36Sopenharmony_ci * @reset_flags: reset flags.
190262306a36Sopenharmony_ci * @event_mask: events to notify user about.
190362306a36Sopenharmony_ci *
190462306a36Sopenharmony_ci * Conditionally reset the device, or alternatively schedule a watchdog work to reset the device
190562306a36Sopenharmony_ci * unless another reset precedes it.
190662306a36Sopenharmony_ci */
190762306a36Sopenharmony_ciint hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
190862306a36Sopenharmony_ci{
190962306a36Sopenharmony_ci	struct hl_ctx *ctx = NULL;
191062306a36Sopenharmony_ci
191162306a36Sopenharmony_ci	/* F/W reset cannot be postponed */
191262306a36Sopenharmony_ci	if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW)
191362306a36Sopenharmony_ci		goto device_reset;
191462306a36Sopenharmony_ci
191562306a36Sopenharmony_ci	/* Device release watchdog is relevant only if user exists and gets a reset notification */
191662306a36Sopenharmony_ci	if (!(event_mask & HL_NOTIFIER_EVENT_DEVICE_RESET)) {
191762306a36Sopenharmony_ci		dev_err(hdev->dev, "Resetting device without a reset indication to user\n");
191862306a36Sopenharmony_ci		goto device_reset;
191962306a36Sopenharmony_ci	}
192062306a36Sopenharmony_ci
192162306a36Sopenharmony_ci	ctx = hl_get_compute_ctx(hdev);
192262306a36Sopenharmony_ci	if (!ctx || !ctx->hpriv->notifier_event.eventfd)
192362306a36Sopenharmony_ci		goto device_reset;
192462306a36Sopenharmony_ci
192562306a36Sopenharmony_ci	/* Schedule the device release watchdog work unless reset is already in progress or if the
192662306a36Sopenharmony_ci	 * work is already scheduled.
192762306a36Sopenharmony_ci	 */
192862306a36Sopenharmony_ci	spin_lock(&hdev->reset_info.lock);
192962306a36Sopenharmony_ci	if (hdev->reset_info.in_reset) {
193062306a36Sopenharmony_ci		spin_unlock(&hdev->reset_info.lock);
193162306a36Sopenharmony_ci		goto device_reset;
193262306a36Sopenharmony_ci	}
193362306a36Sopenharmony_ci
193462306a36Sopenharmony_ci	if (hdev->reset_info.watchdog_active)
193562306a36Sopenharmony_ci		goto out;
193662306a36Sopenharmony_ci
193762306a36Sopenharmony_ci	hdev->device_release_watchdog_work.flags = flags;
193862306a36Sopenharmony_ci	dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
193962306a36Sopenharmony_ci		hdev->device_release_watchdog_timeout_sec);
194062306a36Sopenharmony_ci	schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
194162306a36Sopenharmony_ci				msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000));
194262306a36Sopenharmony_ci	hdev->reset_info.watchdog_active = 1;
194362306a36Sopenharmony_ciout:
194462306a36Sopenharmony_ci	spin_unlock(&hdev->reset_info.lock);
194562306a36Sopenharmony_ci
194662306a36Sopenharmony_ci	hl_notifier_event_send_all(hdev, event_mask);
194762306a36Sopenharmony_ci
194862306a36Sopenharmony_ci	hl_ctx_put(ctx);
194962306a36Sopenharmony_ci
195062306a36Sopenharmony_ci	hl_abort_waiting_for_completions(hdev);
195162306a36Sopenharmony_ci
195262306a36Sopenharmony_ci	return 0;
195362306a36Sopenharmony_ci
195462306a36Sopenharmony_cidevice_reset:
195562306a36Sopenharmony_ci	if (event_mask)
195662306a36Sopenharmony_ci		hl_notifier_event_send_all(hdev, event_mask);
195762306a36Sopenharmony_ci	if (ctx)
195862306a36Sopenharmony_ci		hl_ctx_put(ctx);
195962306a36Sopenharmony_ci
196062306a36Sopenharmony_ci	return hl_device_reset(hdev, flags);
196162306a36Sopenharmony_ci}
196262306a36Sopenharmony_ci
196362306a36Sopenharmony_cistatic void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
196462306a36Sopenharmony_ci{
196562306a36Sopenharmony_ci	mutex_lock(&notifier_event->lock);
196662306a36Sopenharmony_ci	notifier_event->events_mask |= event_mask;
196762306a36Sopenharmony_ci
196862306a36Sopenharmony_ci	if (notifier_event->eventfd)
196962306a36Sopenharmony_ci		eventfd_signal(notifier_event->eventfd, 1);
197062306a36Sopenharmony_ci
197162306a36Sopenharmony_ci	mutex_unlock(&notifier_event->lock);
197262306a36Sopenharmony_ci}
197362306a36Sopenharmony_ci
197462306a36Sopenharmony_ci/*
197562306a36Sopenharmony_ci * hl_notifier_event_send_all - notify all user processes via eventfd
197662306a36Sopenharmony_ci *
197762306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
197862306a36Sopenharmony_ci * @event_mask: the occurred event/s
197962306a36Sopenharmony_ci * Returns 0 for success or an error on failure.
198062306a36Sopenharmony_ci */
198162306a36Sopenharmony_civoid hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
198262306a36Sopenharmony_ci{
198362306a36Sopenharmony_ci	struct hl_fpriv	*hpriv;
198462306a36Sopenharmony_ci
198562306a36Sopenharmony_ci	if (!event_mask) {
198662306a36Sopenharmony_ci		dev_warn(hdev->dev, "Skip sending zero event");
198762306a36Sopenharmony_ci		return;
198862306a36Sopenharmony_ci	}
198962306a36Sopenharmony_ci
199062306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
199162306a36Sopenharmony_ci
199262306a36Sopenharmony_ci	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
199362306a36Sopenharmony_ci		hl_notifier_event_send(&hpriv->notifier_event, event_mask);
199462306a36Sopenharmony_ci
199562306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
199662306a36Sopenharmony_ci
199762306a36Sopenharmony_ci	/* control device */
199862306a36Sopenharmony_ci	mutex_lock(&hdev->fpriv_ctrl_list_lock);
199962306a36Sopenharmony_ci
200062306a36Sopenharmony_ci	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
200162306a36Sopenharmony_ci		hl_notifier_event_send(&hpriv->notifier_event, event_mask);
200262306a36Sopenharmony_ci
200362306a36Sopenharmony_ci	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
200462306a36Sopenharmony_ci}
200562306a36Sopenharmony_ci
200662306a36Sopenharmony_cistatic int create_cdev(struct hl_device *hdev)
200762306a36Sopenharmony_ci{
200862306a36Sopenharmony_ci	char *name;
200962306a36Sopenharmony_ci	int rc;
201062306a36Sopenharmony_ci
201162306a36Sopenharmony_ci	hdev->cdev_idx = hdev->id / 2;
201262306a36Sopenharmony_ci
201362306a36Sopenharmony_ci	name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx);
201462306a36Sopenharmony_ci	if (!name) {
201562306a36Sopenharmony_ci		rc = -ENOMEM;
201662306a36Sopenharmony_ci		goto out_err;
201762306a36Sopenharmony_ci	}
201862306a36Sopenharmony_ci
201962306a36Sopenharmony_ci	/* Initialize cdev and device structures */
202062306a36Sopenharmony_ci	rc = device_init_cdev(hdev, hdev->hclass, hdev->id, &hl_ops, name,
202162306a36Sopenharmony_ci				&hdev->cdev, &hdev->dev);
202262306a36Sopenharmony_ci
202362306a36Sopenharmony_ci	kfree(name);
202462306a36Sopenharmony_ci
202562306a36Sopenharmony_ci	if (rc)
202662306a36Sopenharmony_ci		goto out_err;
202762306a36Sopenharmony_ci
202862306a36Sopenharmony_ci	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx);
202962306a36Sopenharmony_ci	if (!name) {
203062306a36Sopenharmony_ci		rc = -ENOMEM;
203162306a36Sopenharmony_ci		goto free_dev;
203262306a36Sopenharmony_ci	}
203362306a36Sopenharmony_ci
203462306a36Sopenharmony_ci	/* Initialize cdev and device structures for control device */
203562306a36Sopenharmony_ci	rc = device_init_cdev(hdev, hdev->hclass, hdev->id_control, &hl_ctrl_ops,
203662306a36Sopenharmony_ci				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
203762306a36Sopenharmony_ci
203862306a36Sopenharmony_ci	kfree(name);
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci	if (rc)
204162306a36Sopenharmony_ci		goto free_dev;
204262306a36Sopenharmony_ci
204362306a36Sopenharmony_ci	return 0;
204462306a36Sopenharmony_ci
204562306a36Sopenharmony_cifree_dev:
204662306a36Sopenharmony_ci	put_device(hdev->dev);
204762306a36Sopenharmony_ciout_err:
204862306a36Sopenharmony_ci	return rc;
204962306a36Sopenharmony_ci}
205062306a36Sopenharmony_ci
205162306a36Sopenharmony_ci/*
205262306a36Sopenharmony_ci * hl_device_init - main initialization function for habanalabs device
205362306a36Sopenharmony_ci *
205462306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
205562306a36Sopenharmony_ci *
205662306a36Sopenharmony_ci * Allocate an id for the device, do early initialization and then call the
205762306a36Sopenharmony_ci * ASIC specific initialization functions. Finally, create the cdev and the
205862306a36Sopenharmony_ci * Linux device to expose it to the user
205962306a36Sopenharmony_ci */
206062306a36Sopenharmony_ciint hl_device_init(struct hl_device *hdev)
206162306a36Sopenharmony_ci{
206262306a36Sopenharmony_ci	int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
206362306a36Sopenharmony_ci	bool expose_interfaces_on_err = false;
206462306a36Sopenharmony_ci
206562306a36Sopenharmony_ci	rc = create_cdev(hdev);
206662306a36Sopenharmony_ci	if (rc)
206762306a36Sopenharmony_ci		goto out_disabled;
206862306a36Sopenharmony_ci
206962306a36Sopenharmony_ci	/* Initialize ASIC function pointers and perform early init */
207062306a36Sopenharmony_ci	rc = device_early_init(hdev);
207162306a36Sopenharmony_ci	if (rc)
207262306a36Sopenharmony_ci		goto free_dev;
207362306a36Sopenharmony_ci
207462306a36Sopenharmony_ci	user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
207562306a36Sopenharmony_ci				hdev->asic_prop.user_interrupt_count;
207662306a36Sopenharmony_ci
207762306a36Sopenharmony_ci	if (user_interrupt_cnt) {
207862306a36Sopenharmony_ci		hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt),
207962306a36Sopenharmony_ci						GFP_KERNEL);
208062306a36Sopenharmony_ci		if (!hdev->user_interrupt) {
208162306a36Sopenharmony_ci			rc = -ENOMEM;
208262306a36Sopenharmony_ci			goto early_fini;
208362306a36Sopenharmony_ci		}
208462306a36Sopenharmony_ci	}
208562306a36Sopenharmony_ci
208662306a36Sopenharmony_ci	/*
208762306a36Sopenharmony_ci	 * Start calling ASIC initialization. First S/W then H/W and finally
208862306a36Sopenharmony_ci	 * late init
208962306a36Sopenharmony_ci	 */
209062306a36Sopenharmony_ci	rc = hdev->asic_funcs->sw_init(hdev);
209162306a36Sopenharmony_ci	if (rc)
209262306a36Sopenharmony_ci		goto free_usr_intr_mem;
209362306a36Sopenharmony_ci
209462306a36Sopenharmony_ci
209562306a36Sopenharmony_ci	/* initialize completion structure for multi CS wait */
209662306a36Sopenharmony_ci	hl_multi_cs_completion_init(hdev);
209762306a36Sopenharmony_ci
209862306a36Sopenharmony_ci	/*
209962306a36Sopenharmony_ci	 * Initialize the H/W queues. Must be done before hw_init, because
210062306a36Sopenharmony_ci	 * there the addresses of the kernel queue are being written to the
210162306a36Sopenharmony_ci	 * registers of the device
210262306a36Sopenharmony_ci	 */
210362306a36Sopenharmony_ci	rc = hl_hw_queues_create(hdev);
210462306a36Sopenharmony_ci	if (rc) {
210562306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize kernel queues\n");
210662306a36Sopenharmony_ci		goto sw_fini;
210762306a36Sopenharmony_ci	}
210862306a36Sopenharmony_ci
210962306a36Sopenharmony_ci	cq_cnt = hdev->asic_prop.completion_queues_count;
211062306a36Sopenharmony_ci
211162306a36Sopenharmony_ci	/*
211262306a36Sopenharmony_ci	 * Initialize the completion queues. Must be done before hw_init,
211362306a36Sopenharmony_ci	 * because there the addresses of the completion queues are being
211462306a36Sopenharmony_ci	 * passed as arguments to request_irq
211562306a36Sopenharmony_ci	 */
211662306a36Sopenharmony_ci	if (cq_cnt) {
211762306a36Sopenharmony_ci		hdev->completion_queue = kcalloc(cq_cnt,
211862306a36Sopenharmony_ci				sizeof(*hdev->completion_queue),
211962306a36Sopenharmony_ci				GFP_KERNEL);
212062306a36Sopenharmony_ci
212162306a36Sopenharmony_ci		if (!hdev->completion_queue) {
212262306a36Sopenharmony_ci			dev_err(hdev->dev,
212362306a36Sopenharmony_ci				"failed to allocate completion queues\n");
212462306a36Sopenharmony_ci			rc = -ENOMEM;
212562306a36Sopenharmony_ci			goto hw_queues_destroy;
212662306a36Sopenharmony_ci		}
212762306a36Sopenharmony_ci	}
212862306a36Sopenharmony_ci
212962306a36Sopenharmony_ci	for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
213062306a36Sopenharmony_ci		rc = hl_cq_init(hdev, &hdev->completion_queue[i],
213162306a36Sopenharmony_ci				hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
213262306a36Sopenharmony_ci		if (rc) {
213362306a36Sopenharmony_ci			dev_err(hdev->dev,
213462306a36Sopenharmony_ci				"failed to initialize completion queue\n");
213562306a36Sopenharmony_ci			goto cq_fini;
213662306a36Sopenharmony_ci		}
213762306a36Sopenharmony_ci		hdev->completion_queue[i].cq_idx = i;
213862306a36Sopenharmony_ci	}
213962306a36Sopenharmony_ci
214062306a36Sopenharmony_ci	hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs,
214162306a36Sopenharmony_ci					sizeof(struct hl_cs *), GFP_KERNEL);
214262306a36Sopenharmony_ci	if (!hdev->shadow_cs_queue) {
214362306a36Sopenharmony_ci		rc = -ENOMEM;
214462306a36Sopenharmony_ci		goto cq_fini;
214562306a36Sopenharmony_ci	}
214662306a36Sopenharmony_ci
214762306a36Sopenharmony_ci	/*
214862306a36Sopenharmony_ci	 * Initialize the event queue. Must be done before hw_init,
214962306a36Sopenharmony_ci	 * because there the address of the event queue is being
215062306a36Sopenharmony_ci	 * passed as argument to request_irq
215162306a36Sopenharmony_ci	 */
215262306a36Sopenharmony_ci	rc = hl_eq_init(hdev, &hdev->event_queue);
215362306a36Sopenharmony_ci	if (rc) {
215462306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize event queue\n");
215562306a36Sopenharmony_ci		goto free_shadow_cs_queue;
215662306a36Sopenharmony_ci	}
215762306a36Sopenharmony_ci
215862306a36Sopenharmony_ci	/* MMU S/W must be initialized before kernel context is created */
215962306a36Sopenharmony_ci	rc = hl_mmu_init(hdev);
216062306a36Sopenharmony_ci	if (rc) {
216162306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
216262306a36Sopenharmony_ci		goto eq_fini;
216362306a36Sopenharmony_ci	}
216462306a36Sopenharmony_ci
216562306a36Sopenharmony_ci	/* Allocate the kernel context */
216662306a36Sopenharmony_ci	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
216762306a36Sopenharmony_ci	if (!hdev->kernel_ctx) {
216862306a36Sopenharmony_ci		rc = -ENOMEM;
216962306a36Sopenharmony_ci		goto mmu_fini;
217062306a36Sopenharmony_ci	}
217162306a36Sopenharmony_ci
217262306a36Sopenharmony_ci	hdev->is_compute_ctx_active = false;
217362306a36Sopenharmony_ci
217462306a36Sopenharmony_ci	hdev->asic_funcs->state_dump_init(hdev);
217562306a36Sopenharmony_ci
217662306a36Sopenharmony_ci	hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC;
217762306a36Sopenharmony_ci
217862306a36Sopenharmony_ci	hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL;
217962306a36Sopenharmony_ci
218062306a36Sopenharmony_ci	rc = hl_debugfs_device_init(hdev);
218162306a36Sopenharmony_ci	if (rc) {
218262306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize debugfs entry structure\n");
218362306a36Sopenharmony_ci		kfree(hdev->kernel_ctx);
218462306a36Sopenharmony_ci		goto mmu_fini;
218562306a36Sopenharmony_ci	}
218662306a36Sopenharmony_ci
218762306a36Sopenharmony_ci	/* The debugfs entry structure is accessed in hl_ctx_init(), so it must be called after
218862306a36Sopenharmony_ci	 * hl_debugfs_device_init().
218962306a36Sopenharmony_ci	 */
219062306a36Sopenharmony_ci	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
219162306a36Sopenharmony_ci	if (rc) {
219262306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize kernel context\n");
219362306a36Sopenharmony_ci		kfree(hdev->kernel_ctx);
219462306a36Sopenharmony_ci		goto debugfs_device_fini;
219562306a36Sopenharmony_ci	}
219662306a36Sopenharmony_ci
219762306a36Sopenharmony_ci	rc = hl_cb_pool_init(hdev);
219862306a36Sopenharmony_ci	if (rc) {
219962306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize CB pool\n");
220062306a36Sopenharmony_ci		goto release_ctx;
220162306a36Sopenharmony_ci	}
220262306a36Sopenharmony_ci
220362306a36Sopenharmony_ci	rc = hl_dec_init(hdev);
220462306a36Sopenharmony_ci	if (rc) {
220562306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize the decoder module\n");
220662306a36Sopenharmony_ci		goto cb_pool_fini;
220762306a36Sopenharmony_ci	}
220862306a36Sopenharmony_ci
220962306a36Sopenharmony_ci	/*
221062306a36Sopenharmony_ci	 * From this point, override rc (=0) in case of an error to allow debugging
221162306a36Sopenharmony_ci	 * (by adding char devices and creating sysfs/debugfs files as part of the error flow).
221262306a36Sopenharmony_ci	 */
221362306a36Sopenharmony_ci	expose_interfaces_on_err = true;
221462306a36Sopenharmony_ci
221562306a36Sopenharmony_ci	/* Device is now enabled as part of the initialization requires
221662306a36Sopenharmony_ci	 * communication with the device firmware to get information that
221762306a36Sopenharmony_ci	 * is required for the initialization itself
221862306a36Sopenharmony_ci	 */
221962306a36Sopenharmony_ci	hdev->disabled = false;
222062306a36Sopenharmony_ci
222162306a36Sopenharmony_ci	rc = hdev->asic_funcs->hw_init(hdev);
222262306a36Sopenharmony_ci	if (rc) {
222362306a36Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize the H/W\n");
222462306a36Sopenharmony_ci		rc = 0;
222562306a36Sopenharmony_ci		goto out_disabled;
222662306a36Sopenharmony_ci	}
222762306a36Sopenharmony_ci
222862306a36Sopenharmony_ci	/* Check that the communication with the device is working */
222962306a36Sopenharmony_ci	rc = hdev->asic_funcs->test_queues(hdev);
223062306a36Sopenharmony_ci	if (rc) {
223162306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to detect if device is alive\n");
223262306a36Sopenharmony_ci		rc = 0;
223362306a36Sopenharmony_ci		goto out_disabled;
223462306a36Sopenharmony_ci	}
223562306a36Sopenharmony_ci
223662306a36Sopenharmony_ci	rc = device_late_init(hdev);
223762306a36Sopenharmony_ci	if (rc) {
223862306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed late initialization\n");
223962306a36Sopenharmony_ci		rc = 0;
224062306a36Sopenharmony_ci		goto out_disabled;
224162306a36Sopenharmony_ci	}
224262306a36Sopenharmony_ci
224362306a36Sopenharmony_ci	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
224462306a36Sopenharmony_ci		hdev->asic_name,
224562306a36Sopenharmony_ci		hdev->asic_prop.dram_size / SZ_1G);
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci	rc = hl_vm_init(hdev);
224862306a36Sopenharmony_ci	if (rc) {
224962306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize memory module\n");
225062306a36Sopenharmony_ci		rc = 0;
225162306a36Sopenharmony_ci		goto out_disabled;
225262306a36Sopenharmony_ci	}
225362306a36Sopenharmony_ci
225462306a36Sopenharmony_ci	/*
225562306a36Sopenharmony_ci	 * Expose devices and sysfs/debugfs files to user.
225662306a36Sopenharmony_ci	 * From here there is no need to expose them in case of an error.
225762306a36Sopenharmony_ci	 */
225862306a36Sopenharmony_ci	expose_interfaces_on_err = false;
225962306a36Sopenharmony_ci	rc = cdev_sysfs_debugfs_add(hdev);
226062306a36Sopenharmony_ci	if (rc) {
226162306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n");
226262306a36Sopenharmony_ci		rc = 0;
226362306a36Sopenharmony_ci		goto out_disabled;
226462306a36Sopenharmony_ci	}
226562306a36Sopenharmony_ci
226662306a36Sopenharmony_ci	/* Need to call this again because the max power might change,
226762306a36Sopenharmony_ci	 * depending on card type for certain ASICs
226862306a36Sopenharmony_ci	 */
226962306a36Sopenharmony_ci	if (hdev->asic_prop.set_max_power_on_device_init &&
227062306a36Sopenharmony_ci			!hdev->asic_prop.fw_security_enabled)
227162306a36Sopenharmony_ci		hl_fw_set_max_power(hdev);
227262306a36Sopenharmony_ci
227362306a36Sopenharmony_ci	/*
227462306a36Sopenharmony_ci	 * hl_hwmon_init() must be called after device_late_init(), because only
227562306a36Sopenharmony_ci	 * there we get the information from the device about which
227662306a36Sopenharmony_ci	 * hwmon-related sensors the device supports.
227762306a36Sopenharmony_ci	 * Furthermore, it must be done after adding the device to the system.
227862306a36Sopenharmony_ci	 */
227962306a36Sopenharmony_ci	rc = hl_hwmon_init(hdev);
228062306a36Sopenharmony_ci	if (rc) {
228162306a36Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize hwmon\n");
228262306a36Sopenharmony_ci		rc = 0;
228362306a36Sopenharmony_ci		goto out_disabled;
228462306a36Sopenharmony_ci	}
228562306a36Sopenharmony_ci
228662306a36Sopenharmony_ci	dev_notice(hdev->dev,
228762306a36Sopenharmony_ci		"Successfully added device %s to habanalabs driver\n",
228862306a36Sopenharmony_ci		dev_name(&(hdev)->pdev->dev));
228962306a36Sopenharmony_ci
229062306a36Sopenharmony_ci	hdev->init_done = true;
229162306a36Sopenharmony_ci
229262306a36Sopenharmony_ci	/* After initialization is done, we are ready to receive events from
229362306a36Sopenharmony_ci	 * the F/W. We can't do it before because we will ignore events and if
229462306a36Sopenharmony_ci	 * those events are fatal, we won't know about it and the device will
229562306a36Sopenharmony_ci	 * be operational although it shouldn't be
229662306a36Sopenharmony_ci	 */
229762306a36Sopenharmony_ci	hdev->asic_funcs->enable_events_from_fw(hdev);
229862306a36Sopenharmony_ci
229962306a36Sopenharmony_ci	return 0;
230062306a36Sopenharmony_ci
230162306a36Sopenharmony_cicb_pool_fini:
230262306a36Sopenharmony_ci	hl_cb_pool_fini(hdev);
230362306a36Sopenharmony_cirelease_ctx:
230462306a36Sopenharmony_ci	if (hl_ctx_put(hdev->kernel_ctx) != 1)
230562306a36Sopenharmony_ci		dev_err(hdev->dev,
230662306a36Sopenharmony_ci			"kernel ctx is still alive on initialization failure\n");
230762306a36Sopenharmony_cidebugfs_device_fini:
230862306a36Sopenharmony_ci	hl_debugfs_device_fini(hdev);
230962306a36Sopenharmony_cimmu_fini:
231062306a36Sopenharmony_ci	hl_mmu_fini(hdev);
231162306a36Sopenharmony_cieq_fini:
231262306a36Sopenharmony_ci	hl_eq_fini(hdev, &hdev->event_queue);
231362306a36Sopenharmony_cifree_shadow_cs_queue:
231462306a36Sopenharmony_ci	kfree(hdev->shadow_cs_queue);
231562306a36Sopenharmony_cicq_fini:
231662306a36Sopenharmony_ci	for (i = 0 ; i < cq_ready_cnt ; i++)
231762306a36Sopenharmony_ci		hl_cq_fini(hdev, &hdev->completion_queue[i]);
231862306a36Sopenharmony_ci	kfree(hdev->completion_queue);
231962306a36Sopenharmony_cihw_queues_destroy:
232062306a36Sopenharmony_ci	hl_hw_queues_destroy(hdev);
232162306a36Sopenharmony_cisw_fini:
232262306a36Sopenharmony_ci	hdev->asic_funcs->sw_fini(hdev);
232362306a36Sopenharmony_cifree_usr_intr_mem:
232462306a36Sopenharmony_ci	kfree(hdev->user_interrupt);
232562306a36Sopenharmony_ciearly_fini:
232662306a36Sopenharmony_ci	device_early_fini(hdev);
232762306a36Sopenharmony_cifree_dev:
232862306a36Sopenharmony_ci	put_device(hdev->dev_ctrl);
232962306a36Sopenharmony_ci	put_device(hdev->dev);
233062306a36Sopenharmony_ciout_disabled:
233162306a36Sopenharmony_ci	hdev->disabled = true;
233262306a36Sopenharmony_ci	if (expose_interfaces_on_err)
233362306a36Sopenharmony_ci		cdev_sysfs_debugfs_add(hdev);
233462306a36Sopenharmony_ci	dev_err(&hdev->pdev->dev,
233562306a36Sopenharmony_ci		"Failed to initialize hl%d. Device %s is NOT usable !\n",
233662306a36Sopenharmony_ci		hdev->cdev_idx, dev_name(&hdev->pdev->dev));
233762306a36Sopenharmony_ci
233862306a36Sopenharmony_ci	return rc;
233962306a36Sopenharmony_ci}
234062306a36Sopenharmony_ci
234162306a36Sopenharmony_ci/*
234262306a36Sopenharmony_ci * hl_device_fini - main tear-down function for habanalabs device
234362306a36Sopenharmony_ci *
234462306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
234562306a36Sopenharmony_ci *
234662306a36Sopenharmony_ci * Destroy the device, call ASIC fini functions and release the id
234762306a36Sopenharmony_ci */
234862306a36Sopenharmony_civoid hl_device_fini(struct hl_device *hdev)
234962306a36Sopenharmony_ci{
235062306a36Sopenharmony_ci	bool device_in_reset;
235162306a36Sopenharmony_ci	ktime_t timeout;
235262306a36Sopenharmony_ci	u64 reset_sec;
235362306a36Sopenharmony_ci	int i, rc;
235462306a36Sopenharmony_ci
235562306a36Sopenharmony_ci	dev_info(hdev->dev, "Removing device\n");
235662306a36Sopenharmony_ci
235762306a36Sopenharmony_ci	hdev->device_fini_pending = 1;
235862306a36Sopenharmony_ci	flush_delayed_work(&hdev->device_reset_work.reset_work);
235962306a36Sopenharmony_ci
236062306a36Sopenharmony_ci	if (hdev->pldm)
236162306a36Sopenharmony_ci		reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT;
236262306a36Sopenharmony_ci	else
236362306a36Sopenharmony_ci		reset_sec = HL_HARD_RESET_MAX_TIMEOUT;
236462306a36Sopenharmony_ci
236562306a36Sopenharmony_ci	/*
236662306a36Sopenharmony_ci	 * This function is competing with the reset function, so try to
236762306a36Sopenharmony_ci	 * take the reset atomic and if we are already in middle of reset,
236862306a36Sopenharmony_ci	 * wait until reset function is finished. Reset function is designed
236962306a36Sopenharmony_ci	 * to always finish. However, in Gaudi, because of all the network
237062306a36Sopenharmony_ci	 * ports, the hard reset could take between 10-30 seconds
237162306a36Sopenharmony_ci	 */
237262306a36Sopenharmony_ci
237362306a36Sopenharmony_ci	timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
237462306a36Sopenharmony_ci
237562306a36Sopenharmony_ci	spin_lock(&hdev->reset_info.lock);
237662306a36Sopenharmony_ci	device_in_reset = !!hdev->reset_info.in_reset;
237762306a36Sopenharmony_ci	if (!device_in_reset)
237862306a36Sopenharmony_ci		hdev->reset_info.in_reset = 1;
237962306a36Sopenharmony_ci	spin_unlock(&hdev->reset_info.lock);
238062306a36Sopenharmony_ci
238162306a36Sopenharmony_ci	while (device_in_reset) {
238262306a36Sopenharmony_ci		usleep_range(50, 200);
238362306a36Sopenharmony_ci
238462306a36Sopenharmony_ci		spin_lock(&hdev->reset_info.lock);
238562306a36Sopenharmony_ci		device_in_reset = !!hdev->reset_info.in_reset;
238662306a36Sopenharmony_ci		if (!device_in_reset)
238762306a36Sopenharmony_ci			hdev->reset_info.in_reset = 1;
238862306a36Sopenharmony_ci		spin_unlock(&hdev->reset_info.lock);
238962306a36Sopenharmony_ci
239062306a36Sopenharmony_ci		if (ktime_compare(ktime_get(), timeout) > 0) {
239162306a36Sopenharmony_ci			dev_crit(hdev->dev,
239262306a36Sopenharmony_ci				"%s Failed to remove device because reset function did not finish\n",
239362306a36Sopenharmony_ci				dev_name(&(hdev)->pdev->dev));
239462306a36Sopenharmony_ci			return;
239562306a36Sopenharmony_ci		}
239662306a36Sopenharmony_ci	}
239762306a36Sopenharmony_ci
239862306a36Sopenharmony_ci	cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work);
239962306a36Sopenharmony_ci
240062306a36Sopenharmony_ci	/* Disable PCI access from device F/W so it won't send us additional
240162306a36Sopenharmony_ci	 * interrupts. We disable MSI/MSI-X at the halt_engines function and we
240262306a36Sopenharmony_ci	 * can't have the F/W sending us interrupts after that. We need to
240362306a36Sopenharmony_ci	 * disable the access here because if the device is marked disable, the
240462306a36Sopenharmony_ci	 * message won't be send. Also, in case of heartbeat, the device CPU is
240562306a36Sopenharmony_ci	 * marked as disable so this message won't be sent
240662306a36Sopenharmony_ci	 */
240762306a36Sopenharmony_ci	hl_fw_send_pci_access_msg(hdev,	CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
240862306a36Sopenharmony_ci
240962306a36Sopenharmony_ci	/* Mark device as disabled */
241062306a36Sopenharmony_ci	hdev->disabled = true;
241162306a36Sopenharmony_ci
241262306a36Sopenharmony_ci	take_release_locks(hdev);
241362306a36Sopenharmony_ci
241462306a36Sopenharmony_ci	hdev->reset_info.hard_reset_pending = true;
241562306a36Sopenharmony_ci
241662306a36Sopenharmony_ci	hl_hwmon_fini(hdev);
241762306a36Sopenharmony_ci
241862306a36Sopenharmony_ci	cleanup_resources(hdev, true, false, false);
241962306a36Sopenharmony_ci
242062306a36Sopenharmony_ci	/* Kill processes here after CS rollback. This is because the process
242162306a36Sopenharmony_ci	 * can't really exit until all its CSs are done, which is what we
242262306a36Sopenharmony_ci	 * do in cs rollback
242362306a36Sopenharmony_ci	 */
242462306a36Sopenharmony_ci	dev_info(hdev->dev,
242562306a36Sopenharmony_ci		"Waiting for all processes to exit (timeout of %u seconds)",
242662306a36Sopenharmony_ci		HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI);
242762306a36Sopenharmony_ci
242862306a36Sopenharmony_ci	hdev->process_kill_trial_cnt = 0;
242962306a36Sopenharmony_ci	rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false);
243062306a36Sopenharmony_ci	if (rc) {
243162306a36Sopenharmony_ci		dev_crit(hdev->dev, "Failed to kill all open processes\n");
243262306a36Sopenharmony_ci		device_disable_open_processes(hdev, false);
243362306a36Sopenharmony_ci	}
243462306a36Sopenharmony_ci
243562306a36Sopenharmony_ci	hdev->process_kill_trial_cnt = 0;
243662306a36Sopenharmony_ci	rc = device_kill_open_processes(hdev, 0, true);
243762306a36Sopenharmony_ci	if (rc) {
243862306a36Sopenharmony_ci		dev_crit(hdev->dev, "Failed to kill all control device open processes\n");
243962306a36Sopenharmony_ci		device_disable_open_processes(hdev, true);
244062306a36Sopenharmony_ci	}
244162306a36Sopenharmony_ci
244262306a36Sopenharmony_ci	hl_cb_pool_fini(hdev);
244362306a36Sopenharmony_ci
244462306a36Sopenharmony_ci	/* Reset the H/W. It will be in idle state after this returns */
244562306a36Sopenharmony_ci	rc = hdev->asic_funcs->hw_fini(hdev, true, false);
244662306a36Sopenharmony_ci	if (rc)
244762306a36Sopenharmony_ci		dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
244862306a36Sopenharmony_ci
244962306a36Sopenharmony_ci	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
245062306a36Sopenharmony_ci
245162306a36Sopenharmony_ci	/* Release kernel context */
245262306a36Sopenharmony_ci	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
245362306a36Sopenharmony_ci		dev_err(hdev->dev, "kernel ctx is still alive\n");
245462306a36Sopenharmony_ci
245562306a36Sopenharmony_ci	hl_dec_fini(hdev);
245662306a36Sopenharmony_ci
245762306a36Sopenharmony_ci	hl_vm_fini(hdev);
245862306a36Sopenharmony_ci
245962306a36Sopenharmony_ci	hl_mmu_fini(hdev);
246062306a36Sopenharmony_ci
246162306a36Sopenharmony_ci	vfree(hdev->captured_err_info.page_fault_info.user_mappings);
246262306a36Sopenharmony_ci
246362306a36Sopenharmony_ci	hl_eq_fini(hdev, &hdev->event_queue);
246462306a36Sopenharmony_ci
246562306a36Sopenharmony_ci	kfree(hdev->shadow_cs_queue);
246662306a36Sopenharmony_ci
246762306a36Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
246862306a36Sopenharmony_ci		hl_cq_fini(hdev, &hdev->completion_queue[i]);
246962306a36Sopenharmony_ci	kfree(hdev->completion_queue);
247062306a36Sopenharmony_ci	kfree(hdev->user_interrupt);
247162306a36Sopenharmony_ci
247262306a36Sopenharmony_ci	hl_hw_queues_destroy(hdev);
247362306a36Sopenharmony_ci
247462306a36Sopenharmony_ci	/* Call ASIC S/W finalize function */
247562306a36Sopenharmony_ci	hdev->asic_funcs->sw_fini(hdev);
247662306a36Sopenharmony_ci
247762306a36Sopenharmony_ci	device_early_fini(hdev);
247862306a36Sopenharmony_ci
247962306a36Sopenharmony_ci	/* Hide devices and sysfs/debugfs files from user */
248062306a36Sopenharmony_ci	cdev_sysfs_debugfs_remove(hdev);
248162306a36Sopenharmony_ci
248262306a36Sopenharmony_ci	hl_debugfs_device_fini(hdev);
248362306a36Sopenharmony_ci
248462306a36Sopenharmony_ci	pr_info("removed device successfully\n");
248562306a36Sopenharmony_ci}
248662306a36Sopenharmony_ci
248762306a36Sopenharmony_ci/*
248862306a36Sopenharmony_ci * MMIO register access helper functions.
248962306a36Sopenharmony_ci */
249062306a36Sopenharmony_ci
249162306a36Sopenharmony_ci/*
249262306a36Sopenharmony_ci * hl_rreg - Read an MMIO register
249362306a36Sopenharmony_ci *
249462306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
249562306a36Sopenharmony_ci * @reg: MMIO register offset (in bytes)
249662306a36Sopenharmony_ci *
249762306a36Sopenharmony_ci * Returns the value of the MMIO register we are asked to read
249862306a36Sopenharmony_ci *
249962306a36Sopenharmony_ci */
250062306a36Sopenharmony_ciinline u32 hl_rreg(struct hl_device *hdev, u32 reg)
250162306a36Sopenharmony_ci{
250262306a36Sopenharmony_ci	u32 val = readl(hdev->rmmio + reg);
250362306a36Sopenharmony_ci
250462306a36Sopenharmony_ci	if (unlikely(trace_habanalabs_rreg32_enabled()))
250562306a36Sopenharmony_ci		trace_habanalabs_rreg32(hdev->dev, reg, val);
250662306a36Sopenharmony_ci
250762306a36Sopenharmony_ci	return val;
250862306a36Sopenharmony_ci}
250962306a36Sopenharmony_ci
251062306a36Sopenharmony_ci/*
251162306a36Sopenharmony_ci * hl_wreg - Write to an MMIO register
251262306a36Sopenharmony_ci *
251362306a36Sopenharmony_ci * @hdev: pointer to habanalabs device structure
251462306a36Sopenharmony_ci * @reg: MMIO register offset (in bytes)
251562306a36Sopenharmony_ci * @val: 32-bit value
251662306a36Sopenharmony_ci *
251762306a36Sopenharmony_ci * Writes the 32-bit value into the MMIO register
251862306a36Sopenharmony_ci *
251962306a36Sopenharmony_ci */
252062306a36Sopenharmony_ciinline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
252162306a36Sopenharmony_ci{
252262306a36Sopenharmony_ci	if (unlikely(trace_habanalabs_wreg32_enabled()))
252362306a36Sopenharmony_ci		trace_habanalabs_wreg32(hdev->dev, reg, val);
252462306a36Sopenharmony_ci
252562306a36Sopenharmony_ci	writel(val, hdev->rmmio + reg);
252662306a36Sopenharmony_ci}
252762306a36Sopenharmony_ci
252862306a36Sopenharmony_civoid hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
252962306a36Sopenharmony_ci			u8 flags)
253062306a36Sopenharmony_ci{
253162306a36Sopenharmony_ci	struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info;
253262306a36Sopenharmony_ci
253362306a36Sopenharmony_ci	if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) {
253462306a36Sopenharmony_ci		dev_err(hdev->dev,
253562306a36Sopenharmony_ci				"Number of possible razwi initiators (%u) exceeded limit (%u)\n",
253662306a36Sopenharmony_ci				num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR);
253762306a36Sopenharmony_ci		return;
253862306a36Sopenharmony_ci	}
253962306a36Sopenharmony_ci
254062306a36Sopenharmony_ci	/* In case it's the first razwi since the device was opened, capture its parameters */
254162306a36Sopenharmony_ci	if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info.razwi_detected, 0, 1))
254262306a36Sopenharmony_ci		return;
254362306a36Sopenharmony_ci
254462306a36Sopenharmony_ci	razwi_info->razwi.timestamp = ktime_to_ns(ktime_get());
254562306a36Sopenharmony_ci	razwi_info->razwi.addr = addr;
254662306a36Sopenharmony_ci	razwi_info->razwi.num_of_possible_engines = num_of_engines;
254762306a36Sopenharmony_ci	memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0],
254862306a36Sopenharmony_ci			num_of_engines * sizeof(u16));
254962306a36Sopenharmony_ci	razwi_info->razwi.flags = flags;
255062306a36Sopenharmony_ci
255162306a36Sopenharmony_ci	razwi_info->razwi_info_available = true;
255262306a36Sopenharmony_ci}
255362306a36Sopenharmony_ci
255462306a36Sopenharmony_civoid hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
255562306a36Sopenharmony_ci			u8 flags, u64 *event_mask)
255662306a36Sopenharmony_ci{
255762306a36Sopenharmony_ci	hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
255862306a36Sopenharmony_ci
255962306a36Sopenharmony_ci	if (event_mask)
256062306a36Sopenharmony_ci		*event_mask |= HL_NOTIFIER_EVENT_RAZWI;
256162306a36Sopenharmony_ci}
256262306a36Sopenharmony_ci
256362306a36Sopenharmony_cistatic void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
256462306a36Sopenharmony_ci{
256562306a36Sopenharmony_ci	struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
256662306a36Sopenharmony_ci	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
256762306a36Sopenharmony_ci	struct hl_vm_hash_node *hnode;
256862306a36Sopenharmony_ci	struct hl_userptr *userptr;
256962306a36Sopenharmony_ci	enum vm_type *vm_type;
257062306a36Sopenharmony_ci	struct hl_ctx *ctx;
257162306a36Sopenharmony_ci	u32 map_idx = 0;
257262306a36Sopenharmony_ci	int i;
257362306a36Sopenharmony_ci
257462306a36Sopenharmony_ci	/* Reset previous session count*/
257562306a36Sopenharmony_ci	pgf_info->num_of_user_mappings = 0;
257662306a36Sopenharmony_ci
257762306a36Sopenharmony_ci	ctx = hl_get_compute_ctx(hdev);
257862306a36Sopenharmony_ci	if (!ctx) {
257962306a36Sopenharmony_ci		dev_err(hdev->dev, "Can't get user context for user mappings\n");
258062306a36Sopenharmony_ci		return;
258162306a36Sopenharmony_ci	}
258262306a36Sopenharmony_ci
258362306a36Sopenharmony_ci	mutex_lock(&ctx->mem_hash_lock);
258462306a36Sopenharmony_ci	hash_for_each(ctx->mem_hash, i, hnode, node) {
258562306a36Sopenharmony_ci		vm_type = hnode->ptr;
258662306a36Sopenharmony_ci		if (((*vm_type == VM_TYPE_USERPTR) && is_pmmu) ||
258762306a36Sopenharmony_ci				((*vm_type == VM_TYPE_PHYS_PACK) && !is_pmmu))
258862306a36Sopenharmony_ci			pgf_info->num_of_user_mappings++;
258962306a36Sopenharmony_ci
259062306a36Sopenharmony_ci	}
259162306a36Sopenharmony_ci
259262306a36Sopenharmony_ci	if (!pgf_info->num_of_user_mappings)
259362306a36Sopenharmony_ci		goto finish;
259462306a36Sopenharmony_ci
259562306a36Sopenharmony_ci	/* In case we already allocated in previous session, need to release it before
259662306a36Sopenharmony_ci	 * allocating new buffer.
259762306a36Sopenharmony_ci	 */
259862306a36Sopenharmony_ci	vfree(pgf_info->user_mappings);
259962306a36Sopenharmony_ci	pgf_info->user_mappings =
260062306a36Sopenharmony_ci			vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
260162306a36Sopenharmony_ci	if (!pgf_info->user_mappings) {
260262306a36Sopenharmony_ci		pgf_info->num_of_user_mappings = 0;
260362306a36Sopenharmony_ci		goto finish;
260462306a36Sopenharmony_ci	}
260562306a36Sopenharmony_ci
260662306a36Sopenharmony_ci	hash_for_each(ctx->mem_hash, i, hnode, node) {
260762306a36Sopenharmony_ci		vm_type = hnode->ptr;
260862306a36Sopenharmony_ci		if ((*vm_type == VM_TYPE_USERPTR) && (is_pmmu)) {
260962306a36Sopenharmony_ci			userptr = hnode->ptr;
261062306a36Sopenharmony_ci			pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
261162306a36Sopenharmony_ci			pgf_info->user_mappings[map_idx].size = userptr->size;
261262306a36Sopenharmony_ci			map_idx++;
261362306a36Sopenharmony_ci		} else if ((*vm_type == VM_TYPE_PHYS_PACK) && (!is_pmmu)) {
261462306a36Sopenharmony_ci			phys_pg_pack = hnode->ptr;
261562306a36Sopenharmony_ci			pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
261662306a36Sopenharmony_ci			pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size;
261762306a36Sopenharmony_ci			map_idx++;
261862306a36Sopenharmony_ci		}
261962306a36Sopenharmony_ci	}
262062306a36Sopenharmony_cifinish:
262162306a36Sopenharmony_ci	mutex_unlock(&ctx->mem_hash_lock);
262262306a36Sopenharmony_ci	hl_ctx_put(ctx);
262362306a36Sopenharmony_ci}
262462306a36Sopenharmony_ci
262562306a36Sopenharmony_civoid hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu)
262662306a36Sopenharmony_ci{
262762306a36Sopenharmony_ci	struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
262862306a36Sopenharmony_ci
262962306a36Sopenharmony_ci	/* Capture only the first page fault */
263062306a36Sopenharmony_ci	if (atomic_cmpxchg(&pgf_info->page_fault_detected, 0, 1))
263162306a36Sopenharmony_ci		return;
263262306a36Sopenharmony_ci
263362306a36Sopenharmony_ci	pgf_info->page_fault.timestamp = ktime_to_ns(ktime_get());
263462306a36Sopenharmony_ci	pgf_info->page_fault.addr = addr;
263562306a36Sopenharmony_ci	pgf_info->page_fault.engine_id = eng_id;
263662306a36Sopenharmony_ci	hl_capture_user_mappings(hdev, is_pmmu);
263762306a36Sopenharmony_ci
263862306a36Sopenharmony_ci	pgf_info->page_fault_info_available = true;
263962306a36Sopenharmony_ci}
264062306a36Sopenharmony_ci
264162306a36Sopenharmony_civoid hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
264262306a36Sopenharmony_ci				u64 *event_mask)
264362306a36Sopenharmony_ci{
264462306a36Sopenharmony_ci	hl_capture_page_fault(hdev, addr, eng_id, is_pmmu);
264562306a36Sopenharmony_ci
264662306a36Sopenharmony_ci	if (event_mask)
264762306a36Sopenharmony_ci		*event_mask |=  HL_NOTIFIER_EVENT_PAGE_FAULT;
264862306a36Sopenharmony_ci}
264962306a36Sopenharmony_ci
265062306a36Sopenharmony_cistatic void hl_capture_hw_err(struct hl_device *hdev, u16 event_id)
265162306a36Sopenharmony_ci{
265262306a36Sopenharmony_ci	struct hw_err_info *info = &hdev->captured_err_info.hw_err;
265362306a36Sopenharmony_ci
265462306a36Sopenharmony_ci	/* Capture only the first HW err */
265562306a36Sopenharmony_ci	if (atomic_cmpxchg(&info->event_detected, 0, 1))
265662306a36Sopenharmony_ci		return;
265762306a36Sopenharmony_ci
265862306a36Sopenharmony_ci	info->event.timestamp = ktime_to_ns(ktime_get());
265962306a36Sopenharmony_ci	info->event.event_id = event_id;
266062306a36Sopenharmony_ci
266162306a36Sopenharmony_ci	info->event_info_available = true;
266262306a36Sopenharmony_ci}
266362306a36Sopenharmony_ci
266462306a36Sopenharmony_civoid hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask)
266562306a36Sopenharmony_ci{
266662306a36Sopenharmony_ci	hl_capture_hw_err(hdev, event_id);
266762306a36Sopenharmony_ci
266862306a36Sopenharmony_ci	if (event_mask)
266962306a36Sopenharmony_ci		*event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR;
267062306a36Sopenharmony_ci}
267162306a36Sopenharmony_ci
267262306a36Sopenharmony_cistatic void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info)
267362306a36Sopenharmony_ci{
267462306a36Sopenharmony_ci	struct fw_err_info *info = &hdev->captured_err_info.fw_err;
267562306a36Sopenharmony_ci
267662306a36Sopenharmony_ci	/* Capture only the first FW error */
267762306a36Sopenharmony_ci	if (atomic_cmpxchg(&info->event_detected, 0, 1))
267862306a36Sopenharmony_ci		return;
267962306a36Sopenharmony_ci
268062306a36Sopenharmony_ci	info->event.timestamp = ktime_to_ns(ktime_get());
268162306a36Sopenharmony_ci	info->event.err_type = fw_info->err_type;
268262306a36Sopenharmony_ci	if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
268362306a36Sopenharmony_ci		info->event.event_id = fw_info->event_id;
268462306a36Sopenharmony_ci
268562306a36Sopenharmony_ci	info->event_info_available = true;
268662306a36Sopenharmony_ci}
268762306a36Sopenharmony_ci
268862306a36Sopenharmony_civoid hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
268962306a36Sopenharmony_ci{
269062306a36Sopenharmony_ci	hl_capture_fw_err(hdev, info);
269162306a36Sopenharmony_ci
269262306a36Sopenharmony_ci	if (info->event_mask)
269362306a36Sopenharmony_ci		*info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
269462306a36Sopenharmony_ci}
269562306a36Sopenharmony_ci
269662306a36Sopenharmony_civoid hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
269762306a36Sopenharmony_ci{
269862306a36Sopenharmony_ci	vfree(captured_err_info->page_fault_info.user_mappings);
269962306a36Sopenharmony_ci	memset(captured_err_info, 0, sizeof(struct hl_error_info));
270062306a36Sopenharmony_ci	atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
270162306a36Sopenharmony_ci	captured_err_info->undef_opcode.write_enable = true;
270262306a36Sopenharmony_ci}
2703