18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci
38c2ecf20Sopenharmony_ci/*
48c2ecf20Sopenharmony_ci * Copyright 2016-2019 HabanaLabs, Ltd.
58c2ecf20Sopenharmony_ci * All Rights Reserved.
68c2ecf20Sopenharmony_ci */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci#define pr_fmt(fmt)			"habanalabs: " fmt
98c2ecf20Sopenharmony_ci
108c2ecf20Sopenharmony_ci#include "habanalabs.h"
118c2ecf20Sopenharmony_ci
128c2ecf20Sopenharmony_ci#include <linux/pci.h>
138c2ecf20Sopenharmony_ci#include <linux/sched/signal.h>
148c2ecf20Sopenharmony_ci#include <linux/hwmon.h>
158c2ecf20Sopenharmony_ci#include <uapi/misc/habanalabs.h>
168c2ecf20Sopenharmony_ci
178c2ecf20Sopenharmony_ci#define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_cibool hl_device_disabled_or_in_reset(struct hl_device *hdev)
208c2ecf20Sopenharmony_ci{
218c2ecf20Sopenharmony_ci	if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
228c2ecf20Sopenharmony_ci		return true;
238c2ecf20Sopenharmony_ci	else
248c2ecf20Sopenharmony_ci		return false;
258c2ecf20Sopenharmony_ci}
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_cienum hl_device_status hl_device_status(struct hl_device *hdev)
288c2ecf20Sopenharmony_ci{
298c2ecf20Sopenharmony_ci	enum hl_device_status status;
308c2ecf20Sopenharmony_ci
318c2ecf20Sopenharmony_ci	if (hdev->disabled)
328c2ecf20Sopenharmony_ci		status = HL_DEVICE_STATUS_MALFUNCTION;
338c2ecf20Sopenharmony_ci	else if (atomic_read(&hdev->in_reset))
348c2ecf20Sopenharmony_ci		status = HL_DEVICE_STATUS_IN_RESET;
358c2ecf20Sopenharmony_ci	else
368c2ecf20Sopenharmony_ci		status = HL_DEVICE_STATUS_OPERATIONAL;
378c2ecf20Sopenharmony_ci
388c2ecf20Sopenharmony_ci	return status;
398c2ecf20Sopenharmony_ci}
408c2ecf20Sopenharmony_ci
418c2ecf20Sopenharmony_cistatic void hpriv_release(struct kref *ref)
428c2ecf20Sopenharmony_ci{
438c2ecf20Sopenharmony_ci	struct hl_fpriv *hpriv;
448c2ecf20Sopenharmony_ci	struct hl_device *hdev;
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	hpriv = container_of(ref, struct hl_fpriv, refcount);
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_ci	hdev = hpriv->hdev;
498c2ecf20Sopenharmony_ci
508c2ecf20Sopenharmony_ci	put_pid(hpriv->taskpid);
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_ci	hl_debugfs_remove_file(hpriv);
538c2ecf20Sopenharmony_ci
548c2ecf20Sopenharmony_ci	mutex_destroy(&hpriv->restore_phase_mutex);
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
578c2ecf20Sopenharmony_ci	list_del(&hpriv->dev_node);
588c2ecf20Sopenharmony_ci	hdev->compute_ctx = NULL;
598c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
608c2ecf20Sopenharmony_ci
618c2ecf20Sopenharmony_ci	kfree(hpriv);
628c2ecf20Sopenharmony_ci}
638c2ecf20Sopenharmony_ci
648c2ecf20Sopenharmony_civoid hl_hpriv_get(struct hl_fpriv *hpriv)
658c2ecf20Sopenharmony_ci{
668c2ecf20Sopenharmony_ci	kref_get(&hpriv->refcount);
678c2ecf20Sopenharmony_ci}
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_civoid hl_hpriv_put(struct hl_fpriv *hpriv)
708c2ecf20Sopenharmony_ci{
718c2ecf20Sopenharmony_ci	kref_put(&hpriv->refcount, hpriv_release);
728c2ecf20Sopenharmony_ci}
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ci/*
758c2ecf20Sopenharmony_ci * hl_device_release - release function for habanalabs device
768c2ecf20Sopenharmony_ci *
778c2ecf20Sopenharmony_ci * @inode: pointer to inode structure
788c2ecf20Sopenharmony_ci * @filp: pointer to file structure
798c2ecf20Sopenharmony_ci *
808c2ecf20Sopenharmony_ci * Called when process closes an habanalabs device
818c2ecf20Sopenharmony_ci */
828c2ecf20Sopenharmony_cistatic int hl_device_release(struct inode *inode, struct file *filp)
838c2ecf20Sopenharmony_ci{
848c2ecf20Sopenharmony_ci	struct hl_fpriv *hpriv = filp->private_data;
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
878c2ecf20Sopenharmony_ci	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ci	filp->private_data = NULL;
908c2ecf20Sopenharmony_ci
918c2ecf20Sopenharmony_ci	hl_hpriv_put(hpriv);
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci	return 0;
948c2ecf20Sopenharmony_ci}
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_cistatic int hl_device_release_ctrl(struct inode *inode, struct file *filp)
978c2ecf20Sopenharmony_ci{
988c2ecf20Sopenharmony_ci	struct hl_fpriv *hpriv = filp->private_data;
998c2ecf20Sopenharmony_ci	struct hl_device *hdev;
1008c2ecf20Sopenharmony_ci
1018c2ecf20Sopenharmony_ci	filp->private_data = NULL;
1028c2ecf20Sopenharmony_ci
1038c2ecf20Sopenharmony_ci	hdev = hpriv->hdev;
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
1068c2ecf20Sopenharmony_ci	list_del(&hpriv->dev_node);
1078c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci	put_pid(hpriv->taskpid);
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_ci	kfree(hpriv);
1128c2ecf20Sopenharmony_ci
1138c2ecf20Sopenharmony_ci	return 0;
1148c2ecf20Sopenharmony_ci}
1158c2ecf20Sopenharmony_ci
1168c2ecf20Sopenharmony_ci/*
1178c2ecf20Sopenharmony_ci * hl_mmap - mmap function for habanalabs device
1188c2ecf20Sopenharmony_ci *
1198c2ecf20Sopenharmony_ci * @*filp: pointer to file structure
1208c2ecf20Sopenharmony_ci * @*vma: pointer to vm_area_struct of the process
1218c2ecf20Sopenharmony_ci *
1228c2ecf20Sopenharmony_ci * Called when process does an mmap on habanalabs device. Call the device's mmap
1238c2ecf20Sopenharmony_ci * function at the end of the common code.
1248c2ecf20Sopenharmony_ci */
1258c2ecf20Sopenharmony_cistatic int hl_mmap(struct file *filp, struct vm_area_struct *vma)
1268c2ecf20Sopenharmony_ci{
1278c2ecf20Sopenharmony_ci	struct hl_fpriv *hpriv = filp->private_data;
1288c2ecf20Sopenharmony_ci	unsigned long vm_pgoff;
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci	vm_pgoff = vma->vm_pgoff;
1318c2ecf20Sopenharmony_ci	vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
1348c2ecf20Sopenharmony_ci	case HL_MMAP_TYPE_CB:
1358c2ecf20Sopenharmony_ci		return hl_cb_mmap(hpriv, vma);
1368c2ecf20Sopenharmony_ci	}
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_ci	return -EINVAL;
1398c2ecf20Sopenharmony_ci}
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_cistatic const struct file_operations hl_ops = {
1428c2ecf20Sopenharmony_ci	.owner = THIS_MODULE,
1438c2ecf20Sopenharmony_ci	.open = hl_device_open,
1448c2ecf20Sopenharmony_ci	.release = hl_device_release,
1458c2ecf20Sopenharmony_ci	.mmap = hl_mmap,
1468c2ecf20Sopenharmony_ci	.unlocked_ioctl = hl_ioctl,
1478c2ecf20Sopenharmony_ci	.compat_ioctl = hl_ioctl
1488c2ecf20Sopenharmony_ci};
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_cistatic const struct file_operations hl_ctrl_ops = {
1518c2ecf20Sopenharmony_ci	.owner = THIS_MODULE,
1528c2ecf20Sopenharmony_ci	.open = hl_device_open_ctrl,
1538c2ecf20Sopenharmony_ci	.release = hl_device_release_ctrl,
1548c2ecf20Sopenharmony_ci	.unlocked_ioctl = hl_ioctl_control,
1558c2ecf20Sopenharmony_ci	.compat_ioctl = hl_ioctl_control
1568c2ecf20Sopenharmony_ci};
1578c2ecf20Sopenharmony_ci
/*
 * device_release_func - release callback installed by device_init_cdev()
 *
 * @dev: pointer to the device object to free
 *
 * The device objects are allocated with kzalloc() in device_init_cdev(), so
 * releasing one is a plain kfree()
 */
static void device_release_func(struct device *dev)
{
	kfree(dev);
}
1628c2ecf20Sopenharmony_ci
1638c2ecf20Sopenharmony_ci/*
1648c2ecf20Sopenharmony_ci * device_init_cdev - Initialize cdev and device for habanalabs device
1658c2ecf20Sopenharmony_ci *
1668c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
1678c2ecf20Sopenharmony_ci * @hclass: pointer to the class object of the device
1688c2ecf20Sopenharmony_ci * @minor: minor number of the specific device
1698c2ecf20Sopenharmony_ci * @fpos: file operations to install for this device
1708c2ecf20Sopenharmony_ci * @name: name of the device as it will appear in the filesystem
1718c2ecf20Sopenharmony_ci * @cdev: pointer to the char device object that will be initialized
1728c2ecf20Sopenharmony_ci * @dev: pointer to the device object that will be initialized
1738c2ecf20Sopenharmony_ci *
1748c2ecf20Sopenharmony_ci * Initialize a cdev and a Linux device for habanalabs's device.
1758c2ecf20Sopenharmony_ci */
1768c2ecf20Sopenharmony_cistatic int device_init_cdev(struct hl_device *hdev, struct class *hclass,
1778c2ecf20Sopenharmony_ci				int minor, const struct file_operations *fops,
1788c2ecf20Sopenharmony_ci				char *name, struct cdev *cdev,
1798c2ecf20Sopenharmony_ci				struct device **dev)
1808c2ecf20Sopenharmony_ci{
1818c2ecf20Sopenharmony_ci	cdev_init(cdev, fops);
1828c2ecf20Sopenharmony_ci	cdev->owner = THIS_MODULE;
1838c2ecf20Sopenharmony_ci
1848c2ecf20Sopenharmony_ci	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
1858c2ecf20Sopenharmony_ci	if (!*dev)
1868c2ecf20Sopenharmony_ci		return -ENOMEM;
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci	device_initialize(*dev);
1898c2ecf20Sopenharmony_ci	(*dev)->devt = MKDEV(hdev->major, minor);
1908c2ecf20Sopenharmony_ci	(*dev)->class = hclass;
1918c2ecf20Sopenharmony_ci	(*dev)->release = device_release_func;
1928c2ecf20Sopenharmony_ci	dev_set_drvdata(*dev, hdev);
1938c2ecf20Sopenharmony_ci	dev_set_name(*dev, "%s", name);
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_ci	return 0;
1968c2ecf20Sopenharmony_ci}
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_cistatic int device_cdev_sysfs_add(struct hl_device *hdev)
1998c2ecf20Sopenharmony_ci{
2008c2ecf20Sopenharmony_ci	int rc;
2018c2ecf20Sopenharmony_ci
2028c2ecf20Sopenharmony_ci	rc = cdev_device_add(&hdev->cdev, hdev->dev);
2038c2ecf20Sopenharmony_ci	if (rc) {
2048c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
2058c2ecf20Sopenharmony_ci			"failed to add a char device to the system\n");
2068c2ecf20Sopenharmony_ci		return rc;
2078c2ecf20Sopenharmony_ci	}
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_ci	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
2108c2ecf20Sopenharmony_ci	if (rc) {
2118c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
2128c2ecf20Sopenharmony_ci			"failed to add a control char device to the system\n");
2138c2ecf20Sopenharmony_ci		goto delete_cdev_device;
2148c2ecf20Sopenharmony_ci	}
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	/* hl_sysfs_init() must be done after adding the device to the system */
2178c2ecf20Sopenharmony_ci	rc = hl_sysfs_init(hdev);
2188c2ecf20Sopenharmony_ci	if (rc) {
2198c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize sysfs\n");
2208c2ecf20Sopenharmony_ci		goto delete_ctrl_cdev_device;
2218c2ecf20Sopenharmony_ci	}
2228c2ecf20Sopenharmony_ci
2238c2ecf20Sopenharmony_ci	hdev->cdev_sysfs_created = true;
2248c2ecf20Sopenharmony_ci
2258c2ecf20Sopenharmony_ci	return 0;
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_cidelete_ctrl_cdev_device:
2288c2ecf20Sopenharmony_ci	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
2298c2ecf20Sopenharmony_cidelete_cdev_device:
2308c2ecf20Sopenharmony_ci	cdev_device_del(&hdev->cdev, hdev->dev);
2318c2ecf20Sopenharmony_ci	return rc;
2328c2ecf20Sopenharmony_ci}
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_cistatic void device_cdev_sysfs_del(struct hl_device *hdev)
2358c2ecf20Sopenharmony_ci{
2368c2ecf20Sopenharmony_ci	if (!hdev->cdev_sysfs_created)
2378c2ecf20Sopenharmony_ci		goto put_devices;
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_ci	hl_sysfs_fini(hdev);
2408c2ecf20Sopenharmony_ci	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
2418c2ecf20Sopenharmony_ci	cdev_device_del(&hdev->cdev, hdev->dev);
2428c2ecf20Sopenharmony_ci
2438c2ecf20Sopenharmony_ciput_devices:
2448c2ecf20Sopenharmony_ci	put_device(hdev->dev);
2458c2ecf20Sopenharmony_ci	put_device(hdev->dev_ctrl);
2468c2ecf20Sopenharmony_ci}
2478c2ecf20Sopenharmony_ci
2488c2ecf20Sopenharmony_ci/*
2498c2ecf20Sopenharmony_ci * device_early_init - do some early initialization for the habanalabs device
2508c2ecf20Sopenharmony_ci *
2518c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
2528c2ecf20Sopenharmony_ci *
2538c2ecf20Sopenharmony_ci * Install the relevant function pointers and call the early_init function,
2548c2ecf20Sopenharmony_ci * if such a function exists
2558c2ecf20Sopenharmony_ci */
2568c2ecf20Sopenharmony_cistatic int device_early_init(struct hl_device *hdev)
2578c2ecf20Sopenharmony_ci{
2588c2ecf20Sopenharmony_ci	int i, rc;
2598c2ecf20Sopenharmony_ci	char workq_name[32];
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_ci	switch (hdev->asic_type) {
2628c2ecf20Sopenharmony_ci	case ASIC_GOYA:
2638c2ecf20Sopenharmony_ci		goya_set_asic_funcs(hdev);
2648c2ecf20Sopenharmony_ci		strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
2658c2ecf20Sopenharmony_ci		break;
2668c2ecf20Sopenharmony_ci	case ASIC_GAUDI:
2678c2ecf20Sopenharmony_ci		gaudi_set_asic_funcs(hdev);
2688c2ecf20Sopenharmony_ci		sprintf(hdev->asic_name, "GAUDI");
2698c2ecf20Sopenharmony_ci		break;
2708c2ecf20Sopenharmony_ci	default:
2718c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
2728c2ecf20Sopenharmony_ci			hdev->asic_type);
2738c2ecf20Sopenharmony_ci		return -EINVAL;
2748c2ecf20Sopenharmony_ci	}
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->early_init(hdev);
2778c2ecf20Sopenharmony_ci	if (rc)
2788c2ecf20Sopenharmony_ci		return rc;
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci	rc = hl_asid_init(hdev);
2818c2ecf20Sopenharmony_ci	if (rc)
2828c2ecf20Sopenharmony_ci		goto early_fini;
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci	if (hdev->asic_prop.completion_queues_count) {
2858c2ecf20Sopenharmony_ci		hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
2868c2ecf20Sopenharmony_ci				sizeof(*hdev->cq_wq),
2878c2ecf20Sopenharmony_ci				GFP_ATOMIC);
2888c2ecf20Sopenharmony_ci		if (!hdev->cq_wq) {
2898c2ecf20Sopenharmony_ci			rc = -ENOMEM;
2908c2ecf20Sopenharmony_ci			goto asid_fini;
2918c2ecf20Sopenharmony_ci		}
2928c2ecf20Sopenharmony_ci	}
2938c2ecf20Sopenharmony_ci
2948c2ecf20Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
2958c2ecf20Sopenharmony_ci		snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
2968c2ecf20Sopenharmony_ci		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
2978c2ecf20Sopenharmony_ci		if (hdev->cq_wq[i] == NULL) {
2988c2ecf20Sopenharmony_ci			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
2998c2ecf20Sopenharmony_ci			rc = -ENOMEM;
3008c2ecf20Sopenharmony_ci			goto free_cq_wq;
3018c2ecf20Sopenharmony_ci		}
3028c2ecf20Sopenharmony_ci	}
3038c2ecf20Sopenharmony_ci
3048c2ecf20Sopenharmony_ci	hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
3058c2ecf20Sopenharmony_ci	if (hdev->eq_wq == NULL) {
3068c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
3078c2ecf20Sopenharmony_ci		rc = -ENOMEM;
3088c2ecf20Sopenharmony_ci		goto free_cq_wq;
3098c2ecf20Sopenharmony_ci	}
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_ci	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
3128c2ecf20Sopenharmony_ci					GFP_KERNEL);
3138c2ecf20Sopenharmony_ci	if (!hdev->hl_chip_info) {
3148c2ecf20Sopenharmony_ci		rc = -ENOMEM;
3158c2ecf20Sopenharmony_ci		goto free_eq_wq;
3168c2ecf20Sopenharmony_ci	}
3178c2ecf20Sopenharmony_ci
3188c2ecf20Sopenharmony_ci	hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
3198c2ecf20Sopenharmony_ci					sizeof(struct hl_device_idle_busy_ts),
3208c2ecf20Sopenharmony_ci					(GFP_KERNEL | __GFP_ZERO));
3218c2ecf20Sopenharmony_ci	if (!hdev->idle_busy_ts_arr) {
3228c2ecf20Sopenharmony_ci		rc = -ENOMEM;
3238c2ecf20Sopenharmony_ci		goto free_chip_info;
3248c2ecf20Sopenharmony_ci	}
3258c2ecf20Sopenharmony_ci
3268c2ecf20Sopenharmony_ci	rc = hl_mmu_if_set_funcs(hdev);
3278c2ecf20Sopenharmony_ci	if (rc)
3288c2ecf20Sopenharmony_ci		goto free_idle_busy_ts_arr;
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_ci	mutex_init(&hdev->send_cpu_message_lock);
3338c2ecf20Sopenharmony_ci	mutex_init(&hdev->debug_lock);
3348c2ecf20Sopenharmony_ci	mutex_init(&hdev->mmu_cache_lock);
3358c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
3368c2ecf20Sopenharmony_ci	spin_lock_init(&hdev->hw_queues_mirror_lock);
3378c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&hdev->fpriv_list);
3388c2ecf20Sopenharmony_ci	mutex_init(&hdev->fpriv_list_lock);
3398c2ecf20Sopenharmony_ci	atomic_set(&hdev->in_reset, 0);
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	return 0;
3428c2ecf20Sopenharmony_ci
3438c2ecf20Sopenharmony_cifree_idle_busy_ts_arr:
3448c2ecf20Sopenharmony_ci	kfree(hdev->idle_busy_ts_arr);
3458c2ecf20Sopenharmony_cifree_chip_info:
3468c2ecf20Sopenharmony_ci	kfree(hdev->hl_chip_info);
3478c2ecf20Sopenharmony_cifree_eq_wq:
3488c2ecf20Sopenharmony_ci	destroy_workqueue(hdev->eq_wq);
3498c2ecf20Sopenharmony_cifree_cq_wq:
3508c2ecf20Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
3518c2ecf20Sopenharmony_ci		if (hdev->cq_wq[i])
3528c2ecf20Sopenharmony_ci			destroy_workqueue(hdev->cq_wq[i]);
3538c2ecf20Sopenharmony_ci	kfree(hdev->cq_wq);
3548c2ecf20Sopenharmony_ciasid_fini:
3558c2ecf20Sopenharmony_ci	hl_asid_fini(hdev);
3568c2ecf20Sopenharmony_ciearly_fini:
3578c2ecf20Sopenharmony_ci	if (hdev->asic_funcs->early_fini)
3588c2ecf20Sopenharmony_ci		hdev->asic_funcs->early_fini(hdev);
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	return rc;
3618c2ecf20Sopenharmony_ci}
3628c2ecf20Sopenharmony_ci
3638c2ecf20Sopenharmony_ci/*
3648c2ecf20Sopenharmony_ci * device_early_fini - finalize all that was done in device_early_init
3658c2ecf20Sopenharmony_ci *
3668c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
3678c2ecf20Sopenharmony_ci *
3688c2ecf20Sopenharmony_ci */
3698c2ecf20Sopenharmony_cistatic void device_early_fini(struct hl_device *hdev)
3708c2ecf20Sopenharmony_ci{
3718c2ecf20Sopenharmony_ci	int i;
3728c2ecf20Sopenharmony_ci
3738c2ecf20Sopenharmony_ci	mutex_destroy(&hdev->mmu_cache_lock);
3748c2ecf20Sopenharmony_ci	mutex_destroy(&hdev->debug_lock);
3758c2ecf20Sopenharmony_ci	mutex_destroy(&hdev->send_cpu_message_lock);
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	mutex_destroy(&hdev->fpriv_list_lock);
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
3808c2ecf20Sopenharmony_ci
3818c2ecf20Sopenharmony_ci	kfree(hdev->idle_busy_ts_arr);
3828c2ecf20Sopenharmony_ci	kfree(hdev->hl_chip_info);
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci	destroy_workqueue(hdev->eq_wq);
3858c2ecf20Sopenharmony_ci
3868c2ecf20Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
3878c2ecf20Sopenharmony_ci		destroy_workqueue(hdev->cq_wq[i]);
3888c2ecf20Sopenharmony_ci	kfree(hdev->cq_wq);
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci	hl_asid_fini(hdev);
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci	if (hdev->asic_funcs->early_fini)
3938c2ecf20Sopenharmony_ci		hdev->asic_funcs->early_fini(hdev);
3948c2ecf20Sopenharmony_ci}
3958c2ecf20Sopenharmony_ci
3968c2ecf20Sopenharmony_cistatic void set_freq_to_low_job(struct work_struct *work)
3978c2ecf20Sopenharmony_ci{
3988c2ecf20Sopenharmony_ci	struct hl_device *hdev = container_of(work, struct hl_device,
3998c2ecf20Sopenharmony_ci						work_freq.work);
4008c2ecf20Sopenharmony_ci
4018c2ecf20Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
4028c2ecf20Sopenharmony_ci
4038c2ecf20Sopenharmony_ci	if (!hdev->compute_ctx)
4048c2ecf20Sopenharmony_ci		hl_device_set_frequency(hdev, PLL_LOW);
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
4078c2ecf20Sopenharmony_ci
4088c2ecf20Sopenharmony_ci	schedule_delayed_work(&hdev->work_freq,
4098c2ecf20Sopenharmony_ci			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
4108c2ecf20Sopenharmony_ci}
4118c2ecf20Sopenharmony_ci
4128c2ecf20Sopenharmony_cistatic void hl_device_heartbeat(struct work_struct *work)
4138c2ecf20Sopenharmony_ci{
4148c2ecf20Sopenharmony_ci	struct hl_device *hdev = container_of(work, struct hl_device,
4158c2ecf20Sopenharmony_ci						work_heartbeat.work);
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci	if (hl_device_disabled_or_in_reset(hdev))
4188c2ecf20Sopenharmony_ci		goto reschedule;
4198c2ecf20Sopenharmony_ci
4208c2ecf20Sopenharmony_ci	if (!hdev->asic_funcs->send_heartbeat(hdev))
4218c2ecf20Sopenharmony_ci		goto reschedule;
4228c2ecf20Sopenharmony_ci
4238c2ecf20Sopenharmony_ci	dev_err(hdev->dev, "Device heartbeat failed!\n");
4248c2ecf20Sopenharmony_ci	hl_device_reset(hdev, true, false);
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	return;
4278c2ecf20Sopenharmony_ci
4288c2ecf20Sopenharmony_cireschedule:
4298c2ecf20Sopenharmony_ci	schedule_delayed_work(&hdev->work_heartbeat,
4308c2ecf20Sopenharmony_ci			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
4318c2ecf20Sopenharmony_ci}
4328c2ecf20Sopenharmony_ci
4338c2ecf20Sopenharmony_ci/*
4348c2ecf20Sopenharmony_ci * device_late_init - do late stuff initialization for the habanalabs device
4358c2ecf20Sopenharmony_ci *
4368c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
4378c2ecf20Sopenharmony_ci *
4388c2ecf20Sopenharmony_ci * Do stuff that either needs the device H/W queues to be active or needs
4398c2ecf20Sopenharmony_ci * to happen after all the rest of the initialization is finished
4408c2ecf20Sopenharmony_ci */
4418c2ecf20Sopenharmony_cistatic int device_late_init(struct hl_device *hdev)
4428c2ecf20Sopenharmony_ci{
4438c2ecf20Sopenharmony_ci	int rc;
4448c2ecf20Sopenharmony_ci
4458c2ecf20Sopenharmony_ci	if (hdev->asic_funcs->late_init) {
4468c2ecf20Sopenharmony_ci		rc = hdev->asic_funcs->late_init(hdev);
4478c2ecf20Sopenharmony_ci		if (rc) {
4488c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
4498c2ecf20Sopenharmony_ci				"failed late initialization for the H/W\n");
4508c2ecf20Sopenharmony_ci			return rc;
4518c2ecf20Sopenharmony_ci		}
4528c2ecf20Sopenharmony_ci	}
4538c2ecf20Sopenharmony_ci
4548c2ecf20Sopenharmony_ci	hdev->high_pll = hdev->asic_prop.high_pll;
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	/* force setting to low frequency */
4578c2ecf20Sopenharmony_ci	hdev->curr_pll_profile = PLL_LOW;
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	if (hdev->pm_mng_profile == PM_AUTO)
4608c2ecf20Sopenharmony_ci		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
4618c2ecf20Sopenharmony_ci	else
4628c2ecf20Sopenharmony_ci		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
4638c2ecf20Sopenharmony_ci
4648c2ecf20Sopenharmony_ci	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
4658c2ecf20Sopenharmony_ci	schedule_delayed_work(&hdev->work_freq,
4668c2ecf20Sopenharmony_ci	usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
4678c2ecf20Sopenharmony_ci
4688c2ecf20Sopenharmony_ci	if (hdev->heartbeat) {
4698c2ecf20Sopenharmony_ci		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
4708c2ecf20Sopenharmony_ci		schedule_delayed_work(&hdev->work_heartbeat,
4718c2ecf20Sopenharmony_ci				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
4728c2ecf20Sopenharmony_ci	}
4738c2ecf20Sopenharmony_ci
4748c2ecf20Sopenharmony_ci	hdev->late_init_done = true;
4758c2ecf20Sopenharmony_ci
4768c2ecf20Sopenharmony_ci	return 0;
4778c2ecf20Sopenharmony_ci}
4788c2ecf20Sopenharmony_ci
4798c2ecf20Sopenharmony_ci/*
4808c2ecf20Sopenharmony_ci * device_late_fini - finalize all that was done in device_late_init
4818c2ecf20Sopenharmony_ci *
4828c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
4838c2ecf20Sopenharmony_ci *
4848c2ecf20Sopenharmony_ci */
4858c2ecf20Sopenharmony_cistatic void device_late_fini(struct hl_device *hdev)
4868c2ecf20Sopenharmony_ci{
4878c2ecf20Sopenharmony_ci	if (!hdev->late_init_done)
4888c2ecf20Sopenharmony_ci		return;
4898c2ecf20Sopenharmony_ci
4908c2ecf20Sopenharmony_ci	cancel_delayed_work_sync(&hdev->work_freq);
4918c2ecf20Sopenharmony_ci	if (hdev->heartbeat)
4928c2ecf20Sopenharmony_ci		cancel_delayed_work_sync(&hdev->work_heartbeat);
4938c2ecf20Sopenharmony_ci
4948c2ecf20Sopenharmony_ci	if (hdev->asic_funcs->late_fini)
4958c2ecf20Sopenharmony_ci		hdev->asic_funcs->late_fini(hdev);
4968c2ecf20Sopenharmony_ci
4978c2ecf20Sopenharmony_ci	hdev->late_init_done = false;
4988c2ecf20Sopenharmony_ci}
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ciuint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms)
5018c2ecf20Sopenharmony_ci{
5028c2ecf20Sopenharmony_ci	struct hl_device_idle_busy_ts *ts;
5038c2ecf20Sopenharmony_ci	ktime_t zero_ktime, curr = ktime_get();
5048c2ecf20Sopenharmony_ci	u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx;
5058c2ecf20Sopenharmony_ci	s64 period_us, last_start_us, last_end_us, last_busy_time_us,
5068c2ecf20Sopenharmony_ci		total_busy_time_us = 0, total_busy_time_ms;
5078c2ecf20Sopenharmony_ci
5088c2ecf20Sopenharmony_ci	zero_ktime = ktime_set(0, 0);
5098c2ecf20Sopenharmony_ci	period_us = period_ms * USEC_PER_MSEC;
5108c2ecf20Sopenharmony_ci	ts = &hdev->idle_busy_ts_arr[last_index];
5118c2ecf20Sopenharmony_ci
5128c2ecf20Sopenharmony_ci	/* check case that device is currently in idle */
5138c2ecf20Sopenharmony_ci	if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) &&
5148c2ecf20Sopenharmony_ci			!ktime_compare(ts->idle_to_busy_ts, zero_ktime)) {
5158c2ecf20Sopenharmony_ci
5168c2ecf20Sopenharmony_ci		last_index--;
5178c2ecf20Sopenharmony_ci		/* Handle case idle_busy_ts_idx was 0 */
5188c2ecf20Sopenharmony_ci		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
5198c2ecf20Sopenharmony_ci			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;
5208c2ecf20Sopenharmony_ci
5218c2ecf20Sopenharmony_ci		ts = &hdev->idle_busy_ts_arr[last_index];
5228c2ecf20Sopenharmony_ci	}
5238c2ecf20Sopenharmony_ci
5248c2ecf20Sopenharmony_ci	while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) {
5258c2ecf20Sopenharmony_ci		/* Check if we are in last sample case. i.e. if the sample
5268c2ecf20Sopenharmony_ci		 * begun before the sampling period. This could be a real
5278c2ecf20Sopenharmony_ci		 * sample or 0 so need to handle both cases
5288c2ecf20Sopenharmony_ci		 */
5298c2ecf20Sopenharmony_ci		last_start_us = ktime_to_us(
5308c2ecf20Sopenharmony_ci				ktime_sub(curr, ts->idle_to_busy_ts));
5318c2ecf20Sopenharmony_ci
5328c2ecf20Sopenharmony_ci		if (last_start_us > period_us) {
5338c2ecf20Sopenharmony_ci
5348c2ecf20Sopenharmony_ci			/* First check two cases:
5358c2ecf20Sopenharmony_ci			 * 1. If the device is currently busy
5368c2ecf20Sopenharmony_ci			 * 2. If the device was idle during the whole sampling
5378c2ecf20Sopenharmony_ci			 *    period
5388c2ecf20Sopenharmony_ci			 */
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_ci			if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) {
5418c2ecf20Sopenharmony_ci				/* Check if the device is currently busy */
5428c2ecf20Sopenharmony_ci				if (ktime_compare(ts->idle_to_busy_ts,
5438c2ecf20Sopenharmony_ci						zero_ktime))
5448c2ecf20Sopenharmony_ci					return 100;
5458c2ecf20Sopenharmony_ci
5468c2ecf20Sopenharmony_ci				/* We either didn't have any activity or we
5478c2ecf20Sopenharmony_ci				 * reached an entry which is 0. Either way,
5488c2ecf20Sopenharmony_ci				 * exit and return what was accumulated so far
5498c2ecf20Sopenharmony_ci				 */
5508c2ecf20Sopenharmony_ci				break;
5518c2ecf20Sopenharmony_ci			}
5528c2ecf20Sopenharmony_ci
5538c2ecf20Sopenharmony_ci			/* If sample has finished, check it is relevant */
5548c2ecf20Sopenharmony_ci			last_end_us = ktime_to_us(
5558c2ecf20Sopenharmony_ci					ktime_sub(curr, ts->busy_to_idle_ts));
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_ci			if (last_end_us > period_us)
5588c2ecf20Sopenharmony_ci				break;
5598c2ecf20Sopenharmony_ci
5608c2ecf20Sopenharmony_ci			/* It is relevant so add it but with adjustment */
5618c2ecf20Sopenharmony_ci			last_busy_time_us = ktime_to_us(
5628c2ecf20Sopenharmony_ci						ktime_sub(ts->busy_to_idle_ts,
5638c2ecf20Sopenharmony_ci						ts->idle_to_busy_ts));
5648c2ecf20Sopenharmony_ci			total_busy_time_us += last_busy_time_us -
5658c2ecf20Sopenharmony_ci					(last_start_us - period_us);
5668c2ecf20Sopenharmony_ci			break;
5678c2ecf20Sopenharmony_ci		}
5688c2ecf20Sopenharmony_ci
5698c2ecf20Sopenharmony_ci		/* Check if the sample is finished or still open */
5708c2ecf20Sopenharmony_ci		if (ktime_compare(ts->busy_to_idle_ts, zero_ktime))
5718c2ecf20Sopenharmony_ci			last_busy_time_us = ktime_to_us(
5728c2ecf20Sopenharmony_ci						ktime_sub(ts->busy_to_idle_ts,
5738c2ecf20Sopenharmony_ci						ts->idle_to_busy_ts));
5748c2ecf20Sopenharmony_ci		else
5758c2ecf20Sopenharmony_ci			last_busy_time_us = ktime_to_us(
5768c2ecf20Sopenharmony_ci					ktime_sub(curr, ts->idle_to_busy_ts));
5778c2ecf20Sopenharmony_ci
5788c2ecf20Sopenharmony_ci		total_busy_time_us += last_busy_time_us;
5798c2ecf20Sopenharmony_ci
5808c2ecf20Sopenharmony_ci		last_index--;
5818c2ecf20Sopenharmony_ci		/* Handle case idle_busy_ts_idx was 0 */
5828c2ecf20Sopenharmony_ci		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
5838c2ecf20Sopenharmony_ci			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci		ts = &hdev->idle_busy_ts_arr[last_index];
5868c2ecf20Sopenharmony_ci
5878c2ecf20Sopenharmony_ci		overlap_cnt++;
5888c2ecf20Sopenharmony_ci	}
5898c2ecf20Sopenharmony_ci
5908c2ecf20Sopenharmony_ci	total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us,
5918c2ecf20Sopenharmony_ci						USEC_PER_MSEC);
5928c2ecf20Sopenharmony_ci
5938c2ecf20Sopenharmony_ci	return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms);
5948c2ecf20Sopenharmony_ci}
5958c2ecf20Sopenharmony_ci
5968c2ecf20Sopenharmony_ci/*
5978c2ecf20Sopenharmony_ci * hl_device_set_frequency - set the frequency of the device
5988c2ecf20Sopenharmony_ci *
5998c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
6008c2ecf20Sopenharmony_ci * @freq: the new frequency value
6018c2ecf20Sopenharmony_ci *
6028c2ecf20Sopenharmony_ci * Change the frequency if needed. This function has no protection against
6038c2ecf20Sopenharmony_ci * concurrency, therefore it is assumed that the calling function has protected
6048c2ecf20Sopenharmony_ci * itself against the case of calling this function from multiple threads with
6058c2ecf20Sopenharmony_ci * different values
6068c2ecf20Sopenharmony_ci *
6078c2ecf20Sopenharmony_ci * Returns 0 if no change was done, otherwise returns 1
6088c2ecf20Sopenharmony_ci */
6098c2ecf20Sopenharmony_ciint hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
6108c2ecf20Sopenharmony_ci{
6118c2ecf20Sopenharmony_ci	if ((hdev->pm_mng_profile == PM_MANUAL) ||
6128c2ecf20Sopenharmony_ci			(hdev->curr_pll_profile == freq))
6138c2ecf20Sopenharmony_ci		return 0;
6148c2ecf20Sopenharmony_ci
6158c2ecf20Sopenharmony_ci	dev_dbg(hdev->dev, "Changing device frequency to %s\n",
6168c2ecf20Sopenharmony_ci		freq == PLL_HIGH ? "high" : "low");
6178c2ecf20Sopenharmony_ci
6188c2ecf20Sopenharmony_ci	hdev->asic_funcs->set_pll_profile(hdev, freq);
6198c2ecf20Sopenharmony_ci
6208c2ecf20Sopenharmony_ci	hdev->curr_pll_profile = freq;
6218c2ecf20Sopenharmony_ci
6228c2ecf20Sopenharmony_ci	return 1;
6238c2ecf20Sopenharmony_ci}
6248c2ecf20Sopenharmony_ci
6258c2ecf20Sopenharmony_ciint hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
6268c2ecf20Sopenharmony_ci{
6278c2ecf20Sopenharmony_ci	int rc = 0;
6288c2ecf20Sopenharmony_ci
6298c2ecf20Sopenharmony_ci	mutex_lock(&hdev->debug_lock);
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci	if (!enable) {
6328c2ecf20Sopenharmony_ci		if (!hdev->in_debug) {
6338c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
6348c2ecf20Sopenharmony_ci				"Failed to disable debug mode because device was not in debug mode\n");
6358c2ecf20Sopenharmony_ci			rc = -EFAULT;
6368c2ecf20Sopenharmony_ci			goto out;
6378c2ecf20Sopenharmony_ci		}
6388c2ecf20Sopenharmony_ci
6398c2ecf20Sopenharmony_ci		if (!hdev->hard_reset_pending)
6408c2ecf20Sopenharmony_ci			hdev->asic_funcs->halt_coresight(hdev);
6418c2ecf20Sopenharmony_ci
6428c2ecf20Sopenharmony_ci		hdev->in_debug = 0;
6438c2ecf20Sopenharmony_ci
6448c2ecf20Sopenharmony_ci		if (!hdev->hard_reset_pending)
6458c2ecf20Sopenharmony_ci			hdev->asic_funcs->set_clock_gating(hdev);
6468c2ecf20Sopenharmony_ci
6478c2ecf20Sopenharmony_ci		goto out;
6488c2ecf20Sopenharmony_ci	}
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	if (hdev->in_debug) {
6518c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
6528c2ecf20Sopenharmony_ci			"Failed to enable debug mode because device is already in debug mode\n");
6538c2ecf20Sopenharmony_ci		rc = -EFAULT;
6548c2ecf20Sopenharmony_ci		goto out;
6558c2ecf20Sopenharmony_ci	}
6568c2ecf20Sopenharmony_ci
6578c2ecf20Sopenharmony_ci	hdev->asic_funcs->disable_clock_gating(hdev);
6588c2ecf20Sopenharmony_ci	hdev->in_debug = 1;
6598c2ecf20Sopenharmony_ci
6608c2ecf20Sopenharmony_ciout:
6618c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->debug_lock);
6628c2ecf20Sopenharmony_ci
6638c2ecf20Sopenharmony_ci	return rc;
6648c2ecf20Sopenharmony_ci}
6658c2ecf20Sopenharmony_ci
6668c2ecf20Sopenharmony_ci/*
6678c2ecf20Sopenharmony_ci * hl_device_suspend - initiate device suspend
6688c2ecf20Sopenharmony_ci *
6698c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
6708c2ecf20Sopenharmony_ci *
6718c2ecf20Sopenharmony_ci * Puts the hw in the suspend state (all asics).
6728c2ecf20Sopenharmony_ci * Returns 0 for success or an error on failure.
6738c2ecf20Sopenharmony_ci * Called at driver suspend.
6748c2ecf20Sopenharmony_ci */
6758c2ecf20Sopenharmony_ciint hl_device_suspend(struct hl_device *hdev)
6768c2ecf20Sopenharmony_ci{
6778c2ecf20Sopenharmony_ci	int rc;
6788c2ecf20Sopenharmony_ci
6798c2ecf20Sopenharmony_ci	pci_save_state(hdev->pdev);
6808c2ecf20Sopenharmony_ci
6818c2ecf20Sopenharmony_ci	/* Block future CS/VM/JOB completion operations */
6828c2ecf20Sopenharmony_ci	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
6838c2ecf20Sopenharmony_ci	if (rc) {
6848c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Can't suspend while in reset\n");
6858c2ecf20Sopenharmony_ci		return -EIO;
6868c2ecf20Sopenharmony_ci	}
6878c2ecf20Sopenharmony_ci
6888c2ecf20Sopenharmony_ci	/* This blocks all other stuff that is not blocked by in_reset */
6898c2ecf20Sopenharmony_ci	hdev->disabled = true;
6908c2ecf20Sopenharmony_ci
6918c2ecf20Sopenharmony_ci	/*
6928c2ecf20Sopenharmony_ci	 * Flush anyone that is inside the critical section of enqueue
6938c2ecf20Sopenharmony_ci	 * jobs to the H/W
6948c2ecf20Sopenharmony_ci	 */
6958c2ecf20Sopenharmony_ci	hdev->asic_funcs->hw_queues_lock(hdev);
6968c2ecf20Sopenharmony_ci	hdev->asic_funcs->hw_queues_unlock(hdev);
6978c2ecf20Sopenharmony_ci
6988c2ecf20Sopenharmony_ci	/* Flush processes that are sending message to CPU */
6998c2ecf20Sopenharmony_ci	mutex_lock(&hdev->send_cpu_message_lock);
7008c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->send_cpu_message_lock);
7018c2ecf20Sopenharmony_ci
7028c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->suspend(hdev);
7038c2ecf20Sopenharmony_ci	if (rc)
7048c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
7058c2ecf20Sopenharmony_ci			"Failed to disable PCI access of device CPU\n");
7068c2ecf20Sopenharmony_ci
7078c2ecf20Sopenharmony_ci	/* Shut down the device */
7088c2ecf20Sopenharmony_ci	pci_disable_device(hdev->pdev);
7098c2ecf20Sopenharmony_ci	pci_set_power_state(hdev->pdev, PCI_D3hot);
7108c2ecf20Sopenharmony_ci
7118c2ecf20Sopenharmony_ci	return 0;
7128c2ecf20Sopenharmony_ci}
7138c2ecf20Sopenharmony_ci
7148c2ecf20Sopenharmony_ci/*
7158c2ecf20Sopenharmony_ci * hl_device_resume - initiate device resume
7168c2ecf20Sopenharmony_ci *
7178c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
7188c2ecf20Sopenharmony_ci *
7198c2ecf20Sopenharmony_ci * Bring the hw back to operating state (all asics).
7208c2ecf20Sopenharmony_ci * Returns 0 for success or an error on failure.
7218c2ecf20Sopenharmony_ci * Called at driver resume.
7228c2ecf20Sopenharmony_ci */
7238c2ecf20Sopenharmony_ciint hl_device_resume(struct hl_device *hdev)
7248c2ecf20Sopenharmony_ci{
7258c2ecf20Sopenharmony_ci	int rc;
7268c2ecf20Sopenharmony_ci
7278c2ecf20Sopenharmony_ci	pci_set_power_state(hdev->pdev, PCI_D0);
7288c2ecf20Sopenharmony_ci	pci_restore_state(hdev->pdev);
7298c2ecf20Sopenharmony_ci	rc = pci_enable_device_mem(hdev->pdev);
7308c2ecf20Sopenharmony_ci	if (rc) {
7318c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
7328c2ecf20Sopenharmony_ci			"Failed to enable PCI device in resume\n");
7338c2ecf20Sopenharmony_ci		return rc;
7348c2ecf20Sopenharmony_ci	}
7358c2ecf20Sopenharmony_ci
7368c2ecf20Sopenharmony_ci	pci_set_master(hdev->pdev);
7378c2ecf20Sopenharmony_ci
7388c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->resume(hdev);
7398c2ecf20Sopenharmony_ci	if (rc) {
7408c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to resume device after suspend\n");
7418c2ecf20Sopenharmony_ci		goto disable_device;
7428c2ecf20Sopenharmony_ci	}
7438c2ecf20Sopenharmony_ci
7448c2ecf20Sopenharmony_ci
7458c2ecf20Sopenharmony_ci	hdev->disabled = false;
7468c2ecf20Sopenharmony_ci	atomic_set(&hdev->in_reset, 0);
7478c2ecf20Sopenharmony_ci
7488c2ecf20Sopenharmony_ci	rc = hl_device_reset(hdev, true, false);
7498c2ecf20Sopenharmony_ci	if (rc) {
7508c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to reset device during resume\n");
7518c2ecf20Sopenharmony_ci		goto disable_device;
7528c2ecf20Sopenharmony_ci	}
7538c2ecf20Sopenharmony_ci
7548c2ecf20Sopenharmony_ci	return 0;
7558c2ecf20Sopenharmony_ci
7568c2ecf20Sopenharmony_cidisable_device:
7578c2ecf20Sopenharmony_ci	pci_clear_master(hdev->pdev);
7588c2ecf20Sopenharmony_ci	pci_disable_device(hdev->pdev);
7598c2ecf20Sopenharmony_ci
7608c2ecf20Sopenharmony_ci	return rc;
7618c2ecf20Sopenharmony_ci}
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_cistatic int device_kill_open_processes(struct hl_device *hdev)
7648c2ecf20Sopenharmony_ci{
7658c2ecf20Sopenharmony_ci	u16 pending_total, pending_cnt;
7668c2ecf20Sopenharmony_ci	struct hl_fpriv	*hpriv;
7678c2ecf20Sopenharmony_ci	struct task_struct *task = NULL;
7688c2ecf20Sopenharmony_ci
7698c2ecf20Sopenharmony_ci	if (hdev->pldm)
7708c2ecf20Sopenharmony_ci		pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
7718c2ecf20Sopenharmony_ci	else
7728c2ecf20Sopenharmony_ci		pending_total = HL_PENDING_RESET_PER_SEC;
7738c2ecf20Sopenharmony_ci
7748c2ecf20Sopenharmony_ci	/* Giving time for user to close FD, and for processes that are inside
7758c2ecf20Sopenharmony_ci	 * hl_device_open to finish
7768c2ecf20Sopenharmony_ci	 */
7778c2ecf20Sopenharmony_ci	if (!list_empty(&hdev->fpriv_list))
7788c2ecf20Sopenharmony_ci		ssleep(1);
7798c2ecf20Sopenharmony_ci
7808c2ecf20Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
7818c2ecf20Sopenharmony_ci
7828c2ecf20Sopenharmony_ci	/* This section must be protected because we are dereferencing
7838c2ecf20Sopenharmony_ci	 * pointers that are freed if the process exits
7848c2ecf20Sopenharmony_ci	 */
7858c2ecf20Sopenharmony_ci	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
7868c2ecf20Sopenharmony_ci		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
7878c2ecf20Sopenharmony_ci		if (task) {
7888c2ecf20Sopenharmony_ci			dev_info(hdev->dev, "Killing user process pid=%d\n",
7898c2ecf20Sopenharmony_ci				task_pid_nr(task));
7908c2ecf20Sopenharmony_ci			send_sig(SIGKILL, task, 1);
7918c2ecf20Sopenharmony_ci			usleep_range(1000, 10000);
7928c2ecf20Sopenharmony_ci
7938c2ecf20Sopenharmony_ci			put_task_struct(task);
7948c2ecf20Sopenharmony_ci		}
7958c2ecf20Sopenharmony_ci	}
7968c2ecf20Sopenharmony_ci
7978c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
7988c2ecf20Sopenharmony_ci
7998c2ecf20Sopenharmony_ci	/* We killed the open users, but because the driver cleans up after the
8008c2ecf20Sopenharmony_ci	 * user contexts are closed (e.g. mmu mappings), we need to wait again
8018c2ecf20Sopenharmony_ci	 * to make sure the cleaning phase is finished before continuing with
8028c2ecf20Sopenharmony_ci	 * the reset
8038c2ecf20Sopenharmony_ci	 */
8048c2ecf20Sopenharmony_ci
8058c2ecf20Sopenharmony_ci	pending_cnt = pending_total;
8068c2ecf20Sopenharmony_ci
8078c2ecf20Sopenharmony_ci	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
8088c2ecf20Sopenharmony_ci		dev_info(hdev->dev,
8098c2ecf20Sopenharmony_ci			"Waiting for all unmap operations to finish before hard reset\n");
8108c2ecf20Sopenharmony_ci
8118c2ecf20Sopenharmony_ci		pending_cnt--;
8128c2ecf20Sopenharmony_ci
8138c2ecf20Sopenharmony_ci		ssleep(1);
8148c2ecf20Sopenharmony_ci	}
8158c2ecf20Sopenharmony_ci
8168c2ecf20Sopenharmony_ci	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
8178c2ecf20Sopenharmony_ci}
8188c2ecf20Sopenharmony_ci
8198c2ecf20Sopenharmony_cistatic void device_hard_reset_pending(struct work_struct *work)
8208c2ecf20Sopenharmony_ci{
8218c2ecf20Sopenharmony_ci	struct hl_device_reset_work *device_reset_work =
8228c2ecf20Sopenharmony_ci		container_of(work, struct hl_device_reset_work, reset_work);
8238c2ecf20Sopenharmony_ci	struct hl_device *hdev = device_reset_work->hdev;
8248c2ecf20Sopenharmony_ci
8258c2ecf20Sopenharmony_ci	hl_device_reset(hdev, true, true);
8268c2ecf20Sopenharmony_ci
8278c2ecf20Sopenharmony_ci	kfree(device_reset_work);
8288c2ecf20Sopenharmony_ci}
8298c2ecf20Sopenharmony_ci
8308c2ecf20Sopenharmony_ci/*
8318c2ecf20Sopenharmony_ci * hl_device_reset - reset the device
8328c2ecf20Sopenharmony_ci *
8338c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
8348c2ecf20Sopenharmony_ci * @hard_reset: should we do hard reset to all engines or just reset the
8358c2ecf20Sopenharmony_ci *              compute/dma engines
8368c2ecf20Sopenharmony_ci * @from_hard_reset_thread: is the caller the hard-reset thread
8378c2ecf20Sopenharmony_ci *
8388c2ecf20Sopenharmony_ci * Block future CS and wait for pending CS to be enqueued
8398c2ecf20Sopenharmony_ci * Call ASIC H/W fini
8408c2ecf20Sopenharmony_ci * Flush all completions
8418c2ecf20Sopenharmony_ci * Re-initialize all internal data structures
8428c2ecf20Sopenharmony_ci * Call ASIC H/W init, late_init
8438c2ecf20Sopenharmony_ci * Test queues
8448c2ecf20Sopenharmony_ci * Enable device
8458c2ecf20Sopenharmony_ci *
8468c2ecf20Sopenharmony_ci * Returns 0 for success or an error on failure.
8478c2ecf20Sopenharmony_ci */
8488c2ecf20Sopenharmony_ciint hl_device_reset(struct hl_device *hdev, bool hard_reset,
8498c2ecf20Sopenharmony_ci			bool from_hard_reset_thread)
8508c2ecf20Sopenharmony_ci{
8518c2ecf20Sopenharmony_ci	int i, rc;
8528c2ecf20Sopenharmony_ci
8538c2ecf20Sopenharmony_ci	if (!hdev->init_done) {
8548c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
8558c2ecf20Sopenharmony_ci			"Can't reset before initialization is done\n");
8568c2ecf20Sopenharmony_ci		return 0;
8578c2ecf20Sopenharmony_ci	}
8588c2ecf20Sopenharmony_ci
8598c2ecf20Sopenharmony_ci	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
8608c2ecf20Sopenharmony_ci		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
8618c2ecf20Sopenharmony_ci		hard_reset = true;
8628c2ecf20Sopenharmony_ci	}
8638c2ecf20Sopenharmony_ci
8648c2ecf20Sopenharmony_ci	/*
8658c2ecf20Sopenharmony_ci	 * Prevent concurrency in this function - only one reset should be
8668c2ecf20Sopenharmony_ci	 * done at any given time. Only need to perform this if we didn't
8678c2ecf20Sopenharmony_ci	 * get from the dedicated hard reset thread
8688c2ecf20Sopenharmony_ci	 */
8698c2ecf20Sopenharmony_ci	if (!from_hard_reset_thread) {
8708c2ecf20Sopenharmony_ci		/* Block future CS/VM/JOB completion operations */
8718c2ecf20Sopenharmony_ci		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
8728c2ecf20Sopenharmony_ci		if (rc)
8738c2ecf20Sopenharmony_ci			return 0;
8748c2ecf20Sopenharmony_ci
8758c2ecf20Sopenharmony_ci		if (hard_reset) {
8768c2ecf20Sopenharmony_ci			/* Disable PCI access from device F/W so he won't send
8778c2ecf20Sopenharmony_ci			 * us additional interrupts. We disable MSI/MSI-X at
8788c2ecf20Sopenharmony_ci			 * the halt_engines function and we can't have the F/W
8798c2ecf20Sopenharmony_ci			 * sending us interrupts after that. We need to disable
8808c2ecf20Sopenharmony_ci			 * the access here because if the device is marked
8818c2ecf20Sopenharmony_ci			 * disable, the message won't be send. Also, in case
8828c2ecf20Sopenharmony_ci			 * of heartbeat, the device CPU is marked as disable
8838c2ecf20Sopenharmony_ci			 * so this message won't be sent
8848c2ecf20Sopenharmony_ci			 */
8858c2ecf20Sopenharmony_ci			if (hl_fw_send_pci_access_msg(hdev,
8868c2ecf20Sopenharmony_ci					CPUCP_PACKET_DISABLE_PCI_ACCESS))
8878c2ecf20Sopenharmony_ci				dev_warn(hdev->dev,
8888c2ecf20Sopenharmony_ci					"Failed to disable PCI access by F/W\n");
8898c2ecf20Sopenharmony_ci		}
8908c2ecf20Sopenharmony_ci
8918c2ecf20Sopenharmony_ci		/* This also blocks future CS/VM/JOB completion operations */
8928c2ecf20Sopenharmony_ci		hdev->disabled = true;
8938c2ecf20Sopenharmony_ci
8948c2ecf20Sopenharmony_ci		/* Flush anyone that is inside the critical section of enqueue
8958c2ecf20Sopenharmony_ci		 * jobs to the H/W
8968c2ecf20Sopenharmony_ci		 */
8978c2ecf20Sopenharmony_ci		hdev->asic_funcs->hw_queues_lock(hdev);
8988c2ecf20Sopenharmony_ci		hdev->asic_funcs->hw_queues_unlock(hdev);
8998c2ecf20Sopenharmony_ci
9008c2ecf20Sopenharmony_ci		/* Flush anyone that is inside device open */
9018c2ecf20Sopenharmony_ci		mutex_lock(&hdev->fpriv_list_lock);
9028c2ecf20Sopenharmony_ci		mutex_unlock(&hdev->fpriv_list_lock);
9038c2ecf20Sopenharmony_ci
9048c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Going to RESET device!\n");
9058c2ecf20Sopenharmony_ci	}
9068c2ecf20Sopenharmony_ci
9078c2ecf20Sopenharmony_ciagain:
9088c2ecf20Sopenharmony_ci	if ((hard_reset) && (!from_hard_reset_thread)) {
9098c2ecf20Sopenharmony_ci		struct hl_device_reset_work *device_reset_work;
9108c2ecf20Sopenharmony_ci
9118c2ecf20Sopenharmony_ci		hdev->hard_reset_pending = true;
9128c2ecf20Sopenharmony_ci
9138c2ecf20Sopenharmony_ci		device_reset_work = kzalloc(sizeof(*device_reset_work),
9148c2ecf20Sopenharmony_ci						GFP_ATOMIC);
9158c2ecf20Sopenharmony_ci		if (!device_reset_work) {
9168c2ecf20Sopenharmony_ci			rc = -ENOMEM;
9178c2ecf20Sopenharmony_ci			goto out_err;
9188c2ecf20Sopenharmony_ci		}
9198c2ecf20Sopenharmony_ci
9208c2ecf20Sopenharmony_ci		/*
9218c2ecf20Sopenharmony_ci		 * Because the reset function can't run from interrupt or
9228c2ecf20Sopenharmony_ci		 * from heartbeat work, we need to call the reset function
9238c2ecf20Sopenharmony_ci		 * from a dedicated work
9248c2ecf20Sopenharmony_ci		 */
9258c2ecf20Sopenharmony_ci		INIT_WORK(&device_reset_work->reset_work,
9268c2ecf20Sopenharmony_ci				device_hard_reset_pending);
9278c2ecf20Sopenharmony_ci		device_reset_work->hdev = hdev;
9288c2ecf20Sopenharmony_ci		schedule_work(&device_reset_work->reset_work);
9298c2ecf20Sopenharmony_ci
9308c2ecf20Sopenharmony_ci		return 0;
9318c2ecf20Sopenharmony_ci	}
9328c2ecf20Sopenharmony_ci
9338c2ecf20Sopenharmony_ci	if (hard_reset) {
9348c2ecf20Sopenharmony_ci		device_late_fini(hdev);
9358c2ecf20Sopenharmony_ci
9368c2ecf20Sopenharmony_ci		/*
9378c2ecf20Sopenharmony_ci		 * Now that the heartbeat thread is closed, flush processes
9388c2ecf20Sopenharmony_ci		 * which are sending messages to CPU
9398c2ecf20Sopenharmony_ci		 */
9408c2ecf20Sopenharmony_ci		mutex_lock(&hdev->send_cpu_message_lock);
9418c2ecf20Sopenharmony_ci		mutex_unlock(&hdev->send_cpu_message_lock);
9428c2ecf20Sopenharmony_ci	}
9438c2ecf20Sopenharmony_ci
9448c2ecf20Sopenharmony_ci	/*
9458c2ecf20Sopenharmony_ci	 * Halt the engines and disable interrupts so we won't get any more
9468c2ecf20Sopenharmony_ci	 * completions from H/W and we won't have any accesses from the
9478c2ecf20Sopenharmony_ci	 * H/W to the host machine
9488c2ecf20Sopenharmony_ci	 */
9498c2ecf20Sopenharmony_ci	hdev->asic_funcs->halt_engines(hdev, hard_reset);
9508c2ecf20Sopenharmony_ci
9518c2ecf20Sopenharmony_ci	/* Go over all the queues, release all CS and their jobs */
9528c2ecf20Sopenharmony_ci	hl_cs_rollback_all(hdev);
9538c2ecf20Sopenharmony_ci
9548c2ecf20Sopenharmony_ci	if (hard_reset) {
9558c2ecf20Sopenharmony_ci		/* Kill processes here after CS rollback. This is because the
9568c2ecf20Sopenharmony_ci		 * process can't really exit until all its CSs are done, which
9578c2ecf20Sopenharmony_ci		 * is what we do in cs rollback
9588c2ecf20Sopenharmony_ci		 */
9598c2ecf20Sopenharmony_ci		rc = device_kill_open_processes(hdev);
9608c2ecf20Sopenharmony_ci		if (rc) {
9618c2ecf20Sopenharmony_ci			dev_crit(hdev->dev,
9628c2ecf20Sopenharmony_ci				"Failed to kill all open processes, stopping hard reset\n");
9638c2ecf20Sopenharmony_ci			goto out_err;
9648c2ecf20Sopenharmony_ci		}
9658c2ecf20Sopenharmony_ci
9668c2ecf20Sopenharmony_ci		/* Flush the Event queue workers to make sure no other thread is
9678c2ecf20Sopenharmony_ci		 * reading or writing to registers during the reset
9688c2ecf20Sopenharmony_ci		 */
9698c2ecf20Sopenharmony_ci		flush_workqueue(hdev->eq_wq);
9708c2ecf20Sopenharmony_ci	}
9718c2ecf20Sopenharmony_ci
9728c2ecf20Sopenharmony_ci	/* Reset the H/W. It will be in idle state after this returns */
9738c2ecf20Sopenharmony_ci	hdev->asic_funcs->hw_fini(hdev, hard_reset);
9748c2ecf20Sopenharmony_ci
9758c2ecf20Sopenharmony_ci	if (hard_reset) {
9768c2ecf20Sopenharmony_ci		/* Release kernel context */
9778c2ecf20Sopenharmony_ci		if (hl_ctx_put(hdev->kernel_ctx) == 1)
9788c2ecf20Sopenharmony_ci			hdev->kernel_ctx = NULL;
9798c2ecf20Sopenharmony_ci		hl_vm_fini(hdev);
9808c2ecf20Sopenharmony_ci		hl_mmu_fini(hdev);
9818c2ecf20Sopenharmony_ci		hl_eq_reset(hdev, &hdev->event_queue);
9828c2ecf20Sopenharmony_ci	}
9838c2ecf20Sopenharmony_ci
9848c2ecf20Sopenharmony_ci	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
9858c2ecf20Sopenharmony_ci	hl_hw_queue_reset(hdev, hard_reset);
9868c2ecf20Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
9878c2ecf20Sopenharmony_ci		hl_cq_reset(hdev, &hdev->completion_queue[i]);
9888c2ecf20Sopenharmony_ci
9898c2ecf20Sopenharmony_ci	hdev->idle_busy_ts_idx = 0;
9908c2ecf20Sopenharmony_ci	hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0);
9918c2ecf20Sopenharmony_ci	hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0);
9928c2ecf20Sopenharmony_ci
9938c2ecf20Sopenharmony_ci	if (hdev->cs_active_cnt)
9948c2ecf20Sopenharmony_ci		dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n",
9958c2ecf20Sopenharmony_ci			hdev->cs_active_cnt);
9968c2ecf20Sopenharmony_ci
9978c2ecf20Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
9988c2ecf20Sopenharmony_ci
9998c2ecf20Sopenharmony_ci	/* Make sure the context switch phase will run again */
10008c2ecf20Sopenharmony_ci	if (hdev->compute_ctx) {
10018c2ecf20Sopenharmony_ci		atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
10028c2ecf20Sopenharmony_ci		hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
10038c2ecf20Sopenharmony_ci	}
10048c2ecf20Sopenharmony_ci
10058c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
10068c2ecf20Sopenharmony_ci
10078c2ecf20Sopenharmony_ci	/* Finished tear-down, starting to re-initialize */
10088c2ecf20Sopenharmony_ci
10098c2ecf20Sopenharmony_ci	if (hard_reset) {
10108c2ecf20Sopenharmony_ci		hdev->device_cpu_disabled = false;
10118c2ecf20Sopenharmony_ci		hdev->hard_reset_pending = false;
10128c2ecf20Sopenharmony_ci
10138c2ecf20Sopenharmony_ci		if (hdev->kernel_ctx) {
10148c2ecf20Sopenharmony_ci			dev_crit(hdev->dev,
10158c2ecf20Sopenharmony_ci				"kernel ctx was alive during hard reset, something is terribly wrong\n");
10168c2ecf20Sopenharmony_ci			rc = -EBUSY;
10178c2ecf20Sopenharmony_ci			goto out_err;
10188c2ecf20Sopenharmony_ci		}
10198c2ecf20Sopenharmony_ci
10208c2ecf20Sopenharmony_ci		rc = hl_mmu_init(hdev);
10218c2ecf20Sopenharmony_ci		if (rc) {
10228c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
10238c2ecf20Sopenharmony_ci				"Failed to initialize MMU S/W after hard reset\n");
10248c2ecf20Sopenharmony_ci			goto out_err;
10258c2ecf20Sopenharmony_ci		}
10268c2ecf20Sopenharmony_ci
10278c2ecf20Sopenharmony_ci		/* Allocate the kernel context */
10288c2ecf20Sopenharmony_ci		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
10298c2ecf20Sopenharmony_ci						GFP_KERNEL);
10308c2ecf20Sopenharmony_ci		if (!hdev->kernel_ctx) {
10318c2ecf20Sopenharmony_ci			rc = -ENOMEM;
10328c2ecf20Sopenharmony_ci			hl_mmu_fini(hdev);
10338c2ecf20Sopenharmony_ci			goto out_err;
10348c2ecf20Sopenharmony_ci		}
10358c2ecf20Sopenharmony_ci
10368c2ecf20Sopenharmony_ci		hdev->compute_ctx = NULL;
10378c2ecf20Sopenharmony_ci
10388c2ecf20Sopenharmony_ci		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
10398c2ecf20Sopenharmony_ci		if (rc) {
10408c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
10418c2ecf20Sopenharmony_ci				"failed to init kernel ctx in hard reset\n");
10428c2ecf20Sopenharmony_ci			kfree(hdev->kernel_ctx);
10438c2ecf20Sopenharmony_ci			hdev->kernel_ctx = NULL;
10448c2ecf20Sopenharmony_ci			hl_mmu_fini(hdev);
10458c2ecf20Sopenharmony_ci			goto out_err;
10468c2ecf20Sopenharmony_ci		}
10478c2ecf20Sopenharmony_ci	}
10488c2ecf20Sopenharmony_ci
10498c2ecf20Sopenharmony_ci	/* Device is now enabled as part of the initialization requires
10508c2ecf20Sopenharmony_ci	 * communication with the device firmware to get information that
10518c2ecf20Sopenharmony_ci	 * is required for the initialization itself
10528c2ecf20Sopenharmony_ci	 */
10538c2ecf20Sopenharmony_ci	hdev->disabled = false;
10548c2ecf20Sopenharmony_ci
10558c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->hw_init(hdev);
10568c2ecf20Sopenharmony_ci	if (rc) {
10578c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
10588c2ecf20Sopenharmony_ci			"failed to initialize the H/W after reset\n");
10598c2ecf20Sopenharmony_ci		goto out_err;
10608c2ecf20Sopenharmony_ci	}
10618c2ecf20Sopenharmony_ci
10628c2ecf20Sopenharmony_ci	/* Check that the communication with the device is working */
10638c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->test_queues(hdev);
10648c2ecf20Sopenharmony_ci	if (rc) {
10658c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
10668c2ecf20Sopenharmony_ci			"Failed to detect if device is alive after reset\n");
10678c2ecf20Sopenharmony_ci		goto out_err;
10688c2ecf20Sopenharmony_ci	}
10698c2ecf20Sopenharmony_ci
10708c2ecf20Sopenharmony_ci	if (hard_reset) {
10718c2ecf20Sopenharmony_ci		rc = device_late_init(hdev);
10728c2ecf20Sopenharmony_ci		if (rc) {
10738c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
10748c2ecf20Sopenharmony_ci				"Failed late init after hard reset\n");
10758c2ecf20Sopenharmony_ci			goto out_err;
10768c2ecf20Sopenharmony_ci		}
10778c2ecf20Sopenharmony_ci
10788c2ecf20Sopenharmony_ci		rc = hl_vm_init(hdev);
10798c2ecf20Sopenharmony_ci		if (rc) {
10808c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
10818c2ecf20Sopenharmony_ci				"Failed to init memory module after hard reset\n");
10828c2ecf20Sopenharmony_ci			goto out_err;
10838c2ecf20Sopenharmony_ci		}
10848c2ecf20Sopenharmony_ci
10858c2ecf20Sopenharmony_ci		hl_set_max_power(hdev);
10868c2ecf20Sopenharmony_ci	} else {
10878c2ecf20Sopenharmony_ci		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
10888c2ecf20Sopenharmony_ci		if (rc) {
10898c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
10908c2ecf20Sopenharmony_ci				"Failed late init after soft reset\n");
10918c2ecf20Sopenharmony_ci			goto out_err;
10928c2ecf20Sopenharmony_ci		}
10938c2ecf20Sopenharmony_ci	}
10948c2ecf20Sopenharmony_ci
10958c2ecf20Sopenharmony_ci	atomic_set(&hdev->in_reset, 0);
10968c2ecf20Sopenharmony_ci
10978c2ecf20Sopenharmony_ci	if (hard_reset)
10988c2ecf20Sopenharmony_ci		hdev->hard_reset_cnt++;
10998c2ecf20Sopenharmony_ci	else
11008c2ecf20Sopenharmony_ci		hdev->soft_reset_cnt++;
11018c2ecf20Sopenharmony_ci
11028c2ecf20Sopenharmony_ci	dev_warn(hdev->dev, "Successfully finished resetting the device\n");
11038c2ecf20Sopenharmony_ci
11048c2ecf20Sopenharmony_ci	return 0;
11058c2ecf20Sopenharmony_ci
11068c2ecf20Sopenharmony_ciout_err:
11078c2ecf20Sopenharmony_ci	hdev->disabled = true;
11088c2ecf20Sopenharmony_ci
11098c2ecf20Sopenharmony_ci	if (hard_reset) {
11108c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
11118c2ecf20Sopenharmony_ci			"Failed to reset! Device is NOT usable\n");
11128c2ecf20Sopenharmony_ci		hdev->hard_reset_cnt++;
11138c2ecf20Sopenharmony_ci	} else {
11148c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
11158c2ecf20Sopenharmony_ci			"Failed to do soft-reset, trying hard reset\n");
11168c2ecf20Sopenharmony_ci		hdev->soft_reset_cnt++;
11178c2ecf20Sopenharmony_ci		hard_reset = true;
11188c2ecf20Sopenharmony_ci		goto again;
11198c2ecf20Sopenharmony_ci	}
11208c2ecf20Sopenharmony_ci
11218c2ecf20Sopenharmony_ci	atomic_set(&hdev->in_reset, 0);
11228c2ecf20Sopenharmony_ci
11238c2ecf20Sopenharmony_ci	return rc;
11248c2ecf20Sopenharmony_ci}
11258c2ecf20Sopenharmony_ci
11268c2ecf20Sopenharmony_ci/*
11278c2ecf20Sopenharmony_ci * hl_device_init - main initialization function for habanalabs device
11288c2ecf20Sopenharmony_ci *
11298c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
11308c2ecf20Sopenharmony_ci *
11318c2ecf20Sopenharmony_ci * Allocate an id for the device, do early initialization and then call the
11328c2ecf20Sopenharmony_ci * ASIC specific initialization functions. Finally, create the cdev and the
11338c2ecf20Sopenharmony_ci * Linux device to expose it to the user
11348c2ecf20Sopenharmony_ci */
11358c2ecf20Sopenharmony_ciint hl_device_init(struct hl_device *hdev, struct class *hclass)
11368c2ecf20Sopenharmony_ci{
11378c2ecf20Sopenharmony_ci	int i, rc, cq_cnt, cq_ready_cnt;
11388c2ecf20Sopenharmony_ci	char *name;
11398c2ecf20Sopenharmony_ci	bool add_cdev_sysfs_on_err = false;
11408c2ecf20Sopenharmony_ci
11418c2ecf20Sopenharmony_ci	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
11428c2ecf20Sopenharmony_ci	if (!name) {
11438c2ecf20Sopenharmony_ci		rc = -ENOMEM;
11448c2ecf20Sopenharmony_ci		goto out_disabled;
11458c2ecf20Sopenharmony_ci	}
11468c2ecf20Sopenharmony_ci
11478c2ecf20Sopenharmony_ci	/* Initialize cdev and device structures */
11488c2ecf20Sopenharmony_ci	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
11498c2ecf20Sopenharmony_ci				&hdev->cdev, &hdev->dev);
11508c2ecf20Sopenharmony_ci
11518c2ecf20Sopenharmony_ci	kfree(name);
11528c2ecf20Sopenharmony_ci
11538c2ecf20Sopenharmony_ci	if (rc)
11548c2ecf20Sopenharmony_ci		goto out_disabled;
11558c2ecf20Sopenharmony_ci
11568c2ecf20Sopenharmony_ci	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
11578c2ecf20Sopenharmony_ci	if (!name) {
11588c2ecf20Sopenharmony_ci		rc = -ENOMEM;
11598c2ecf20Sopenharmony_ci		goto free_dev;
11608c2ecf20Sopenharmony_ci	}
11618c2ecf20Sopenharmony_ci
11628c2ecf20Sopenharmony_ci	/* Initialize cdev and device structures for control device */
11638c2ecf20Sopenharmony_ci	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
11648c2ecf20Sopenharmony_ci				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
11658c2ecf20Sopenharmony_ci
11668c2ecf20Sopenharmony_ci	kfree(name);
11678c2ecf20Sopenharmony_ci
11688c2ecf20Sopenharmony_ci	if (rc)
11698c2ecf20Sopenharmony_ci		goto free_dev;
11708c2ecf20Sopenharmony_ci
11718c2ecf20Sopenharmony_ci	/* Initialize ASIC function pointers and perform early init */
11728c2ecf20Sopenharmony_ci	rc = device_early_init(hdev);
11738c2ecf20Sopenharmony_ci	if (rc)
11748c2ecf20Sopenharmony_ci		goto free_dev_ctrl;
11758c2ecf20Sopenharmony_ci
11768c2ecf20Sopenharmony_ci	/*
11778c2ecf20Sopenharmony_ci	 * Start calling ASIC initialization. First S/W then H/W and finally
11788c2ecf20Sopenharmony_ci	 * late init
11798c2ecf20Sopenharmony_ci	 */
11808c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->sw_init(hdev);
11818c2ecf20Sopenharmony_ci	if (rc)
11828c2ecf20Sopenharmony_ci		goto early_fini;
11838c2ecf20Sopenharmony_ci
11848c2ecf20Sopenharmony_ci	/*
11858c2ecf20Sopenharmony_ci	 * Initialize the H/W queues. Must be done before hw_init, because
11868c2ecf20Sopenharmony_ci	 * there the addresses of the kernel queue are being written to the
11878c2ecf20Sopenharmony_ci	 * registers of the device
11888c2ecf20Sopenharmony_ci	 */
11898c2ecf20Sopenharmony_ci	rc = hl_hw_queues_create(hdev);
11908c2ecf20Sopenharmony_ci	if (rc) {
11918c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize kernel queues\n");
11928c2ecf20Sopenharmony_ci		goto sw_fini;
11938c2ecf20Sopenharmony_ci	}
11948c2ecf20Sopenharmony_ci
11958c2ecf20Sopenharmony_ci	cq_cnt = hdev->asic_prop.completion_queues_count;
11968c2ecf20Sopenharmony_ci
11978c2ecf20Sopenharmony_ci	/*
11988c2ecf20Sopenharmony_ci	 * Initialize the completion queues. Must be done before hw_init,
11998c2ecf20Sopenharmony_ci	 * because there the addresses of the completion queues are being
12008c2ecf20Sopenharmony_ci	 * passed as arguments to request_irq
12018c2ecf20Sopenharmony_ci	 */
12028c2ecf20Sopenharmony_ci	if (cq_cnt) {
12038c2ecf20Sopenharmony_ci		hdev->completion_queue = kcalloc(cq_cnt,
12048c2ecf20Sopenharmony_ci				sizeof(*hdev->completion_queue),
12058c2ecf20Sopenharmony_ci				GFP_KERNEL);
12068c2ecf20Sopenharmony_ci
12078c2ecf20Sopenharmony_ci		if (!hdev->completion_queue) {
12088c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
12098c2ecf20Sopenharmony_ci				"failed to allocate completion queues\n");
12108c2ecf20Sopenharmony_ci			rc = -ENOMEM;
12118c2ecf20Sopenharmony_ci			goto hw_queues_destroy;
12128c2ecf20Sopenharmony_ci		}
12138c2ecf20Sopenharmony_ci	}
12148c2ecf20Sopenharmony_ci
12158c2ecf20Sopenharmony_ci	for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
12168c2ecf20Sopenharmony_ci		rc = hl_cq_init(hdev, &hdev->completion_queue[i],
12178c2ecf20Sopenharmony_ci				hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
12188c2ecf20Sopenharmony_ci		if (rc) {
12198c2ecf20Sopenharmony_ci			dev_err(hdev->dev,
12208c2ecf20Sopenharmony_ci				"failed to initialize completion queue\n");
12218c2ecf20Sopenharmony_ci			goto cq_fini;
12228c2ecf20Sopenharmony_ci		}
12238c2ecf20Sopenharmony_ci		hdev->completion_queue[i].cq_idx = i;
12248c2ecf20Sopenharmony_ci	}
12258c2ecf20Sopenharmony_ci
12268c2ecf20Sopenharmony_ci	/*
12278c2ecf20Sopenharmony_ci	 * Initialize the event queue. Must be done before hw_init,
12288c2ecf20Sopenharmony_ci	 * because there the address of the event queue is being
12298c2ecf20Sopenharmony_ci	 * passed as argument to request_irq
12308c2ecf20Sopenharmony_ci	 */
12318c2ecf20Sopenharmony_ci	rc = hl_eq_init(hdev, &hdev->event_queue);
12328c2ecf20Sopenharmony_ci	if (rc) {
12338c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize event queue\n");
12348c2ecf20Sopenharmony_ci		goto cq_fini;
12358c2ecf20Sopenharmony_ci	}
12368c2ecf20Sopenharmony_ci
12378c2ecf20Sopenharmony_ci	/* MMU S/W must be initialized before kernel context is created */
12388c2ecf20Sopenharmony_ci	rc = hl_mmu_init(hdev);
12398c2ecf20Sopenharmony_ci	if (rc) {
12408c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
12418c2ecf20Sopenharmony_ci		goto eq_fini;
12428c2ecf20Sopenharmony_ci	}
12438c2ecf20Sopenharmony_ci
12448c2ecf20Sopenharmony_ci	/* Allocate the kernel context */
12458c2ecf20Sopenharmony_ci	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
12468c2ecf20Sopenharmony_ci	if (!hdev->kernel_ctx) {
12478c2ecf20Sopenharmony_ci		rc = -ENOMEM;
12488c2ecf20Sopenharmony_ci		goto mmu_fini;
12498c2ecf20Sopenharmony_ci	}
12508c2ecf20Sopenharmony_ci
12518c2ecf20Sopenharmony_ci	hdev->compute_ctx = NULL;
12528c2ecf20Sopenharmony_ci
12538c2ecf20Sopenharmony_ci	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
12548c2ecf20Sopenharmony_ci	if (rc) {
12558c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize kernel context\n");
12568c2ecf20Sopenharmony_ci		kfree(hdev->kernel_ctx);
12578c2ecf20Sopenharmony_ci		goto mmu_fini;
12588c2ecf20Sopenharmony_ci	}
12598c2ecf20Sopenharmony_ci
12608c2ecf20Sopenharmony_ci	rc = hl_cb_pool_init(hdev);
12618c2ecf20Sopenharmony_ci	if (rc) {
12628c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize CB pool\n");
12638c2ecf20Sopenharmony_ci		goto release_ctx;
12648c2ecf20Sopenharmony_ci	}
12658c2ecf20Sopenharmony_ci
12668c2ecf20Sopenharmony_ci	hl_debugfs_add_device(hdev);
12678c2ecf20Sopenharmony_ci
12688c2ecf20Sopenharmony_ci	if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
12698c2ecf20Sopenharmony_ci		dev_info(hdev->dev,
12708c2ecf20Sopenharmony_ci			"H/W state is dirty, must reset before initializing\n");
12718c2ecf20Sopenharmony_ci		hdev->asic_funcs->halt_engines(hdev, true);
12728c2ecf20Sopenharmony_ci		hdev->asic_funcs->hw_fini(hdev, true);
12738c2ecf20Sopenharmony_ci	}
12748c2ecf20Sopenharmony_ci
12758c2ecf20Sopenharmony_ci	/*
12768c2ecf20Sopenharmony_ci	 * From this point, in case of an error, add char devices and create
12778c2ecf20Sopenharmony_ci	 * sysfs nodes as part of the error flow, to allow debugging.
12788c2ecf20Sopenharmony_ci	 */
12798c2ecf20Sopenharmony_ci	add_cdev_sysfs_on_err = true;
12808c2ecf20Sopenharmony_ci
12818c2ecf20Sopenharmony_ci	/* Device is now enabled as part of the initialization requires
12828c2ecf20Sopenharmony_ci	 * communication with the device firmware to get information that
12838c2ecf20Sopenharmony_ci	 * is required for the initialization itself
12848c2ecf20Sopenharmony_ci	 */
12858c2ecf20Sopenharmony_ci	hdev->disabled = false;
12868c2ecf20Sopenharmony_ci
12878c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->hw_init(hdev);
12888c2ecf20Sopenharmony_ci	if (rc) {
12898c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "failed to initialize the H/W\n");
12908c2ecf20Sopenharmony_ci		rc = 0;
12918c2ecf20Sopenharmony_ci		goto out_disabled;
12928c2ecf20Sopenharmony_ci	}
12938c2ecf20Sopenharmony_ci
12948c2ecf20Sopenharmony_ci	/* Check that the communication with the device is working */
12958c2ecf20Sopenharmony_ci	rc = hdev->asic_funcs->test_queues(hdev);
12968c2ecf20Sopenharmony_ci	if (rc) {
12978c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to detect if device is alive\n");
12988c2ecf20Sopenharmony_ci		rc = 0;
12998c2ecf20Sopenharmony_ci		goto out_disabled;
13008c2ecf20Sopenharmony_ci	}
13018c2ecf20Sopenharmony_ci
13028c2ecf20Sopenharmony_ci	rc = device_late_init(hdev);
13038c2ecf20Sopenharmony_ci	if (rc) {
13048c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed late initialization\n");
13058c2ecf20Sopenharmony_ci		rc = 0;
13068c2ecf20Sopenharmony_ci		goto out_disabled;
13078c2ecf20Sopenharmony_ci	}
13088c2ecf20Sopenharmony_ci
13098c2ecf20Sopenharmony_ci	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
13108c2ecf20Sopenharmony_ci		hdev->asic_name,
13118c2ecf20Sopenharmony_ci		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
13128c2ecf20Sopenharmony_ci
13138c2ecf20Sopenharmony_ci	rc = hl_vm_init(hdev);
13148c2ecf20Sopenharmony_ci	if (rc) {
13158c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize memory module\n");
13168c2ecf20Sopenharmony_ci		rc = 0;
13178c2ecf20Sopenharmony_ci		goto out_disabled;
13188c2ecf20Sopenharmony_ci	}
13198c2ecf20Sopenharmony_ci
13208c2ecf20Sopenharmony_ci	/*
13218c2ecf20Sopenharmony_ci	 * Expose devices and sysfs nodes to user.
13228c2ecf20Sopenharmony_ci	 * From here there is no need to add char devices and create sysfs nodes
13238c2ecf20Sopenharmony_ci	 * in case of an error.
13248c2ecf20Sopenharmony_ci	 */
13258c2ecf20Sopenharmony_ci	add_cdev_sysfs_on_err = false;
13268c2ecf20Sopenharmony_ci	rc = device_cdev_sysfs_add(hdev);
13278c2ecf20Sopenharmony_ci	if (rc) {
13288c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
13298c2ecf20Sopenharmony_ci			"Failed to add char devices and sysfs nodes\n");
13308c2ecf20Sopenharmony_ci		rc = 0;
13318c2ecf20Sopenharmony_ci		goto out_disabled;
13328c2ecf20Sopenharmony_ci	}
13338c2ecf20Sopenharmony_ci
13348c2ecf20Sopenharmony_ci	/* Need to call this again because the max power might change,
13358c2ecf20Sopenharmony_ci	 * depending on card type for certain ASICs
13368c2ecf20Sopenharmony_ci	 */
13378c2ecf20Sopenharmony_ci	hl_set_max_power(hdev);
13388c2ecf20Sopenharmony_ci
13398c2ecf20Sopenharmony_ci	/*
13408c2ecf20Sopenharmony_ci	 * hl_hwmon_init() must be called after device_late_init(), because only
13418c2ecf20Sopenharmony_ci	 * there we get the information from the device about which
13428c2ecf20Sopenharmony_ci	 * hwmon-related sensors the device supports.
13438c2ecf20Sopenharmony_ci	 * Furthermore, it must be done after adding the device to the system.
13448c2ecf20Sopenharmony_ci	 */
13458c2ecf20Sopenharmony_ci	rc = hl_hwmon_init(hdev);
13468c2ecf20Sopenharmony_ci	if (rc) {
13478c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "Failed to initialize hwmon\n");
13488c2ecf20Sopenharmony_ci		rc = 0;
13498c2ecf20Sopenharmony_ci		goto out_disabled;
13508c2ecf20Sopenharmony_ci	}
13518c2ecf20Sopenharmony_ci
13528c2ecf20Sopenharmony_ci	dev_notice(hdev->dev,
13538c2ecf20Sopenharmony_ci		"Successfully added device to habanalabs driver\n");
13548c2ecf20Sopenharmony_ci
13558c2ecf20Sopenharmony_ci	hdev->init_done = true;
13568c2ecf20Sopenharmony_ci
13578c2ecf20Sopenharmony_ci	return 0;
13588c2ecf20Sopenharmony_ci
13598c2ecf20Sopenharmony_cirelease_ctx:
13608c2ecf20Sopenharmony_ci	if (hl_ctx_put(hdev->kernel_ctx) != 1)
13618c2ecf20Sopenharmony_ci		dev_err(hdev->dev,
13628c2ecf20Sopenharmony_ci			"kernel ctx is still alive on initialization failure\n");
13638c2ecf20Sopenharmony_cimmu_fini:
13648c2ecf20Sopenharmony_ci	hl_mmu_fini(hdev);
13658c2ecf20Sopenharmony_cieq_fini:
13668c2ecf20Sopenharmony_ci	hl_eq_fini(hdev, &hdev->event_queue);
13678c2ecf20Sopenharmony_cicq_fini:
13688c2ecf20Sopenharmony_ci	for (i = 0 ; i < cq_ready_cnt ; i++)
13698c2ecf20Sopenharmony_ci		hl_cq_fini(hdev, &hdev->completion_queue[i]);
13708c2ecf20Sopenharmony_ci	kfree(hdev->completion_queue);
13718c2ecf20Sopenharmony_cihw_queues_destroy:
13728c2ecf20Sopenharmony_ci	hl_hw_queues_destroy(hdev);
13738c2ecf20Sopenharmony_cisw_fini:
13748c2ecf20Sopenharmony_ci	hdev->asic_funcs->sw_fini(hdev);
13758c2ecf20Sopenharmony_ciearly_fini:
13768c2ecf20Sopenharmony_ci	device_early_fini(hdev);
13778c2ecf20Sopenharmony_cifree_dev_ctrl:
13788c2ecf20Sopenharmony_ci	put_device(hdev->dev_ctrl);
13798c2ecf20Sopenharmony_cifree_dev:
13808c2ecf20Sopenharmony_ci	put_device(hdev->dev);
13818c2ecf20Sopenharmony_ciout_disabled:
13828c2ecf20Sopenharmony_ci	hdev->disabled = true;
13838c2ecf20Sopenharmony_ci	if (add_cdev_sysfs_on_err)
13848c2ecf20Sopenharmony_ci		device_cdev_sysfs_add(hdev);
13858c2ecf20Sopenharmony_ci	if (hdev->pdev)
13868c2ecf20Sopenharmony_ci		dev_err(&hdev->pdev->dev,
13878c2ecf20Sopenharmony_ci			"Failed to initialize hl%d. Device is NOT usable !\n",
13888c2ecf20Sopenharmony_ci			hdev->id / 2);
13898c2ecf20Sopenharmony_ci	else
13908c2ecf20Sopenharmony_ci		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
13918c2ecf20Sopenharmony_ci			hdev->id / 2);
13928c2ecf20Sopenharmony_ci
13938c2ecf20Sopenharmony_ci	return rc;
13948c2ecf20Sopenharmony_ci}
13958c2ecf20Sopenharmony_ci
13968c2ecf20Sopenharmony_ci/*
13978c2ecf20Sopenharmony_ci * hl_device_fini - main tear-down function for habanalabs device
13988c2ecf20Sopenharmony_ci *
13998c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
14008c2ecf20Sopenharmony_ci *
14018c2ecf20Sopenharmony_ci * Destroy the device, call ASIC fini functions and release the id
14028c2ecf20Sopenharmony_ci */
14038c2ecf20Sopenharmony_civoid hl_device_fini(struct hl_device *hdev)
14048c2ecf20Sopenharmony_ci{
14058c2ecf20Sopenharmony_ci	int i, rc;
14068c2ecf20Sopenharmony_ci	ktime_t timeout;
14078c2ecf20Sopenharmony_ci
14088c2ecf20Sopenharmony_ci	dev_info(hdev->dev, "Removing device\n");
14098c2ecf20Sopenharmony_ci
14108c2ecf20Sopenharmony_ci	/*
14118c2ecf20Sopenharmony_ci	 * This function is competing with the reset function, so try to
14128c2ecf20Sopenharmony_ci	 * take the reset atomic and if we are already in middle of reset,
14138c2ecf20Sopenharmony_ci	 * wait until reset function is finished. Reset function is designed
14148c2ecf20Sopenharmony_ci	 * to always finish. However, in Gaudi, because of all the network
14158c2ecf20Sopenharmony_ci	 * ports, the hard reset could take between 10-30 seconds
14168c2ecf20Sopenharmony_ci	 */
14178c2ecf20Sopenharmony_ci
14188c2ecf20Sopenharmony_ci	timeout = ktime_add_us(ktime_get(),
14198c2ecf20Sopenharmony_ci				HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
14208c2ecf20Sopenharmony_ci	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
14218c2ecf20Sopenharmony_ci	while (rc) {
14228c2ecf20Sopenharmony_ci		usleep_range(50, 200);
14238c2ecf20Sopenharmony_ci		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
14248c2ecf20Sopenharmony_ci		if (ktime_compare(ktime_get(), timeout) > 0) {
14258c2ecf20Sopenharmony_ci			WARN(1, "Failed to remove device because reset function did not finish\n");
14268c2ecf20Sopenharmony_ci			return;
14278c2ecf20Sopenharmony_ci		}
14288c2ecf20Sopenharmony_ci	}
14298c2ecf20Sopenharmony_ci
14308c2ecf20Sopenharmony_ci	/* Disable PCI access from device F/W so it won't send us additional
14318c2ecf20Sopenharmony_ci	 * interrupts. We disable MSI/MSI-X at the halt_engines function and we
14328c2ecf20Sopenharmony_ci	 * can't have the F/W sending us interrupts after that. We need to
14338c2ecf20Sopenharmony_ci	 * disable the access here because if the device is marked disable, the
14348c2ecf20Sopenharmony_ci	 * message won't be send. Also, in case of heartbeat, the device CPU is
14358c2ecf20Sopenharmony_ci	 * marked as disable so this message won't be sent
14368c2ecf20Sopenharmony_ci	 */
14378c2ecf20Sopenharmony_ci	hl_fw_send_pci_access_msg(hdev,	CPUCP_PACKET_DISABLE_PCI_ACCESS);
14388c2ecf20Sopenharmony_ci
14398c2ecf20Sopenharmony_ci	/* Mark device as disabled */
14408c2ecf20Sopenharmony_ci	hdev->disabled = true;
14418c2ecf20Sopenharmony_ci
14428c2ecf20Sopenharmony_ci	/* Flush anyone that is inside the critical section of enqueue
14438c2ecf20Sopenharmony_ci	 * jobs to the H/W
14448c2ecf20Sopenharmony_ci	 */
14458c2ecf20Sopenharmony_ci	hdev->asic_funcs->hw_queues_lock(hdev);
14468c2ecf20Sopenharmony_ci	hdev->asic_funcs->hw_queues_unlock(hdev);
14478c2ecf20Sopenharmony_ci
14488c2ecf20Sopenharmony_ci	/* Flush anyone that is inside device open */
14498c2ecf20Sopenharmony_ci	mutex_lock(&hdev->fpriv_list_lock);
14508c2ecf20Sopenharmony_ci	mutex_unlock(&hdev->fpriv_list_lock);
14518c2ecf20Sopenharmony_ci
14528c2ecf20Sopenharmony_ci	hdev->hard_reset_pending = true;
14538c2ecf20Sopenharmony_ci
14548c2ecf20Sopenharmony_ci	hl_hwmon_fini(hdev);
14558c2ecf20Sopenharmony_ci
14568c2ecf20Sopenharmony_ci	device_late_fini(hdev);
14578c2ecf20Sopenharmony_ci
14588c2ecf20Sopenharmony_ci	hl_debugfs_remove_device(hdev);
14598c2ecf20Sopenharmony_ci
14608c2ecf20Sopenharmony_ci	/*
14618c2ecf20Sopenharmony_ci	 * Halt the engines and disable interrupts so we won't get any more
14628c2ecf20Sopenharmony_ci	 * completions from H/W and we won't have any accesses from the
14638c2ecf20Sopenharmony_ci	 * H/W to the host machine
14648c2ecf20Sopenharmony_ci	 */
14658c2ecf20Sopenharmony_ci	hdev->asic_funcs->halt_engines(hdev, true);
14668c2ecf20Sopenharmony_ci
14678c2ecf20Sopenharmony_ci	/* Go over all the queues, release all CS and their jobs */
14688c2ecf20Sopenharmony_ci	hl_cs_rollback_all(hdev);
14698c2ecf20Sopenharmony_ci
14708c2ecf20Sopenharmony_ci	/* Kill processes here after CS rollback. This is because the process
14718c2ecf20Sopenharmony_ci	 * can't really exit until all its CSs are done, which is what we
14728c2ecf20Sopenharmony_ci	 * do in cs rollback
14738c2ecf20Sopenharmony_ci	 */
14748c2ecf20Sopenharmony_ci	rc = device_kill_open_processes(hdev);
14758c2ecf20Sopenharmony_ci	if (rc)
14768c2ecf20Sopenharmony_ci		dev_crit(hdev->dev, "Failed to kill all open processes\n");
14778c2ecf20Sopenharmony_ci
14788c2ecf20Sopenharmony_ci	hl_cb_pool_fini(hdev);
14798c2ecf20Sopenharmony_ci
14808c2ecf20Sopenharmony_ci	/* Reset the H/W. It will be in idle state after this returns */
14818c2ecf20Sopenharmony_ci	hdev->asic_funcs->hw_fini(hdev, true);
14828c2ecf20Sopenharmony_ci
14838c2ecf20Sopenharmony_ci	/* Release kernel context */
14848c2ecf20Sopenharmony_ci	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
14858c2ecf20Sopenharmony_ci		dev_err(hdev->dev, "kernel ctx is still alive\n");
14868c2ecf20Sopenharmony_ci
14878c2ecf20Sopenharmony_ci	hl_vm_fini(hdev);
14888c2ecf20Sopenharmony_ci
14898c2ecf20Sopenharmony_ci	hl_mmu_fini(hdev);
14908c2ecf20Sopenharmony_ci
14918c2ecf20Sopenharmony_ci	hl_eq_fini(hdev, &hdev->event_queue);
14928c2ecf20Sopenharmony_ci
14938c2ecf20Sopenharmony_ci	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
14948c2ecf20Sopenharmony_ci		hl_cq_fini(hdev, &hdev->completion_queue[i]);
14958c2ecf20Sopenharmony_ci	kfree(hdev->completion_queue);
14968c2ecf20Sopenharmony_ci
14978c2ecf20Sopenharmony_ci	hl_hw_queues_destroy(hdev);
14988c2ecf20Sopenharmony_ci
14998c2ecf20Sopenharmony_ci	/* Call ASIC S/W finalize function */
15008c2ecf20Sopenharmony_ci	hdev->asic_funcs->sw_fini(hdev);
15018c2ecf20Sopenharmony_ci
15028c2ecf20Sopenharmony_ci	device_early_fini(hdev);
15038c2ecf20Sopenharmony_ci
15048c2ecf20Sopenharmony_ci	/* Hide devices and sysfs nodes from user */
15058c2ecf20Sopenharmony_ci	device_cdev_sysfs_del(hdev);
15068c2ecf20Sopenharmony_ci
15078c2ecf20Sopenharmony_ci	pr_info("removed device successfully\n");
15088c2ecf20Sopenharmony_ci}
15098c2ecf20Sopenharmony_ci
15108c2ecf20Sopenharmony_ci/*
15118c2ecf20Sopenharmony_ci * MMIO register access helper functions.
15128c2ecf20Sopenharmony_ci */
15138c2ecf20Sopenharmony_ci
15148c2ecf20Sopenharmony_ci/*
15158c2ecf20Sopenharmony_ci * hl_rreg - Read an MMIO register
15168c2ecf20Sopenharmony_ci *
15178c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
15188c2ecf20Sopenharmony_ci * @reg: MMIO register offset (in bytes)
15198c2ecf20Sopenharmony_ci *
15208c2ecf20Sopenharmony_ci * Returns the value of the MMIO register we are asked to read
15218c2ecf20Sopenharmony_ci *
15228c2ecf20Sopenharmony_ci */
15238c2ecf20Sopenharmony_ciinline u32 hl_rreg(struct hl_device *hdev, u32 reg)
15248c2ecf20Sopenharmony_ci{
15258c2ecf20Sopenharmony_ci	return readl(hdev->rmmio + reg);
15268c2ecf20Sopenharmony_ci}
15278c2ecf20Sopenharmony_ci
15288c2ecf20Sopenharmony_ci/*
15298c2ecf20Sopenharmony_ci * hl_wreg - Write to an MMIO register
15308c2ecf20Sopenharmony_ci *
15318c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure
15328c2ecf20Sopenharmony_ci * @reg: MMIO register offset (in bytes)
15338c2ecf20Sopenharmony_ci * @val: 32-bit value
15348c2ecf20Sopenharmony_ci *
15358c2ecf20Sopenharmony_ci * Writes the 32-bit value into the MMIO register
15368c2ecf20Sopenharmony_ci *
15378c2ecf20Sopenharmony_ci */
15388c2ecf20Sopenharmony_ciinline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
15398c2ecf20Sopenharmony_ci{
15408c2ecf20Sopenharmony_ci	writel(val, hdev->rmmio + reg);
15418c2ecf20Sopenharmony_ci}
1542