18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci 38c2ecf20Sopenharmony_ci/* 48c2ecf20Sopenharmony_ci * Copyright 2016-2019 HabanaLabs, Ltd. 58c2ecf20Sopenharmony_ci * All Rights Reserved. 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci#define pr_fmt(fmt) "habanalabs: " fmt 98c2ecf20Sopenharmony_ci 108c2ecf20Sopenharmony_ci#include "habanalabs.h" 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci#include <linux/pci.h> 138c2ecf20Sopenharmony_ci#include <linux/sched/signal.h> 148c2ecf20Sopenharmony_ci#include <linux/hwmon.h> 158c2ecf20Sopenharmony_ci#include <uapi/misc/habanalabs.h> 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci#define HL_PLDM_PENDING_RESET_PER_SEC (HL_PENDING_RESET_PER_SEC * 10) 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_cibool hl_device_disabled_or_in_reset(struct hl_device *hdev) 208c2ecf20Sopenharmony_ci{ 218c2ecf20Sopenharmony_ci if ((hdev->disabled) || (atomic_read(&hdev->in_reset))) 228c2ecf20Sopenharmony_ci return true; 238c2ecf20Sopenharmony_ci else 248c2ecf20Sopenharmony_ci return false; 258c2ecf20Sopenharmony_ci} 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_cienum hl_device_status hl_device_status(struct hl_device *hdev) 288c2ecf20Sopenharmony_ci{ 298c2ecf20Sopenharmony_ci enum hl_device_status status; 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci if (hdev->disabled) 328c2ecf20Sopenharmony_ci status = HL_DEVICE_STATUS_MALFUNCTION; 338c2ecf20Sopenharmony_ci else if (atomic_read(&hdev->in_reset)) 348c2ecf20Sopenharmony_ci status = HL_DEVICE_STATUS_IN_RESET; 358c2ecf20Sopenharmony_ci else 368c2ecf20Sopenharmony_ci status = HL_DEVICE_STATUS_OPERATIONAL; 378c2ecf20Sopenharmony_ci 388c2ecf20Sopenharmony_ci return status; 398c2ecf20Sopenharmony_ci} 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_cistatic void hpriv_release(struct kref *ref) 428c2ecf20Sopenharmony_ci{ 438c2ecf20Sopenharmony_ci struct hl_fpriv *hpriv; 448c2ecf20Sopenharmony_ci struct hl_device *hdev; 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ci hpriv = container_of(ref, struct hl_fpriv, refcount); 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci hdev = hpriv->hdev; 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci put_pid(hpriv->taskpid); 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci hl_debugfs_remove_file(hpriv); 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_ci mutex_destroy(&hpriv->restore_phase_mutex); 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 578c2ecf20Sopenharmony_ci list_del(&hpriv->dev_node); 588c2ecf20Sopenharmony_ci hdev->compute_ctx = NULL; 598c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ci kfree(hpriv); 628c2ecf20Sopenharmony_ci} 638c2ecf20Sopenharmony_ci 648c2ecf20Sopenharmony_civoid hl_hpriv_get(struct hl_fpriv *hpriv) 658c2ecf20Sopenharmony_ci{ 668c2ecf20Sopenharmony_ci kref_get(&hpriv->refcount); 678c2ecf20Sopenharmony_ci} 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_civoid hl_hpriv_put(struct hl_fpriv *hpriv) 708c2ecf20Sopenharmony_ci{ 718c2ecf20Sopenharmony_ci kref_put(&hpriv->refcount, hpriv_release); 728c2ecf20Sopenharmony_ci} 738c2ecf20Sopenharmony_ci 748c2ecf20Sopenharmony_ci/* 758c2ecf20Sopenharmony_ci * hl_device_release - release function for habanalabs device 768c2ecf20Sopenharmony_ci * 778c2ecf20Sopenharmony_ci * @inode: pointer to inode structure 788c2ecf20Sopenharmony_ci * @filp: pointer to file structure 798c2ecf20Sopenharmony_ci * 808c2ecf20Sopenharmony_ci * Called when process closes an habanalabs device 818c2ecf20Sopenharmony_ci */ 828c2ecf20Sopenharmony_cistatic int hl_device_release(struct inode *inode, struct file *filp) 838c2ecf20Sopenharmony_ci{ 848c2ecf20Sopenharmony_ci struct hl_fpriv *hpriv = filp->private_data; 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr); 878c2ecf20Sopenharmony_ci hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci filp->private_data = NULL; 908c2ecf20Sopenharmony_ci 918c2ecf20Sopenharmony_ci hl_hpriv_put(hpriv); 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci return 0; 948c2ecf20Sopenharmony_ci} 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_cistatic int hl_device_release_ctrl(struct inode *inode, struct file *filp) 978c2ecf20Sopenharmony_ci{ 988c2ecf20Sopenharmony_ci struct hl_fpriv *hpriv = filp->private_data; 998c2ecf20Sopenharmony_ci struct hl_device *hdev; 1008c2ecf20Sopenharmony_ci 1018c2ecf20Sopenharmony_ci filp->private_data = NULL; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci hdev = hpriv->hdev; 1048c2ecf20Sopenharmony_ci 1058c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 1068c2ecf20Sopenharmony_ci list_del(&hpriv->dev_node); 1078c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 1088c2ecf20Sopenharmony_ci 1098c2ecf20Sopenharmony_ci put_pid(hpriv->taskpid); 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci kfree(hpriv); 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci return 0; 1148c2ecf20Sopenharmony_ci} 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_ci/* 1178c2ecf20Sopenharmony_ci * hl_mmap - mmap function for habanalabs device 1188c2ecf20Sopenharmony_ci * 1198c2ecf20Sopenharmony_ci * @*filp: pointer to file structure 1208c2ecf20Sopenharmony_ci * @*vma: pointer to vm_area_struct of the process 1218c2ecf20Sopenharmony_ci * 1228c2ecf20Sopenharmony_ci * Called when process does an mmap on habanalabs device. Call the device's mmap 1238c2ecf20Sopenharmony_ci * function at the end of the common code. 1248c2ecf20Sopenharmony_ci */ 1258c2ecf20Sopenharmony_cistatic int hl_mmap(struct file *filp, struct vm_area_struct *vma) 1268c2ecf20Sopenharmony_ci{ 1278c2ecf20Sopenharmony_ci struct hl_fpriv *hpriv = filp->private_data; 1288c2ecf20Sopenharmony_ci unsigned long vm_pgoff; 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_ci vm_pgoff = vma->vm_pgoff; 1318c2ecf20Sopenharmony_ci vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff); 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci switch (vm_pgoff & HL_MMAP_TYPE_MASK) { 1348c2ecf20Sopenharmony_ci case HL_MMAP_TYPE_CB: 1358c2ecf20Sopenharmony_ci return hl_cb_mmap(hpriv, vma); 1368c2ecf20Sopenharmony_ci } 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci return -EINVAL; 1398c2ecf20Sopenharmony_ci} 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_cistatic const struct file_operations hl_ops = { 1428c2ecf20Sopenharmony_ci .owner = THIS_MODULE, 1438c2ecf20Sopenharmony_ci .open = hl_device_open, 1448c2ecf20Sopenharmony_ci .release = hl_device_release, 1458c2ecf20Sopenharmony_ci .mmap = hl_mmap, 1468c2ecf20Sopenharmony_ci .unlocked_ioctl = hl_ioctl, 1478c2ecf20Sopenharmony_ci .compat_ioctl = hl_ioctl 1488c2ecf20Sopenharmony_ci}; 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_cistatic const struct file_operations hl_ctrl_ops = { 1518c2ecf20Sopenharmony_ci .owner = THIS_MODULE, 1528c2ecf20Sopenharmony_ci .open = hl_device_open_ctrl, 1538c2ecf20Sopenharmony_ci .release = hl_device_release_ctrl, 1548c2ecf20Sopenharmony_ci .unlocked_ioctl = hl_ioctl_control, 1558c2ecf20Sopenharmony_ci .compat_ioctl = hl_ioctl_control 1568c2ecf20Sopenharmony_ci}; 1578c2ecf20Sopenharmony_ci 1588c2ecf20Sopenharmony_cistatic void device_release_func(struct device *dev) 1598c2ecf20Sopenharmony_ci{ 1608c2ecf20Sopenharmony_ci kfree(dev); 1618c2ecf20Sopenharmony_ci} 1628c2ecf20Sopenharmony_ci 1638c2ecf20Sopenharmony_ci/* 1648c2ecf20Sopenharmony_ci * device_init_cdev - Initialize cdev and device for habanalabs device 1658c2ecf20Sopenharmony_ci * 1668c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 1678c2ecf20Sopenharmony_ci * @hclass: pointer to the class object of the device 1688c2ecf20Sopenharmony_ci * @minor: minor number of the specific device 1698c2ecf20Sopenharmony_ci * @fpos: file operations to install for this device 1708c2ecf20Sopenharmony_ci * @name: name of the device as it will appear in the filesystem 1718c2ecf20Sopenharmony_ci * @cdev: pointer to the char device object that will be initialized 1728c2ecf20Sopenharmony_ci * @dev: pointer to the device object that will be initialized 1738c2ecf20Sopenharmony_ci * 1748c2ecf20Sopenharmony_ci * Initialize a cdev and a Linux device for habanalabs's device. 1758c2ecf20Sopenharmony_ci */ 1768c2ecf20Sopenharmony_cistatic int device_init_cdev(struct hl_device *hdev, struct class *hclass, 1778c2ecf20Sopenharmony_ci int minor, const struct file_operations *fops, 1788c2ecf20Sopenharmony_ci char *name, struct cdev *cdev, 1798c2ecf20Sopenharmony_ci struct device **dev) 1808c2ecf20Sopenharmony_ci{ 1818c2ecf20Sopenharmony_ci cdev_init(cdev, fops); 1828c2ecf20Sopenharmony_ci cdev->owner = THIS_MODULE; 1838c2ecf20Sopenharmony_ci 1848c2ecf20Sopenharmony_ci *dev = kzalloc(sizeof(**dev), GFP_KERNEL); 1858c2ecf20Sopenharmony_ci if (!*dev) 1868c2ecf20Sopenharmony_ci return -ENOMEM; 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci device_initialize(*dev); 1898c2ecf20Sopenharmony_ci (*dev)->devt = MKDEV(hdev->major, minor); 1908c2ecf20Sopenharmony_ci (*dev)->class = hclass; 1918c2ecf20Sopenharmony_ci (*dev)->release = device_release_func; 1928c2ecf20Sopenharmony_ci dev_set_drvdata(*dev, hdev); 1938c2ecf20Sopenharmony_ci dev_set_name(*dev, "%s", name); 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_ci return 0; 1968c2ecf20Sopenharmony_ci} 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_cistatic int device_cdev_sysfs_add(struct hl_device *hdev) 1998c2ecf20Sopenharmony_ci{ 2008c2ecf20Sopenharmony_ci int rc; 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci rc = cdev_device_add(&hdev->cdev, hdev->dev); 2038c2ecf20Sopenharmony_ci if (rc) { 2048c2ecf20Sopenharmony_ci dev_err(hdev->dev, 2058c2ecf20Sopenharmony_ci "failed to add a char device to the system\n"); 2068c2ecf20Sopenharmony_ci return rc; 2078c2ecf20Sopenharmony_ci } 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl); 2108c2ecf20Sopenharmony_ci if (rc) { 2118c2ecf20Sopenharmony_ci dev_err(hdev->dev, 2128c2ecf20Sopenharmony_ci "failed to add a control char device to the system\n"); 2138c2ecf20Sopenharmony_ci goto delete_cdev_device; 2148c2ecf20Sopenharmony_ci } 2158c2ecf20Sopenharmony_ci 2168c2ecf20Sopenharmony_ci /* hl_sysfs_init() must be done after adding the device to the system */ 2178c2ecf20Sopenharmony_ci rc = hl_sysfs_init(hdev); 2188c2ecf20Sopenharmony_ci if (rc) { 2198c2ecf20Sopenharmony_ci dev_err(hdev->dev, "failed to initialize sysfs\n"); 2208c2ecf20Sopenharmony_ci goto delete_ctrl_cdev_device; 2218c2ecf20Sopenharmony_ci } 2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_ci hdev->cdev_sysfs_created = true; 2248c2ecf20Sopenharmony_ci 2258c2ecf20Sopenharmony_ci return 0; 2268c2ecf20Sopenharmony_ci 2278c2ecf20Sopenharmony_cidelete_ctrl_cdev_device: 2288c2ecf20Sopenharmony_ci cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); 2298c2ecf20Sopenharmony_cidelete_cdev_device: 2308c2ecf20Sopenharmony_ci cdev_device_del(&hdev->cdev, hdev->dev); 2318c2ecf20Sopenharmony_ci return rc; 2328c2ecf20Sopenharmony_ci} 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_cistatic void device_cdev_sysfs_del(struct hl_device *hdev) 2358c2ecf20Sopenharmony_ci{ 2368c2ecf20Sopenharmony_ci if (!hdev->cdev_sysfs_created) 2378c2ecf20Sopenharmony_ci goto put_devices; 2388c2ecf20Sopenharmony_ci 2398c2ecf20Sopenharmony_ci hl_sysfs_fini(hdev); 2408c2ecf20Sopenharmony_ci cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); 2418c2ecf20Sopenharmony_ci cdev_device_del(&hdev->cdev, hdev->dev); 2428c2ecf20Sopenharmony_ci 2438c2ecf20Sopenharmony_ciput_devices: 2448c2ecf20Sopenharmony_ci put_device(hdev->dev); 2458c2ecf20Sopenharmony_ci put_device(hdev->dev_ctrl); 2468c2ecf20Sopenharmony_ci} 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci/* 2498c2ecf20Sopenharmony_ci * device_early_init - do some early initialization for the habanalabs device 2508c2ecf20Sopenharmony_ci * 2518c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 2528c2ecf20Sopenharmony_ci * 2538c2ecf20Sopenharmony_ci * Install the relevant function pointers and call the early_init function, 2548c2ecf20Sopenharmony_ci * if such a function exists 2558c2ecf20Sopenharmony_ci */ 2568c2ecf20Sopenharmony_cistatic int device_early_init(struct hl_device *hdev) 2578c2ecf20Sopenharmony_ci{ 2588c2ecf20Sopenharmony_ci int i, rc; 2598c2ecf20Sopenharmony_ci char workq_name[32]; 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci switch (hdev->asic_type) { 2628c2ecf20Sopenharmony_ci case ASIC_GOYA: 2638c2ecf20Sopenharmony_ci goya_set_asic_funcs(hdev); 2648c2ecf20Sopenharmony_ci strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name)); 2658c2ecf20Sopenharmony_ci break; 2668c2ecf20Sopenharmony_ci case ASIC_GAUDI: 2678c2ecf20Sopenharmony_ci gaudi_set_asic_funcs(hdev); 2688c2ecf20Sopenharmony_ci sprintf(hdev->asic_name, "GAUDI"); 2698c2ecf20Sopenharmony_ci break; 2708c2ecf20Sopenharmony_ci default: 2718c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Unrecognized ASIC type %d\n", 2728c2ecf20Sopenharmony_ci hdev->asic_type); 2738c2ecf20Sopenharmony_ci return -EINVAL; 2748c2ecf20Sopenharmony_ci } 2758c2ecf20Sopenharmony_ci 2768c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->early_init(hdev); 2778c2ecf20Sopenharmony_ci if (rc) 2788c2ecf20Sopenharmony_ci return rc; 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci rc = hl_asid_init(hdev); 2818c2ecf20Sopenharmony_ci if (rc) 2828c2ecf20Sopenharmony_ci goto early_fini; 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci if (hdev->asic_prop.completion_queues_count) { 2858c2ecf20Sopenharmony_ci hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, 2868c2ecf20Sopenharmony_ci sizeof(*hdev->cq_wq), 2878c2ecf20Sopenharmony_ci GFP_ATOMIC); 2888c2ecf20Sopenharmony_ci if (!hdev->cq_wq) { 2898c2ecf20Sopenharmony_ci rc = -ENOMEM; 2908c2ecf20Sopenharmony_ci goto asid_fini; 2918c2ecf20Sopenharmony_ci } 2928c2ecf20Sopenharmony_ci } 2938c2ecf20Sopenharmony_ci 2948c2ecf20Sopenharmony_ci for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { 2958c2ecf20Sopenharmony_ci snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i); 2968c2ecf20Sopenharmony_ci hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); 2978c2ecf20Sopenharmony_ci if (hdev->cq_wq[i] == NULL) { 2988c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); 2998c2ecf20Sopenharmony_ci rc = -ENOMEM; 3008c2ecf20Sopenharmony_ci goto free_cq_wq; 3018c2ecf20Sopenharmony_ci } 3028c2ecf20Sopenharmony_ci } 3038c2ecf20Sopenharmony_ci 3048c2ecf20Sopenharmony_ci hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); 3058c2ecf20Sopenharmony_ci if (hdev->eq_wq == NULL) { 3068c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); 3078c2ecf20Sopenharmony_ci rc = -ENOMEM; 3088c2ecf20Sopenharmony_ci goto free_cq_wq; 3098c2ecf20Sopenharmony_ci } 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ci hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), 3128c2ecf20Sopenharmony_ci GFP_KERNEL); 3138c2ecf20Sopenharmony_ci if (!hdev->hl_chip_info) { 3148c2ecf20Sopenharmony_ci rc = -ENOMEM; 3158c2ecf20Sopenharmony_ci goto free_eq_wq; 3168c2ecf20Sopenharmony_ci } 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_ci hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE, 3198c2ecf20Sopenharmony_ci sizeof(struct hl_device_idle_busy_ts), 3208c2ecf20Sopenharmony_ci (GFP_KERNEL | __GFP_ZERO)); 3218c2ecf20Sopenharmony_ci if (!hdev->idle_busy_ts_arr) { 3228c2ecf20Sopenharmony_ci rc = -ENOMEM; 3238c2ecf20Sopenharmony_ci goto free_chip_info; 3248c2ecf20Sopenharmony_ci } 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci rc = hl_mmu_if_set_funcs(hdev); 3278c2ecf20Sopenharmony_ci if (rc) 3288c2ecf20Sopenharmony_ci goto free_idle_busy_ts_arr; 3298c2ecf20Sopenharmony_ci 3308c2ecf20Sopenharmony_ci hl_cb_mgr_init(&hdev->kernel_cb_mgr); 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_ci mutex_init(&hdev->send_cpu_message_lock); 3338c2ecf20Sopenharmony_ci mutex_init(&hdev->debug_lock); 3348c2ecf20Sopenharmony_ci mutex_init(&hdev->mmu_cache_lock); 3358c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&hdev->hw_queues_mirror_list); 3368c2ecf20Sopenharmony_ci spin_lock_init(&hdev->hw_queues_mirror_lock); 3378c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&hdev->fpriv_list); 3388c2ecf20Sopenharmony_ci mutex_init(&hdev->fpriv_list_lock); 3398c2ecf20Sopenharmony_ci atomic_set(&hdev->in_reset, 0); 3408c2ecf20Sopenharmony_ci 3418c2ecf20Sopenharmony_ci return 0; 3428c2ecf20Sopenharmony_ci 3438c2ecf20Sopenharmony_cifree_idle_busy_ts_arr: 3448c2ecf20Sopenharmony_ci kfree(hdev->idle_busy_ts_arr); 3458c2ecf20Sopenharmony_cifree_chip_info: 3468c2ecf20Sopenharmony_ci kfree(hdev->hl_chip_info); 3478c2ecf20Sopenharmony_cifree_eq_wq: 3488c2ecf20Sopenharmony_ci destroy_workqueue(hdev->eq_wq); 3498c2ecf20Sopenharmony_cifree_cq_wq: 3508c2ecf20Sopenharmony_ci for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) 3518c2ecf20Sopenharmony_ci if (hdev->cq_wq[i]) 3528c2ecf20Sopenharmony_ci destroy_workqueue(hdev->cq_wq[i]); 3538c2ecf20Sopenharmony_ci kfree(hdev->cq_wq); 3548c2ecf20Sopenharmony_ciasid_fini: 3558c2ecf20Sopenharmony_ci hl_asid_fini(hdev); 3568c2ecf20Sopenharmony_ciearly_fini: 3578c2ecf20Sopenharmony_ci if (hdev->asic_funcs->early_fini) 3588c2ecf20Sopenharmony_ci hdev->asic_funcs->early_fini(hdev); 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci return rc; 3618c2ecf20Sopenharmony_ci} 3628c2ecf20Sopenharmony_ci 3638c2ecf20Sopenharmony_ci/* 3648c2ecf20Sopenharmony_ci * device_early_fini - finalize all that was done in device_early_init 3658c2ecf20Sopenharmony_ci * 3668c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 3678c2ecf20Sopenharmony_ci * 3688c2ecf20Sopenharmony_ci */ 3698c2ecf20Sopenharmony_cistatic void device_early_fini(struct hl_device *hdev) 3708c2ecf20Sopenharmony_ci{ 3718c2ecf20Sopenharmony_ci int i; 3728c2ecf20Sopenharmony_ci 3738c2ecf20Sopenharmony_ci mutex_destroy(&hdev->mmu_cache_lock); 3748c2ecf20Sopenharmony_ci mutex_destroy(&hdev->debug_lock); 3758c2ecf20Sopenharmony_ci mutex_destroy(&hdev->send_cpu_message_lock); 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci mutex_destroy(&hdev->fpriv_list_lock); 3788c2ecf20Sopenharmony_ci 3798c2ecf20Sopenharmony_ci hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr); 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci kfree(hdev->idle_busy_ts_arr); 3828c2ecf20Sopenharmony_ci kfree(hdev->hl_chip_info); 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_ci destroy_workqueue(hdev->eq_wq); 3858c2ecf20Sopenharmony_ci 3868c2ecf20Sopenharmony_ci for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) 3878c2ecf20Sopenharmony_ci destroy_workqueue(hdev->cq_wq[i]); 3888c2ecf20Sopenharmony_ci kfree(hdev->cq_wq); 3898c2ecf20Sopenharmony_ci 3908c2ecf20Sopenharmony_ci hl_asid_fini(hdev); 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci if (hdev->asic_funcs->early_fini) 3938c2ecf20Sopenharmony_ci hdev->asic_funcs->early_fini(hdev); 3948c2ecf20Sopenharmony_ci} 3958c2ecf20Sopenharmony_ci 3968c2ecf20Sopenharmony_cistatic void set_freq_to_low_job(struct work_struct *work) 3978c2ecf20Sopenharmony_ci{ 3988c2ecf20Sopenharmony_ci struct hl_device *hdev = container_of(work, struct hl_device, 3998c2ecf20Sopenharmony_ci work_freq.work); 4008c2ecf20Sopenharmony_ci 4018c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci if (!hdev->compute_ctx) 4048c2ecf20Sopenharmony_ci hl_device_set_frequency(hdev, PLL_LOW); 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 4078c2ecf20Sopenharmony_ci 4088c2ecf20Sopenharmony_ci schedule_delayed_work(&hdev->work_freq, 4098c2ecf20Sopenharmony_ci usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC)); 4108c2ecf20Sopenharmony_ci} 4118c2ecf20Sopenharmony_ci 4128c2ecf20Sopenharmony_cistatic void hl_device_heartbeat(struct work_struct *work) 4138c2ecf20Sopenharmony_ci{ 4148c2ecf20Sopenharmony_ci struct hl_device *hdev = container_of(work, struct hl_device, 4158c2ecf20Sopenharmony_ci work_heartbeat.work); 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_ci if (hl_device_disabled_or_in_reset(hdev)) 4188c2ecf20Sopenharmony_ci goto reschedule; 4198c2ecf20Sopenharmony_ci 4208c2ecf20Sopenharmony_ci if (!hdev->asic_funcs->send_heartbeat(hdev)) 4218c2ecf20Sopenharmony_ci goto reschedule; 4228c2ecf20Sopenharmony_ci 4238c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Device heartbeat failed!\n"); 4248c2ecf20Sopenharmony_ci hl_device_reset(hdev, true, false); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci return; 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_cireschedule: 4298c2ecf20Sopenharmony_ci schedule_delayed_work(&hdev->work_heartbeat, 4308c2ecf20Sopenharmony_ci usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); 4318c2ecf20Sopenharmony_ci} 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci/* 4348c2ecf20Sopenharmony_ci * device_late_init - do late stuff initialization for the habanalabs device 4358c2ecf20Sopenharmony_ci * 4368c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 4378c2ecf20Sopenharmony_ci * 4388c2ecf20Sopenharmony_ci * Do stuff that either needs the device H/W queues to be active or needs 4398c2ecf20Sopenharmony_ci * to happen after all the rest of the initialization is finished 4408c2ecf20Sopenharmony_ci */ 4418c2ecf20Sopenharmony_cistatic int device_late_init(struct hl_device *hdev) 4428c2ecf20Sopenharmony_ci{ 4438c2ecf20Sopenharmony_ci int rc; 4448c2ecf20Sopenharmony_ci 4458c2ecf20Sopenharmony_ci if (hdev->asic_funcs->late_init) { 4468c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->late_init(hdev); 4478c2ecf20Sopenharmony_ci if (rc) { 4488c2ecf20Sopenharmony_ci dev_err(hdev->dev, 4498c2ecf20Sopenharmony_ci "failed late initialization for the H/W\n"); 4508c2ecf20Sopenharmony_ci return rc; 4518c2ecf20Sopenharmony_ci } 4528c2ecf20Sopenharmony_ci } 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci hdev->high_pll = hdev->asic_prop.high_pll; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci /* force setting to low frequency */ 4578c2ecf20Sopenharmony_ci hdev->curr_pll_profile = PLL_LOW; 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_ci if (hdev->pm_mng_profile == PM_AUTO) 4608c2ecf20Sopenharmony_ci hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW); 4618c2ecf20Sopenharmony_ci else 4628c2ecf20Sopenharmony_ci hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST); 4638c2ecf20Sopenharmony_ci 4648c2ecf20Sopenharmony_ci INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job); 4658c2ecf20Sopenharmony_ci schedule_delayed_work(&hdev->work_freq, 4668c2ecf20Sopenharmony_ci usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC)); 4678c2ecf20Sopenharmony_ci 4688c2ecf20Sopenharmony_ci if (hdev->heartbeat) { 4698c2ecf20Sopenharmony_ci INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); 4708c2ecf20Sopenharmony_ci schedule_delayed_work(&hdev->work_heartbeat, 4718c2ecf20Sopenharmony_ci usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); 4728c2ecf20Sopenharmony_ci } 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci hdev->late_init_done = true; 4758c2ecf20Sopenharmony_ci 4768c2ecf20Sopenharmony_ci return 0; 4778c2ecf20Sopenharmony_ci} 4788c2ecf20Sopenharmony_ci 4798c2ecf20Sopenharmony_ci/* 4808c2ecf20Sopenharmony_ci * device_late_fini - finalize all that was done in device_late_init 4818c2ecf20Sopenharmony_ci * 4828c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 4838c2ecf20Sopenharmony_ci * 4848c2ecf20Sopenharmony_ci */ 4858c2ecf20Sopenharmony_cistatic void device_late_fini(struct hl_device *hdev) 4868c2ecf20Sopenharmony_ci{ 4878c2ecf20Sopenharmony_ci if (!hdev->late_init_done) 4888c2ecf20Sopenharmony_ci return; 4898c2ecf20Sopenharmony_ci 4908c2ecf20Sopenharmony_ci cancel_delayed_work_sync(&hdev->work_freq); 4918c2ecf20Sopenharmony_ci if (hdev->heartbeat) 4928c2ecf20Sopenharmony_ci cancel_delayed_work_sync(&hdev->work_heartbeat); 4938c2ecf20Sopenharmony_ci 4948c2ecf20Sopenharmony_ci if (hdev->asic_funcs->late_fini) 4958c2ecf20Sopenharmony_ci hdev->asic_funcs->late_fini(hdev); 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_ci hdev->late_init_done = false; 4988c2ecf20Sopenharmony_ci} 4998c2ecf20Sopenharmony_ci 5008c2ecf20Sopenharmony_ciuint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms) 5018c2ecf20Sopenharmony_ci{ 5028c2ecf20Sopenharmony_ci struct hl_device_idle_busy_ts *ts; 5038c2ecf20Sopenharmony_ci ktime_t zero_ktime, curr = ktime_get(); 5048c2ecf20Sopenharmony_ci u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx; 5058c2ecf20Sopenharmony_ci s64 period_us, last_start_us, last_end_us, last_busy_time_us, 5068c2ecf20Sopenharmony_ci total_busy_time_us = 0, total_busy_time_ms; 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci zero_ktime = ktime_set(0, 0); 5098c2ecf20Sopenharmony_ci period_us = period_ms * USEC_PER_MSEC; 5108c2ecf20Sopenharmony_ci ts = &hdev->idle_busy_ts_arr[last_index]; 5118c2ecf20Sopenharmony_ci 5128c2ecf20Sopenharmony_ci /* check case that device is currently in idle */ 5138c2ecf20Sopenharmony_ci if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) && 5148c2ecf20Sopenharmony_ci !ktime_compare(ts->idle_to_busy_ts, zero_ktime)) { 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci last_index--; 5178c2ecf20Sopenharmony_ci /* Handle case idle_busy_ts_idx was 0 */ 5188c2ecf20Sopenharmony_ci if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE) 5198c2ecf20Sopenharmony_ci last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1; 5208c2ecf20Sopenharmony_ci 5218c2ecf20Sopenharmony_ci ts = &hdev->idle_busy_ts_arr[last_index]; 5228c2ecf20Sopenharmony_ci } 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) { 5258c2ecf20Sopenharmony_ci /* Check if we are in last sample case. i.e. if the sample 5268c2ecf20Sopenharmony_ci * begun before the sampling period. This could be a real 5278c2ecf20Sopenharmony_ci * sample or 0 so need to handle both cases 5288c2ecf20Sopenharmony_ci */ 5298c2ecf20Sopenharmony_ci last_start_us = ktime_to_us( 5308c2ecf20Sopenharmony_ci ktime_sub(curr, ts->idle_to_busy_ts)); 5318c2ecf20Sopenharmony_ci 5328c2ecf20Sopenharmony_ci if (last_start_us > period_us) { 5338c2ecf20Sopenharmony_ci 5348c2ecf20Sopenharmony_ci /* First check two cases: 5358c2ecf20Sopenharmony_ci * 1. If the device is currently busy 5368c2ecf20Sopenharmony_ci * 2. If the device was idle during the whole sampling 5378c2ecf20Sopenharmony_ci * period 5388c2ecf20Sopenharmony_ci */ 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) { 5418c2ecf20Sopenharmony_ci /* Check if the device is currently busy */ 5428c2ecf20Sopenharmony_ci if (ktime_compare(ts->idle_to_busy_ts, 5438c2ecf20Sopenharmony_ci zero_ktime)) 5448c2ecf20Sopenharmony_ci return 100; 5458c2ecf20Sopenharmony_ci 5468c2ecf20Sopenharmony_ci /* We either didn't have any activity or we 5478c2ecf20Sopenharmony_ci * reached an entry which is 0. Either way, 5488c2ecf20Sopenharmony_ci * exit and return what was accumulated so far 5498c2ecf20Sopenharmony_ci */ 5508c2ecf20Sopenharmony_ci break; 5518c2ecf20Sopenharmony_ci } 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci /* If sample has finished, check it is relevant */ 5548c2ecf20Sopenharmony_ci last_end_us = ktime_to_us( 5558c2ecf20Sopenharmony_ci ktime_sub(curr, ts->busy_to_idle_ts)); 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci if (last_end_us > period_us) 5588c2ecf20Sopenharmony_ci break; 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci /* It is relevant so add it but with adjustment */ 5618c2ecf20Sopenharmony_ci last_busy_time_us = ktime_to_us( 5628c2ecf20Sopenharmony_ci ktime_sub(ts->busy_to_idle_ts, 5638c2ecf20Sopenharmony_ci ts->idle_to_busy_ts)); 5648c2ecf20Sopenharmony_ci total_busy_time_us += last_busy_time_us - 5658c2ecf20Sopenharmony_ci (last_start_us - period_us); 5668c2ecf20Sopenharmony_ci break; 5678c2ecf20Sopenharmony_ci } 5688c2ecf20Sopenharmony_ci 5698c2ecf20Sopenharmony_ci /* Check if the sample is finished or still open */ 5708c2ecf20Sopenharmony_ci if (ktime_compare(ts->busy_to_idle_ts, zero_ktime)) 5718c2ecf20Sopenharmony_ci last_busy_time_us = ktime_to_us( 5728c2ecf20Sopenharmony_ci ktime_sub(ts->busy_to_idle_ts, 5738c2ecf20Sopenharmony_ci ts->idle_to_busy_ts)); 5748c2ecf20Sopenharmony_ci else 5758c2ecf20Sopenharmony_ci last_busy_time_us = ktime_to_us( 5768c2ecf20Sopenharmony_ci ktime_sub(curr, ts->idle_to_busy_ts)); 5778c2ecf20Sopenharmony_ci 5788c2ecf20Sopenharmony_ci total_busy_time_us += last_busy_time_us; 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_ci last_index--; 5818c2ecf20Sopenharmony_ci /* Handle case idle_busy_ts_idx was 0 */ 5828c2ecf20Sopenharmony_ci if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE) 5838c2ecf20Sopenharmony_ci last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1; 5848c2ecf20Sopenharmony_ci 5858c2ecf20Sopenharmony_ci ts = &hdev->idle_busy_ts_arr[last_index]; 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_ci overlap_cnt++; 5888c2ecf20Sopenharmony_ci } 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us, 5918c2ecf20Sopenharmony_ci USEC_PER_MSEC); 5928c2ecf20Sopenharmony_ci 5938c2ecf20Sopenharmony_ci return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms); 5948c2ecf20Sopenharmony_ci} 5958c2ecf20Sopenharmony_ci 5968c2ecf20Sopenharmony_ci/* 5978c2ecf20Sopenharmony_ci * hl_device_set_frequency - set the frequency of the device 5988c2ecf20Sopenharmony_ci * 5998c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 6008c2ecf20Sopenharmony_ci * @freq: the new frequency value 6018c2ecf20Sopenharmony_ci * 6028c2ecf20Sopenharmony_ci * Change the frequency if needed. This function has no protection against 6038c2ecf20Sopenharmony_ci * concurrency, therefore it is assumed that the calling function has protected 6048c2ecf20Sopenharmony_ci * itself against the case of calling this function from multiple threads with 6058c2ecf20Sopenharmony_ci * different values 6068c2ecf20Sopenharmony_ci * 6078c2ecf20Sopenharmony_ci * Returns 0 if no change was done, otherwise returns 1 6088c2ecf20Sopenharmony_ci */ 6098c2ecf20Sopenharmony_ciint hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq) 6108c2ecf20Sopenharmony_ci{ 6118c2ecf20Sopenharmony_ci if ((hdev->pm_mng_profile == PM_MANUAL) || 6128c2ecf20Sopenharmony_ci (hdev->curr_pll_profile == freq)) 6138c2ecf20Sopenharmony_ci return 0; 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_ci dev_dbg(hdev->dev, "Changing device frequency to %s\n", 6168c2ecf20Sopenharmony_ci freq == PLL_HIGH ? "high" : "low"); 6178c2ecf20Sopenharmony_ci 6188c2ecf20Sopenharmony_ci hdev->asic_funcs->set_pll_profile(hdev, freq); 6198c2ecf20Sopenharmony_ci 6208c2ecf20Sopenharmony_ci hdev->curr_pll_profile = freq; 6218c2ecf20Sopenharmony_ci 6228c2ecf20Sopenharmony_ci return 1; 6238c2ecf20Sopenharmony_ci} 6248c2ecf20Sopenharmony_ci 6258c2ecf20Sopenharmony_ciint hl_device_set_debug_mode(struct hl_device *hdev, bool enable) 6268c2ecf20Sopenharmony_ci{ 6278c2ecf20Sopenharmony_ci int rc = 0; 6288c2ecf20Sopenharmony_ci 6298c2ecf20Sopenharmony_ci mutex_lock(&hdev->debug_lock); 6308c2ecf20Sopenharmony_ci 6318c2ecf20Sopenharmony_ci if (!enable) { 6328c2ecf20Sopenharmony_ci if (!hdev->in_debug) { 6338c2ecf20Sopenharmony_ci dev_err(hdev->dev, 6348c2ecf20Sopenharmony_ci "Failed to disable debug mode because device was not in debug mode\n"); 6358c2ecf20Sopenharmony_ci rc = -EFAULT; 6368c2ecf20Sopenharmony_ci goto out; 6378c2ecf20Sopenharmony_ci } 6388c2ecf20Sopenharmony_ci 6398c2ecf20Sopenharmony_ci if (!hdev->hard_reset_pending) 6408c2ecf20Sopenharmony_ci hdev->asic_funcs->halt_coresight(hdev); 6418c2ecf20Sopenharmony_ci 6428c2ecf20Sopenharmony_ci hdev->in_debug = 0; 6438c2ecf20Sopenharmony_ci 6448c2ecf20Sopenharmony_ci if (!hdev->hard_reset_pending) 6458c2ecf20Sopenharmony_ci hdev->asic_funcs->set_clock_gating(hdev); 6468c2ecf20Sopenharmony_ci 6478c2ecf20Sopenharmony_ci goto out; 6488c2ecf20Sopenharmony_ci } 6498c2ecf20Sopenharmony_ci 6508c2ecf20Sopenharmony_ci if (hdev->in_debug) { 6518c2ecf20Sopenharmony_ci dev_err(hdev->dev, 6528c2ecf20Sopenharmony_ci "Failed to enable debug mode because device is already in debug mode\n"); 6538c2ecf20Sopenharmony_ci rc = -EFAULT; 6548c2ecf20Sopenharmony_ci goto out; 6558c2ecf20Sopenharmony_ci } 6568c2ecf20Sopenharmony_ci 6578c2ecf20Sopenharmony_ci hdev->asic_funcs->disable_clock_gating(hdev); 6588c2ecf20Sopenharmony_ci hdev->in_debug = 1; 6598c2ecf20Sopenharmony_ci 6608c2ecf20Sopenharmony_ciout: 6618c2ecf20Sopenharmony_ci mutex_unlock(&hdev->debug_lock); 6628c2ecf20Sopenharmony_ci 6638c2ecf20Sopenharmony_ci return rc; 6648c2ecf20Sopenharmony_ci} 6658c2ecf20Sopenharmony_ci 6668c2ecf20Sopenharmony_ci/* 6678c2ecf20Sopenharmony_ci * hl_device_suspend - initiate device suspend 6688c2ecf20Sopenharmony_ci * 6698c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 6708c2ecf20Sopenharmony_ci * 6718c2ecf20Sopenharmony_ci * Puts the hw in the suspend state (all asics). 6728c2ecf20Sopenharmony_ci * Returns 0 for success or an error on failure. 6738c2ecf20Sopenharmony_ci * Called at driver suspend. 6748c2ecf20Sopenharmony_ci */ 6758c2ecf20Sopenharmony_ciint hl_device_suspend(struct hl_device *hdev) 6768c2ecf20Sopenharmony_ci{ 6778c2ecf20Sopenharmony_ci int rc; 6788c2ecf20Sopenharmony_ci 6798c2ecf20Sopenharmony_ci pci_save_state(hdev->pdev); 6808c2ecf20Sopenharmony_ci 6818c2ecf20Sopenharmony_ci /* Block future CS/VM/JOB completion operations */ 6828c2ecf20Sopenharmony_ci rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); 6838c2ecf20Sopenharmony_ci if (rc) { 6848c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Can't suspend while in reset\n"); 6858c2ecf20Sopenharmony_ci return -EIO; 6868c2ecf20Sopenharmony_ci } 6878c2ecf20Sopenharmony_ci 6888c2ecf20Sopenharmony_ci /* This blocks all other stuff that is not blocked by in_reset */ 6898c2ecf20Sopenharmony_ci hdev->disabled = true; 6908c2ecf20Sopenharmony_ci 6918c2ecf20Sopenharmony_ci /* 6928c2ecf20Sopenharmony_ci * Flush anyone that is inside the critical section of enqueue 6938c2ecf20Sopenharmony_ci * jobs to the H/W 6948c2ecf20Sopenharmony_ci */ 6958c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_queues_lock(hdev); 6968c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_queues_unlock(hdev); 6978c2ecf20Sopenharmony_ci 6988c2ecf20Sopenharmony_ci /* Flush processes that are sending message to CPU */ 6998c2ecf20Sopenharmony_ci mutex_lock(&hdev->send_cpu_message_lock); 7008c2ecf20Sopenharmony_ci mutex_unlock(&hdev->send_cpu_message_lock); 7018c2ecf20Sopenharmony_ci 7028c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->suspend(hdev); 7038c2ecf20Sopenharmony_ci if (rc) 7048c2ecf20Sopenharmony_ci dev_err(hdev->dev, 7058c2ecf20Sopenharmony_ci "Failed to disable PCI access of device CPU\n"); 7068c2ecf20Sopenharmony_ci 7078c2ecf20Sopenharmony_ci /* Shut down the device */ 7088c2ecf20Sopenharmony_ci pci_disable_device(hdev->pdev); 7098c2ecf20Sopenharmony_ci pci_set_power_state(hdev->pdev, PCI_D3hot); 7108c2ecf20Sopenharmony_ci 7118c2ecf20Sopenharmony_ci return 0; 7128c2ecf20Sopenharmony_ci} 7138c2ecf20Sopenharmony_ci 7148c2ecf20Sopenharmony_ci/* 7158c2ecf20Sopenharmony_ci * hl_device_resume - initiate device resume 7168c2ecf20Sopenharmony_ci * 7178c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 7188c2ecf20Sopenharmony_ci * 7198c2ecf20Sopenharmony_ci * Bring the hw back to operating state (all asics). 7208c2ecf20Sopenharmony_ci * Returns 0 for success or an error on failure. 7218c2ecf20Sopenharmony_ci * Called at driver resume. 7228c2ecf20Sopenharmony_ci */ 7238c2ecf20Sopenharmony_ciint hl_device_resume(struct hl_device *hdev) 7248c2ecf20Sopenharmony_ci{ 7258c2ecf20Sopenharmony_ci int rc; 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_ci pci_set_power_state(hdev->pdev, PCI_D0); 7288c2ecf20Sopenharmony_ci pci_restore_state(hdev->pdev); 7298c2ecf20Sopenharmony_ci rc = pci_enable_device_mem(hdev->pdev); 7308c2ecf20Sopenharmony_ci if (rc) { 7318c2ecf20Sopenharmony_ci dev_err(hdev->dev, 7328c2ecf20Sopenharmony_ci "Failed to enable PCI device in resume\n"); 7338c2ecf20Sopenharmony_ci return rc; 7348c2ecf20Sopenharmony_ci } 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci pci_set_master(hdev->pdev); 7378c2ecf20Sopenharmony_ci 7388c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->resume(hdev); 7398c2ecf20Sopenharmony_ci if (rc) { 7408c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to resume device after suspend\n"); 7418c2ecf20Sopenharmony_ci goto disable_device; 7428c2ecf20Sopenharmony_ci } 7438c2ecf20Sopenharmony_ci 7448c2ecf20Sopenharmony_ci 7458c2ecf20Sopenharmony_ci hdev->disabled = false; 7468c2ecf20Sopenharmony_ci atomic_set(&hdev->in_reset, 0); 7478c2ecf20Sopenharmony_ci 7488c2ecf20Sopenharmony_ci rc = hl_device_reset(hdev, true, false); 7498c2ecf20Sopenharmony_ci if (rc) { 7508c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to reset device during resume\n"); 7518c2ecf20Sopenharmony_ci goto disable_device; 7528c2ecf20Sopenharmony_ci } 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci return 0; 7558c2ecf20Sopenharmony_ci 7568c2ecf20Sopenharmony_cidisable_device: 7578c2ecf20Sopenharmony_ci pci_clear_master(hdev->pdev); 7588c2ecf20Sopenharmony_ci pci_disable_device(hdev->pdev); 7598c2ecf20Sopenharmony_ci 7608c2ecf20Sopenharmony_ci return rc; 7618c2ecf20Sopenharmony_ci} 7628c2ecf20Sopenharmony_ci 7638c2ecf20Sopenharmony_cistatic int device_kill_open_processes(struct hl_device *hdev) 7648c2ecf20Sopenharmony_ci{ 7658c2ecf20Sopenharmony_ci u16 pending_total, pending_cnt; 7668c2ecf20Sopenharmony_ci struct hl_fpriv *hpriv; 7678c2ecf20Sopenharmony_ci struct task_struct *task = NULL; 7688c2ecf20Sopenharmony_ci 7698c2ecf20Sopenharmony_ci if (hdev->pldm) 7708c2ecf20Sopenharmony_ci pending_total = HL_PLDM_PENDING_RESET_PER_SEC; 7718c2ecf20Sopenharmony_ci else 7728c2ecf20Sopenharmony_ci pending_total = HL_PENDING_RESET_PER_SEC; 7738c2ecf20Sopenharmony_ci 7748c2ecf20Sopenharmony_ci /* Giving time for user to close FD, and for processes that are inside 7758c2ecf20Sopenharmony_ci * hl_device_open to finish 7768c2ecf20Sopenharmony_ci */ 7778c2ecf20Sopenharmony_ci if (!list_empty(&hdev->fpriv_list)) 7788c2ecf20Sopenharmony_ci ssleep(1); 7798c2ecf20Sopenharmony_ci 7808c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 7818c2ecf20Sopenharmony_ci 7828c2ecf20Sopenharmony_ci /* This section must be protected because we are dereferencing 7838c2ecf20Sopenharmony_ci * pointers that are freed if the process exits 7848c2ecf20Sopenharmony_ci */ 7858c2ecf20Sopenharmony_ci list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) { 7868c2ecf20Sopenharmony_ci task = get_pid_task(hpriv->taskpid, PIDTYPE_PID); 7878c2ecf20Sopenharmony_ci if (task) { 7888c2ecf20Sopenharmony_ci dev_info(hdev->dev, "Killing user process pid=%d\n", 7898c2ecf20Sopenharmony_ci task_pid_nr(task)); 7908c2ecf20Sopenharmony_ci send_sig(SIGKILL, task, 1); 7918c2ecf20Sopenharmony_ci usleep_range(1000, 10000); 7928c2ecf20Sopenharmony_ci 7938c2ecf20Sopenharmony_ci put_task_struct(task); 7948c2ecf20Sopenharmony_ci } 7958c2ecf20Sopenharmony_ci } 7968c2ecf20Sopenharmony_ci 7978c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 7988c2ecf20Sopenharmony_ci 7998c2ecf20Sopenharmony_ci /* We killed the open users, but because the driver cleans up after the 8008c2ecf20Sopenharmony_ci * user contexts are closed (e.g. mmu mappings), we need to wait again 8018c2ecf20Sopenharmony_ci * to make sure the cleaning phase is finished before continuing with 8028c2ecf20Sopenharmony_ci * the reset 8038c2ecf20Sopenharmony_ci */ 8048c2ecf20Sopenharmony_ci 8058c2ecf20Sopenharmony_ci pending_cnt = pending_total; 8068c2ecf20Sopenharmony_ci 8078c2ecf20Sopenharmony_ci while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) { 8088c2ecf20Sopenharmony_ci dev_info(hdev->dev, 8098c2ecf20Sopenharmony_ci "Waiting for all unmap operations to finish before hard reset\n"); 8108c2ecf20Sopenharmony_ci 8118c2ecf20Sopenharmony_ci pending_cnt--; 8128c2ecf20Sopenharmony_ci 8138c2ecf20Sopenharmony_ci ssleep(1); 8148c2ecf20Sopenharmony_ci } 8158c2ecf20Sopenharmony_ci 8168c2ecf20Sopenharmony_ci return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY; 8178c2ecf20Sopenharmony_ci} 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_cistatic void device_hard_reset_pending(struct work_struct *work) 8208c2ecf20Sopenharmony_ci{ 8218c2ecf20Sopenharmony_ci struct hl_device_reset_work *device_reset_work = 8228c2ecf20Sopenharmony_ci container_of(work, struct hl_device_reset_work, reset_work); 8238c2ecf20Sopenharmony_ci struct hl_device *hdev = device_reset_work->hdev; 8248c2ecf20Sopenharmony_ci 8258c2ecf20Sopenharmony_ci hl_device_reset(hdev, true, true); 8268c2ecf20Sopenharmony_ci 8278c2ecf20Sopenharmony_ci kfree(device_reset_work); 8288c2ecf20Sopenharmony_ci} 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci/* 8318c2ecf20Sopenharmony_ci * hl_device_reset - reset the device 8328c2ecf20Sopenharmony_ci * 8338c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 8348c2ecf20Sopenharmony_ci * @hard_reset: should we do hard reset to all engines or just reset the 8358c2ecf20Sopenharmony_ci * compute/dma engines 8368c2ecf20Sopenharmony_ci * @from_hard_reset_thread: is the caller the hard-reset thread 8378c2ecf20Sopenharmony_ci * 8388c2ecf20Sopenharmony_ci * Block future CS and wait for pending CS to be enqueued 8398c2ecf20Sopenharmony_ci * Call ASIC H/W fini 8408c2ecf20Sopenharmony_ci * Flush all completions 8418c2ecf20Sopenharmony_ci * Re-initialize all internal data structures 8428c2ecf20Sopenharmony_ci * Call ASIC H/W init, late_init 8438c2ecf20Sopenharmony_ci * Test queues 8448c2ecf20Sopenharmony_ci * Enable device 8458c2ecf20Sopenharmony_ci * 8468c2ecf20Sopenharmony_ci * Returns 0 for success or an error on failure. 8478c2ecf20Sopenharmony_ci */ 8488c2ecf20Sopenharmony_ciint hl_device_reset(struct hl_device *hdev, bool hard_reset, 8498c2ecf20Sopenharmony_ci bool from_hard_reset_thread) 8508c2ecf20Sopenharmony_ci{ 8518c2ecf20Sopenharmony_ci int i, rc; 8528c2ecf20Sopenharmony_ci 8538c2ecf20Sopenharmony_ci if (!hdev->init_done) { 8548c2ecf20Sopenharmony_ci dev_err(hdev->dev, 8558c2ecf20Sopenharmony_ci "Can't reset before initialization is done\n"); 8568c2ecf20Sopenharmony_ci return 0; 8578c2ecf20Sopenharmony_ci } 8588c2ecf20Sopenharmony_ci 8598c2ecf20Sopenharmony_ci if ((!hard_reset) && (!hdev->supports_soft_reset)) { 8608c2ecf20Sopenharmony_ci dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n"); 8618c2ecf20Sopenharmony_ci hard_reset = true; 8628c2ecf20Sopenharmony_ci } 8638c2ecf20Sopenharmony_ci 8648c2ecf20Sopenharmony_ci /* 8658c2ecf20Sopenharmony_ci * Prevent concurrency in this function - only one reset should be 8668c2ecf20Sopenharmony_ci * done at any given time. Only need to perform this if we didn't 8678c2ecf20Sopenharmony_ci * get from the dedicated hard reset thread 8688c2ecf20Sopenharmony_ci */ 8698c2ecf20Sopenharmony_ci if (!from_hard_reset_thread) { 8708c2ecf20Sopenharmony_ci /* Block future CS/VM/JOB completion operations */ 8718c2ecf20Sopenharmony_ci rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); 8728c2ecf20Sopenharmony_ci if (rc) 8738c2ecf20Sopenharmony_ci return 0; 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_ci if (hard_reset) { 8768c2ecf20Sopenharmony_ci /* Disable PCI access from device F/W so he won't send 8778c2ecf20Sopenharmony_ci * us additional interrupts. We disable MSI/MSI-X at 8788c2ecf20Sopenharmony_ci * the halt_engines function and we can't have the F/W 8798c2ecf20Sopenharmony_ci * sending us interrupts after that. We need to disable 8808c2ecf20Sopenharmony_ci * the access here because if the device is marked 8818c2ecf20Sopenharmony_ci * disable, the message won't be send. Also, in case 8828c2ecf20Sopenharmony_ci * of heartbeat, the device CPU is marked as disable 8838c2ecf20Sopenharmony_ci * so this message won't be sent 8848c2ecf20Sopenharmony_ci */ 8858c2ecf20Sopenharmony_ci if (hl_fw_send_pci_access_msg(hdev, 8868c2ecf20Sopenharmony_ci CPUCP_PACKET_DISABLE_PCI_ACCESS)) 8878c2ecf20Sopenharmony_ci dev_warn(hdev->dev, 8888c2ecf20Sopenharmony_ci "Failed to disable PCI access by F/W\n"); 8898c2ecf20Sopenharmony_ci } 8908c2ecf20Sopenharmony_ci 8918c2ecf20Sopenharmony_ci /* This also blocks future CS/VM/JOB completion operations */ 8928c2ecf20Sopenharmony_ci hdev->disabled = true; 8938c2ecf20Sopenharmony_ci 8948c2ecf20Sopenharmony_ci /* Flush anyone that is inside the critical section of enqueue 8958c2ecf20Sopenharmony_ci * jobs to the H/W 8968c2ecf20Sopenharmony_ci */ 8978c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_queues_lock(hdev); 8988c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_queues_unlock(hdev); 8998c2ecf20Sopenharmony_ci 9008c2ecf20Sopenharmony_ci /* Flush anyone that is inside device open */ 9018c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 9028c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 9038c2ecf20Sopenharmony_ci 9048c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Going to RESET device!\n"); 9058c2ecf20Sopenharmony_ci } 9068c2ecf20Sopenharmony_ci 9078c2ecf20Sopenharmony_ciagain: 9088c2ecf20Sopenharmony_ci if ((hard_reset) && (!from_hard_reset_thread)) { 9098c2ecf20Sopenharmony_ci struct hl_device_reset_work *device_reset_work; 9108c2ecf20Sopenharmony_ci 9118c2ecf20Sopenharmony_ci hdev->hard_reset_pending = true; 9128c2ecf20Sopenharmony_ci 9138c2ecf20Sopenharmony_ci device_reset_work = kzalloc(sizeof(*device_reset_work), 9148c2ecf20Sopenharmony_ci GFP_ATOMIC); 9158c2ecf20Sopenharmony_ci if (!device_reset_work) { 9168c2ecf20Sopenharmony_ci rc = -ENOMEM; 9178c2ecf20Sopenharmony_ci goto out_err; 9188c2ecf20Sopenharmony_ci } 9198c2ecf20Sopenharmony_ci 9208c2ecf20Sopenharmony_ci /* 9218c2ecf20Sopenharmony_ci * Because the reset function can't run from interrupt or 9228c2ecf20Sopenharmony_ci * from heartbeat work, we need to call the reset function 9238c2ecf20Sopenharmony_ci * from a dedicated work 9248c2ecf20Sopenharmony_ci */ 9258c2ecf20Sopenharmony_ci INIT_WORK(&device_reset_work->reset_work, 9268c2ecf20Sopenharmony_ci device_hard_reset_pending); 9278c2ecf20Sopenharmony_ci device_reset_work->hdev = hdev; 9288c2ecf20Sopenharmony_ci schedule_work(&device_reset_work->reset_work); 9298c2ecf20Sopenharmony_ci 9308c2ecf20Sopenharmony_ci return 0; 9318c2ecf20Sopenharmony_ci } 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_ci if (hard_reset) { 9348c2ecf20Sopenharmony_ci device_late_fini(hdev); 9358c2ecf20Sopenharmony_ci 9368c2ecf20Sopenharmony_ci /* 9378c2ecf20Sopenharmony_ci * Now that the heartbeat thread is closed, flush processes 9388c2ecf20Sopenharmony_ci * which are sending messages to CPU 9398c2ecf20Sopenharmony_ci */ 9408c2ecf20Sopenharmony_ci mutex_lock(&hdev->send_cpu_message_lock); 9418c2ecf20Sopenharmony_ci mutex_unlock(&hdev->send_cpu_message_lock); 9428c2ecf20Sopenharmony_ci } 9438c2ecf20Sopenharmony_ci 9448c2ecf20Sopenharmony_ci /* 9458c2ecf20Sopenharmony_ci * Halt the engines and disable interrupts so we won't get any more 9468c2ecf20Sopenharmony_ci * completions from H/W and we won't have any accesses from the 9478c2ecf20Sopenharmony_ci * H/W to the host machine 9488c2ecf20Sopenharmony_ci */ 9498c2ecf20Sopenharmony_ci hdev->asic_funcs->halt_engines(hdev, hard_reset); 9508c2ecf20Sopenharmony_ci 9518c2ecf20Sopenharmony_ci /* Go over all the queues, release all CS and their jobs */ 9528c2ecf20Sopenharmony_ci hl_cs_rollback_all(hdev); 9538c2ecf20Sopenharmony_ci 9548c2ecf20Sopenharmony_ci if (hard_reset) { 9558c2ecf20Sopenharmony_ci /* Kill processes here after CS rollback. This is because the 9568c2ecf20Sopenharmony_ci * process can't really exit until all its CSs are done, which 9578c2ecf20Sopenharmony_ci * is what we do in cs rollback 9588c2ecf20Sopenharmony_ci */ 9598c2ecf20Sopenharmony_ci rc = device_kill_open_processes(hdev); 9608c2ecf20Sopenharmony_ci if (rc) { 9618c2ecf20Sopenharmony_ci dev_crit(hdev->dev, 9628c2ecf20Sopenharmony_ci "Failed to kill all open processes, stopping hard reset\n"); 9638c2ecf20Sopenharmony_ci goto out_err; 9648c2ecf20Sopenharmony_ci } 9658c2ecf20Sopenharmony_ci 9668c2ecf20Sopenharmony_ci /* Flush the Event queue workers to make sure no other thread is 9678c2ecf20Sopenharmony_ci * reading or writing to registers during the reset 9688c2ecf20Sopenharmony_ci */ 9698c2ecf20Sopenharmony_ci flush_workqueue(hdev->eq_wq); 9708c2ecf20Sopenharmony_ci } 9718c2ecf20Sopenharmony_ci 9728c2ecf20Sopenharmony_ci /* Reset the H/W. It will be in idle state after this returns */ 9738c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_fini(hdev, hard_reset); 9748c2ecf20Sopenharmony_ci 9758c2ecf20Sopenharmony_ci if (hard_reset) { 9768c2ecf20Sopenharmony_ci /* Release kernel context */ 9778c2ecf20Sopenharmony_ci if (hl_ctx_put(hdev->kernel_ctx) == 1) 9788c2ecf20Sopenharmony_ci hdev->kernel_ctx = NULL; 9798c2ecf20Sopenharmony_ci hl_vm_fini(hdev); 9808c2ecf20Sopenharmony_ci hl_mmu_fini(hdev); 9818c2ecf20Sopenharmony_ci hl_eq_reset(hdev, &hdev->event_queue); 9828c2ecf20Sopenharmony_ci } 9838c2ecf20Sopenharmony_ci 9848c2ecf20Sopenharmony_ci /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */ 9858c2ecf20Sopenharmony_ci hl_hw_queue_reset(hdev, hard_reset); 9868c2ecf20Sopenharmony_ci for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) 9878c2ecf20Sopenharmony_ci hl_cq_reset(hdev, &hdev->completion_queue[i]); 9888c2ecf20Sopenharmony_ci 9898c2ecf20Sopenharmony_ci hdev->idle_busy_ts_idx = 0; 9908c2ecf20Sopenharmony_ci hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0); 9918c2ecf20Sopenharmony_ci hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0); 9928c2ecf20Sopenharmony_ci 9938c2ecf20Sopenharmony_ci if (hdev->cs_active_cnt) 9948c2ecf20Sopenharmony_ci dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n", 9958c2ecf20Sopenharmony_ci hdev->cs_active_cnt); 9968c2ecf20Sopenharmony_ci 9978c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 9988c2ecf20Sopenharmony_ci 9998c2ecf20Sopenharmony_ci /* Make sure the context switch phase will run again */ 10008c2ecf20Sopenharmony_ci if (hdev->compute_ctx) { 10018c2ecf20Sopenharmony_ci atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1); 10028c2ecf20Sopenharmony_ci hdev->compute_ctx->thread_ctx_switch_wait_token = 0; 10038c2ecf20Sopenharmony_ci } 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 10068c2ecf20Sopenharmony_ci 10078c2ecf20Sopenharmony_ci /* Finished tear-down, starting to re-initialize */ 10088c2ecf20Sopenharmony_ci 10098c2ecf20Sopenharmony_ci if (hard_reset) { 10108c2ecf20Sopenharmony_ci hdev->device_cpu_disabled = false; 10118c2ecf20Sopenharmony_ci hdev->hard_reset_pending = false; 10128c2ecf20Sopenharmony_ci 10138c2ecf20Sopenharmony_ci if (hdev->kernel_ctx) { 10148c2ecf20Sopenharmony_ci dev_crit(hdev->dev, 10158c2ecf20Sopenharmony_ci "kernel ctx was alive during hard reset, something is terribly wrong\n"); 10168c2ecf20Sopenharmony_ci rc = -EBUSY; 10178c2ecf20Sopenharmony_ci goto out_err; 10188c2ecf20Sopenharmony_ci } 10198c2ecf20Sopenharmony_ci 10208c2ecf20Sopenharmony_ci rc = hl_mmu_init(hdev); 10218c2ecf20Sopenharmony_ci if (rc) { 10228c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10238c2ecf20Sopenharmony_ci "Failed to initialize MMU S/W after hard reset\n"); 10248c2ecf20Sopenharmony_ci goto out_err; 10258c2ecf20Sopenharmony_ci } 10268c2ecf20Sopenharmony_ci 10278c2ecf20Sopenharmony_ci /* Allocate the kernel context */ 10288c2ecf20Sopenharmony_ci hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), 10298c2ecf20Sopenharmony_ci GFP_KERNEL); 10308c2ecf20Sopenharmony_ci if (!hdev->kernel_ctx) { 10318c2ecf20Sopenharmony_ci rc = -ENOMEM; 10328c2ecf20Sopenharmony_ci hl_mmu_fini(hdev); 10338c2ecf20Sopenharmony_ci goto out_err; 10348c2ecf20Sopenharmony_ci } 10358c2ecf20Sopenharmony_ci 10368c2ecf20Sopenharmony_ci hdev->compute_ctx = NULL; 10378c2ecf20Sopenharmony_ci 10388c2ecf20Sopenharmony_ci rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); 10398c2ecf20Sopenharmony_ci if (rc) { 10408c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10418c2ecf20Sopenharmony_ci "failed to init kernel ctx in hard reset\n"); 10428c2ecf20Sopenharmony_ci kfree(hdev->kernel_ctx); 10438c2ecf20Sopenharmony_ci hdev->kernel_ctx = NULL; 10448c2ecf20Sopenharmony_ci hl_mmu_fini(hdev); 10458c2ecf20Sopenharmony_ci goto out_err; 10468c2ecf20Sopenharmony_ci } 10478c2ecf20Sopenharmony_ci } 10488c2ecf20Sopenharmony_ci 10498c2ecf20Sopenharmony_ci /* Device is now enabled as part of the initialization requires 10508c2ecf20Sopenharmony_ci * communication with the device firmware to get information that 10518c2ecf20Sopenharmony_ci * is required for the initialization itself 10528c2ecf20Sopenharmony_ci */ 10538c2ecf20Sopenharmony_ci hdev->disabled = false; 10548c2ecf20Sopenharmony_ci 10558c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->hw_init(hdev); 10568c2ecf20Sopenharmony_ci if (rc) { 10578c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10588c2ecf20Sopenharmony_ci "failed to initialize the H/W after reset\n"); 10598c2ecf20Sopenharmony_ci goto out_err; 10608c2ecf20Sopenharmony_ci } 10618c2ecf20Sopenharmony_ci 10628c2ecf20Sopenharmony_ci /* Check that the communication with the device is working */ 10638c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->test_queues(hdev); 10648c2ecf20Sopenharmony_ci if (rc) { 10658c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10668c2ecf20Sopenharmony_ci "Failed to detect if device is alive after reset\n"); 10678c2ecf20Sopenharmony_ci goto out_err; 10688c2ecf20Sopenharmony_ci } 10698c2ecf20Sopenharmony_ci 10708c2ecf20Sopenharmony_ci if (hard_reset) { 10718c2ecf20Sopenharmony_ci rc = device_late_init(hdev); 10728c2ecf20Sopenharmony_ci if (rc) { 10738c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10748c2ecf20Sopenharmony_ci "Failed late init after hard reset\n"); 10758c2ecf20Sopenharmony_ci goto out_err; 10768c2ecf20Sopenharmony_ci } 10778c2ecf20Sopenharmony_ci 10788c2ecf20Sopenharmony_ci rc = hl_vm_init(hdev); 10798c2ecf20Sopenharmony_ci if (rc) { 10808c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10818c2ecf20Sopenharmony_ci "Failed to init memory module after hard reset\n"); 10828c2ecf20Sopenharmony_ci goto out_err; 10838c2ecf20Sopenharmony_ci } 10848c2ecf20Sopenharmony_ci 10858c2ecf20Sopenharmony_ci hl_set_max_power(hdev); 10868c2ecf20Sopenharmony_ci } else { 10878c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->soft_reset_late_init(hdev); 10888c2ecf20Sopenharmony_ci if (rc) { 10898c2ecf20Sopenharmony_ci dev_err(hdev->dev, 10908c2ecf20Sopenharmony_ci "Failed late init after soft reset\n"); 10918c2ecf20Sopenharmony_ci goto out_err; 10928c2ecf20Sopenharmony_ci } 10938c2ecf20Sopenharmony_ci } 10948c2ecf20Sopenharmony_ci 10958c2ecf20Sopenharmony_ci atomic_set(&hdev->in_reset, 0); 10968c2ecf20Sopenharmony_ci 10978c2ecf20Sopenharmony_ci if (hard_reset) 10988c2ecf20Sopenharmony_ci hdev->hard_reset_cnt++; 10998c2ecf20Sopenharmony_ci else 11008c2ecf20Sopenharmony_ci hdev->soft_reset_cnt++; 11018c2ecf20Sopenharmony_ci 11028c2ecf20Sopenharmony_ci dev_warn(hdev->dev, "Successfully finished resetting the device\n"); 11038c2ecf20Sopenharmony_ci 11048c2ecf20Sopenharmony_ci return 0; 11058c2ecf20Sopenharmony_ci 11068c2ecf20Sopenharmony_ciout_err: 11078c2ecf20Sopenharmony_ci hdev->disabled = true; 11088c2ecf20Sopenharmony_ci 11098c2ecf20Sopenharmony_ci if (hard_reset) { 11108c2ecf20Sopenharmony_ci dev_err(hdev->dev, 11118c2ecf20Sopenharmony_ci "Failed to reset! Device is NOT usable\n"); 11128c2ecf20Sopenharmony_ci hdev->hard_reset_cnt++; 11138c2ecf20Sopenharmony_ci } else { 11148c2ecf20Sopenharmony_ci dev_err(hdev->dev, 11158c2ecf20Sopenharmony_ci "Failed to do soft-reset, trying hard reset\n"); 11168c2ecf20Sopenharmony_ci hdev->soft_reset_cnt++; 11178c2ecf20Sopenharmony_ci hard_reset = true; 11188c2ecf20Sopenharmony_ci goto again; 11198c2ecf20Sopenharmony_ci } 11208c2ecf20Sopenharmony_ci 11218c2ecf20Sopenharmony_ci atomic_set(&hdev->in_reset, 0); 11228c2ecf20Sopenharmony_ci 11238c2ecf20Sopenharmony_ci return rc; 11248c2ecf20Sopenharmony_ci} 11258c2ecf20Sopenharmony_ci 11268c2ecf20Sopenharmony_ci/* 11278c2ecf20Sopenharmony_ci * hl_device_init - main initialization function for habanalabs device 11288c2ecf20Sopenharmony_ci * 11298c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 11308c2ecf20Sopenharmony_ci * 11318c2ecf20Sopenharmony_ci * Allocate an id for the device, do early initialization and then call the 11328c2ecf20Sopenharmony_ci * ASIC specific initialization functions. Finally, create the cdev and the 11338c2ecf20Sopenharmony_ci * Linux device to expose it to the user 11348c2ecf20Sopenharmony_ci */ 11358c2ecf20Sopenharmony_ciint hl_device_init(struct hl_device *hdev, struct class *hclass) 11368c2ecf20Sopenharmony_ci{ 11378c2ecf20Sopenharmony_ci int i, rc, cq_cnt, cq_ready_cnt; 11388c2ecf20Sopenharmony_ci char *name; 11398c2ecf20Sopenharmony_ci bool add_cdev_sysfs_on_err = false; 11408c2ecf20Sopenharmony_ci 11418c2ecf20Sopenharmony_ci name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2); 11428c2ecf20Sopenharmony_ci if (!name) { 11438c2ecf20Sopenharmony_ci rc = -ENOMEM; 11448c2ecf20Sopenharmony_ci goto out_disabled; 11458c2ecf20Sopenharmony_ci } 11468c2ecf20Sopenharmony_ci 11478c2ecf20Sopenharmony_ci /* Initialize cdev and device structures */ 11488c2ecf20Sopenharmony_ci rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name, 11498c2ecf20Sopenharmony_ci &hdev->cdev, &hdev->dev); 11508c2ecf20Sopenharmony_ci 11518c2ecf20Sopenharmony_ci kfree(name); 11528c2ecf20Sopenharmony_ci 11538c2ecf20Sopenharmony_ci if (rc) 11548c2ecf20Sopenharmony_ci goto out_disabled; 11558c2ecf20Sopenharmony_ci 11568c2ecf20Sopenharmony_ci name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2); 11578c2ecf20Sopenharmony_ci if (!name) { 11588c2ecf20Sopenharmony_ci rc = -ENOMEM; 11598c2ecf20Sopenharmony_ci goto free_dev; 11608c2ecf20Sopenharmony_ci } 11618c2ecf20Sopenharmony_ci 11628c2ecf20Sopenharmony_ci /* Initialize cdev and device structures for control device */ 11638c2ecf20Sopenharmony_ci rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops, 11648c2ecf20Sopenharmony_ci name, &hdev->cdev_ctrl, &hdev->dev_ctrl); 11658c2ecf20Sopenharmony_ci 11668c2ecf20Sopenharmony_ci kfree(name); 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci if (rc) 11698c2ecf20Sopenharmony_ci goto free_dev; 11708c2ecf20Sopenharmony_ci 11718c2ecf20Sopenharmony_ci /* Initialize ASIC function pointers and perform early init */ 11728c2ecf20Sopenharmony_ci rc = device_early_init(hdev); 11738c2ecf20Sopenharmony_ci if (rc) 11748c2ecf20Sopenharmony_ci goto free_dev_ctrl; 11758c2ecf20Sopenharmony_ci 11768c2ecf20Sopenharmony_ci /* 11778c2ecf20Sopenharmony_ci * Start calling ASIC initialization. First S/W then H/W and finally 11788c2ecf20Sopenharmony_ci * late init 11798c2ecf20Sopenharmony_ci */ 11808c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->sw_init(hdev); 11818c2ecf20Sopenharmony_ci if (rc) 11828c2ecf20Sopenharmony_ci goto early_fini; 11838c2ecf20Sopenharmony_ci 11848c2ecf20Sopenharmony_ci /* 11858c2ecf20Sopenharmony_ci * Initialize the H/W queues. Must be done before hw_init, because 11868c2ecf20Sopenharmony_ci * there the addresses of the kernel queue are being written to the 11878c2ecf20Sopenharmony_ci * registers of the device 11888c2ecf20Sopenharmony_ci */ 11898c2ecf20Sopenharmony_ci rc = hl_hw_queues_create(hdev); 11908c2ecf20Sopenharmony_ci if (rc) { 11918c2ecf20Sopenharmony_ci dev_err(hdev->dev, "failed to initialize kernel queues\n"); 11928c2ecf20Sopenharmony_ci goto sw_fini; 11938c2ecf20Sopenharmony_ci } 11948c2ecf20Sopenharmony_ci 11958c2ecf20Sopenharmony_ci cq_cnt = hdev->asic_prop.completion_queues_count; 11968c2ecf20Sopenharmony_ci 11978c2ecf20Sopenharmony_ci /* 11988c2ecf20Sopenharmony_ci * Initialize the completion queues. Must be done before hw_init, 11998c2ecf20Sopenharmony_ci * because there the addresses of the completion queues are being 12008c2ecf20Sopenharmony_ci * passed as arguments to request_irq 12018c2ecf20Sopenharmony_ci */ 12028c2ecf20Sopenharmony_ci if (cq_cnt) { 12038c2ecf20Sopenharmony_ci hdev->completion_queue = kcalloc(cq_cnt, 12048c2ecf20Sopenharmony_ci sizeof(*hdev->completion_queue), 12058c2ecf20Sopenharmony_ci GFP_KERNEL); 12068c2ecf20Sopenharmony_ci 12078c2ecf20Sopenharmony_ci if (!hdev->completion_queue) { 12088c2ecf20Sopenharmony_ci dev_err(hdev->dev, 12098c2ecf20Sopenharmony_ci "failed to allocate completion queues\n"); 12108c2ecf20Sopenharmony_ci rc = -ENOMEM; 12118c2ecf20Sopenharmony_ci goto hw_queues_destroy; 12128c2ecf20Sopenharmony_ci } 12138c2ecf20Sopenharmony_ci } 12148c2ecf20Sopenharmony_ci 12158c2ecf20Sopenharmony_ci for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) { 12168c2ecf20Sopenharmony_ci rc = hl_cq_init(hdev, &hdev->completion_queue[i], 12178c2ecf20Sopenharmony_ci hdev->asic_funcs->get_queue_id_for_cq(hdev, i)); 12188c2ecf20Sopenharmony_ci if (rc) { 12198c2ecf20Sopenharmony_ci dev_err(hdev->dev, 12208c2ecf20Sopenharmony_ci "failed to initialize completion queue\n"); 12218c2ecf20Sopenharmony_ci goto cq_fini; 12228c2ecf20Sopenharmony_ci } 12238c2ecf20Sopenharmony_ci hdev->completion_queue[i].cq_idx = i; 12248c2ecf20Sopenharmony_ci } 12258c2ecf20Sopenharmony_ci 12268c2ecf20Sopenharmony_ci /* 12278c2ecf20Sopenharmony_ci * Initialize the event queue. Must be done before hw_init, 12288c2ecf20Sopenharmony_ci * because there the address of the event queue is being 12298c2ecf20Sopenharmony_ci * passed as argument to request_irq 12308c2ecf20Sopenharmony_ci */ 12318c2ecf20Sopenharmony_ci rc = hl_eq_init(hdev, &hdev->event_queue); 12328c2ecf20Sopenharmony_ci if (rc) { 12338c2ecf20Sopenharmony_ci dev_err(hdev->dev, "failed to initialize event queue\n"); 12348c2ecf20Sopenharmony_ci goto cq_fini; 12358c2ecf20Sopenharmony_ci } 12368c2ecf20Sopenharmony_ci 12378c2ecf20Sopenharmony_ci /* MMU S/W must be initialized before kernel context is created */ 12388c2ecf20Sopenharmony_ci rc = hl_mmu_init(hdev); 12398c2ecf20Sopenharmony_ci if (rc) { 12408c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n"); 12418c2ecf20Sopenharmony_ci goto eq_fini; 12428c2ecf20Sopenharmony_ci } 12438c2ecf20Sopenharmony_ci 12448c2ecf20Sopenharmony_ci /* Allocate the kernel context */ 12458c2ecf20Sopenharmony_ci hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); 12468c2ecf20Sopenharmony_ci if (!hdev->kernel_ctx) { 12478c2ecf20Sopenharmony_ci rc = -ENOMEM; 12488c2ecf20Sopenharmony_ci goto mmu_fini; 12498c2ecf20Sopenharmony_ci } 12508c2ecf20Sopenharmony_ci 12518c2ecf20Sopenharmony_ci hdev->compute_ctx = NULL; 12528c2ecf20Sopenharmony_ci 12538c2ecf20Sopenharmony_ci rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); 12548c2ecf20Sopenharmony_ci if (rc) { 12558c2ecf20Sopenharmony_ci dev_err(hdev->dev, "failed to initialize kernel context\n"); 12568c2ecf20Sopenharmony_ci kfree(hdev->kernel_ctx); 12578c2ecf20Sopenharmony_ci goto mmu_fini; 12588c2ecf20Sopenharmony_ci } 12598c2ecf20Sopenharmony_ci 12608c2ecf20Sopenharmony_ci rc = hl_cb_pool_init(hdev); 12618c2ecf20Sopenharmony_ci if (rc) { 12628c2ecf20Sopenharmony_ci dev_err(hdev->dev, "failed to initialize CB pool\n"); 12638c2ecf20Sopenharmony_ci goto release_ctx; 12648c2ecf20Sopenharmony_ci } 12658c2ecf20Sopenharmony_ci 12668c2ecf20Sopenharmony_ci hl_debugfs_add_device(hdev); 12678c2ecf20Sopenharmony_ci 12688c2ecf20Sopenharmony_ci if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { 12698c2ecf20Sopenharmony_ci dev_info(hdev->dev, 12708c2ecf20Sopenharmony_ci "H/W state is dirty, must reset before initializing\n"); 12718c2ecf20Sopenharmony_ci hdev->asic_funcs->halt_engines(hdev, true); 12728c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_fini(hdev, true); 12738c2ecf20Sopenharmony_ci } 12748c2ecf20Sopenharmony_ci 12758c2ecf20Sopenharmony_ci /* 12768c2ecf20Sopenharmony_ci * From this point, in case of an error, add char devices and create 12778c2ecf20Sopenharmony_ci * sysfs nodes as part of the error flow, to allow debugging. 12788c2ecf20Sopenharmony_ci */ 12798c2ecf20Sopenharmony_ci add_cdev_sysfs_on_err = true; 12808c2ecf20Sopenharmony_ci 12818c2ecf20Sopenharmony_ci /* Device is now enabled as part of the initialization requires 12828c2ecf20Sopenharmony_ci * communication with the device firmware to get information that 12838c2ecf20Sopenharmony_ci * is required for the initialization itself 12848c2ecf20Sopenharmony_ci */ 12858c2ecf20Sopenharmony_ci hdev->disabled = false; 12868c2ecf20Sopenharmony_ci 12878c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->hw_init(hdev); 12888c2ecf20Sopenharmony_ci if (rc) { 12898c2ecf20Sopenharmony_ci dev_err(hdev->dev, "failed to initialize the H/W\n"); 12908c2ecf20Sopenharmony_ci rc = 0; 12918c2ecf20Sopenharmony_ci goto out_disabled; 12928c2ecf20Sopenharmony_ci } 12938c2ecf20Sopenharmony_ci 12948c2ecf20Sopenharmony_ci /* Check that the communication with the device is working */ 12958c2ecf20Sopenharmony_ci rc = hdev->asic_funcs->test_queues(hdev); 12968c2ecf20Sopenharmony_ci if (rc) { 12978c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to detect if device is alive\n"); 12988c2ecf20Sopenharmony_ci rc = 0; 12998c2ecf20Sopenharmony_ci goto out_disabled; 13008c2ecf20Sopenharmony_ci } 13018c2ecf20Sopenharmony_ci 13028c2ecf20Sopenharmony_ci rc = device_late_init(hdev); 13038c2ecf20Sopenharmony_ci if (rc) { 13048c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed late initialization\n"); 13058c2ecf20Sopenharmony_ci rc = 0; 13068c2ecf20Sopenharmony_ci goto out_disabled; 13078c2ecf20Sopenharmony_ci } 13088c2ecf20Sopenharmony_ci 13098c2ecf20Sopenharmony_ci dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n", 13108c2ecf20Sopenharmony_ci hdev->asic_name, 13118c2ecf20Sopenharmony_ci hdev->asic_prop.dram_size / 1024 / 1024 / 1024); 13128c2ecf20Sopenharmony_ci 13138c2ecf20Sopenharmony_ci rc = hl_vm_init(hdev); 13148c2ecf20Sopenharmony_ci if (rc) { 13158c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to initialize memory module\n"); 13168c2ecf20Sopenharmony_ci rc = 0; 13178c2ecf20Sopenharmony_ci goto out_disabled; 13188c2ecf20Sopenharmony_ci } 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci /* 13218c2ecf20Sopenharmony_ci * Expose devices and sysfs nodes to user. 13228c2ecf20Sopenharmony_ci * From here there is no need to add char devices and create sysfs nodes 13238c2ecf20Sopenharmony_ci * in case of an error. 13248c2ecf20Sopenharmony_ci */ 13258c2ecf20Sopenharmony_ci add_cdev_sysfs_on_err = false; 13268c2ecf20Sopenharmony_ci rc = device_cdev_sysfs_add(hdev); 13278c2ecf20Sopenharmony_ci if (rc) { 13288c2ecf20Sopenharmony_ci dev_err(hdev->dev, 13298c2ecf20Sopenharmony_ci "Failed to add char devices and sysfs nodes\n"); 13308c2ecf20Sopenharmony_ci rc = 0; 13318c2ecf20Sopenharmony_ci goto out_disabled; 13328c2ecf20Sopenharmony_ci } 13338c2ecf20Sopenharmony_ci 13348c2ecf20Sopenharmony_ci /* Need to call this again because the max power might change, 13358c2ecf20Sopenharmony_ci * depending on card type for certain ASICs 13368c2ecf20Sopenharmony_ci */ 13378c2ecf20Sopenharmony_ci hl_set_max_power(hdev); 13388c2ecf20Sopenharmony_ci 13398c2ecf20Sopenharmony_ci /* 13408c2ecf20Sopenharmony_ci * hl_hwmon_init() must be called after device_late_init(), because only 13418c2ecf20Sopenharmony_ci * there we get the information from the device about which 13428c2ecf20Sopenharmony_ci * hwmon-related sensors the device supports. 13438c2ecf20Sopenharmony_ci * Furthermore, it must be done after adding the device to the system. 13448c2ecf20Sopenharmony_ci */ 13458c2ecf20Sopenharmony_ci rc = hl_hwmon_init(hdev); 13468c2ecf20Sopenharmony_ci if (rc) { 13478c2ecf20Sopenharmony_ci dev_err(hdev->dev, "Failed to initialize hwmon\n"); 13488c2ecf20Sopenharmony_ci rc = 0; 13498c2ecf20Sopenharmony_ci goto out_disabled; 13508c2ecf20Sopenharmony_ci } 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci dev_notice(hdev->dev, 13538c2ecf20Sopenharmony_ci "Successfully added device to habanalabs driver\n"); 13548c2ecf20Sopenharmony_ci 13558c2ecf20Sopenharmony_ci hdev->init_done = true; 13568c2ecf20Sopenharmony_ci 13578c2ecf20Sopenharmony_ci return 0; 13588c2ecf20Sopenharmony_ci 13598c2ecf20Sopenharmony_cirelease_ctx: 13608c2ecf20Sopenharmony_ci if (hl_ctx_put(hdev->kernel_ctx) != 1) 13618c2ecf20Sopenharmony_ci dev_err(hdev->dev, 13628c2ecf20Sopenharmony_ci "kernel ctx is still alive on initialization failure\n"); 13638c2ecf20Sopenharmony_cimmu_fini: 13648c2ecf20Sopenharmony_ci hl_mmu_fini(hdev); 13658c2ecf20Sopenharmony_cieq_fini: 13668c2ecf20Sopenharmony_ci hl_eq_fini(hdev, &hdev->event_queue); 13678c2ecf20Sopenharmony_cicq_fini: 13688c2ecf20Sopenharmony_ci for (i = 0 ; i < cq_ready_cnt ; i++) 13698c2ecf20Sopenharmony_ci hl_cq_fini(hdev, &hdev->completion_queue[i]); 13708c2ecf20Sopenharmony_ci kfree(hdev->completion_queue); 13718c2ecf20Sopenharmony_cihw_queues_destroy: 13728c2ecf20Sopenharmony_ci hl_hw_queues_destroy(hdev); 13738c2ecf20Sopenharmony_cisw_fini: 13748c2ecf20Sopenharmony_ci hdev->asic_funcs->sw_fini(hdev); 13758c2ecf20Sopenharmony_ciearly_fini: 13768c2ecf20Sopenharmony_ci device_early_fini(hdev); 13778c2ecf20Sopenharmony_cifree_dev_ctrl: 13788c2ecf20Sopenharmony_ci put_device(hdev->dev_ctrl); 13798c2ecf20Sopenharmony_cifree_dev: 13808c2ecf20Sopenharmony_ci put_device(hdev->dev); 13818c2ecf20Sopenharmony_ciout_disabled: 13828c2ecf20Sopenharmony_ci hdev->disabled = true; 13838c2ecf20Sopenharmony_ci if (add_cdev_sysfs_on_err) 13848c2ecf20Sopenharmony_ci device_cdev_sysfs_add(hdev); 13858c2ecf20Sopenharmony_ci if (hdev->pdev) 13868c2ecf20Sopenharmony_ci dev_err(&hdev->pdev->dev, 13878c2ecf20Sopenharmony_ci "Failed to initialize hl%d. Device is NOT usable !\n", 13888c2ecf20Sopenharmony_ci hdev->id / 2); 13898c2ecf20Sopenharmony_ci else 13908c2ecf20Sopenharmony_ci pr_err("Failed to initialize hl%d. Device is NOT usable !\n", 13918c2ecf20Sopenharmony_ci hdev->id / 2); 13928c2ecf20Sopenharmony_ci 13938c2ecf20Sopenharmony_ci return rc; 13948c2ecf20Sopenharmony_ci} 13958c2ecf20Sopenharmony_ci 13968c2ecf20Sopenharmony_ci/* 13978c2ecf20Sopenharmony_ci * hl_device_fini - main tear-down function for habanalabs device 13988c2ecf20Sopenharmony_ci * 13998c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 14008c2ecf20Sopenharmony_ci * 14018c2ecf20Sopenharmony_ci * Destroy the device, call ASIC fini functions and release the id 14028c2ecf20Sopenharmony_ci */ 14038c2ecf20Sopenharmony_civoid hl_device_fini(struct hl_device *hdev) 14048c2ecf20Sopenharmony_ci{ 14058c2ecf20Sopenharmony_ci int i, rc; 14068c2ecf20Sopenharmony_ci ktime_t timeout; 14078c2ecf20Sopenharmony_ci 14088c2ecf20Sopenharmony_ci dev_info(hdev->dev, "Removing device\n"); 14098c2ecf20Sopenharmony_ci 14108c2ecf20Sopenharmony_ci /* 14118c2ecf20Sopenharmony_ci * This function is competing with the reset function, so try to 14128c2ecf20Sopenharmony_ci * take the reset atomic and if we are already in middle of reset, 14138c2ecf20Sopenharmony_ci * wait until reset function is finished. Reset function is designed 14148c2ecf20Sopenharmony_ci * to always finish. However, in Gaudi, because of all the network 14158c2ecf20Sopenharmony_ci * ports, the hard reset could take between 10-30 seconds 14168c2ecf20Sopenharmony_ci */ 14178c2ecf20Sopenharmony_ci 14188c2ecf20Sopenharmony_ci timeout = ktime_add_us(ktime_get(), 14198c2ecf20Sopenharmony_ci HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000); 14208c2ecf20Sopenharmony_ci rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); 14218c2ecf20Sopenharmony_ci while (rc) { 14228c2ecf20Sopenharmony_ci usleep_range(50, 200); 14238c2ecf20Sopenharmony_ci rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); 14248c2ecf20Sopenharmony_ci if (ktime_compare(ktime_get(), timeout) > 0) { 14258c2ecf20Sopenharmony_ci WARN(1, "Failed to remove device because reset function did not finish\n"); 14268c2ecf20Sopenharmony_ci return; 14278c2ecf20Sopenharmony_ci } 14288c2ecf20Sopenharmony_ci } 14298c2ecf20Sopenharmony_ci 14308c2ecf20Sopenharmony_ci /* Disable PCI access from device F/W so it won't send us additional 14318c2ecf20Sopenharmony_ci * interrupts. We disable MSI/MSI-X at the halt_engines function and we 14328c2ecf20Sopenharmony_ci * can't have the F/W sending us interrupts after that. We need to 14338c2ecf20Sopenharmony_ci * disable the access here because if the device is marked disable, the 14348c2ecf20Sopenharmony_ci * message won't be send. Also, in case of heartbeat, the device CPU is 14358c2ecf20Sopenharmony_ci * marked as disable so this message won't be sent 14368c2ecf20Sopenharmony_ci */ 14378c2ecf20Sopenharmony_ci hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS); 14388c2ecf20Sopenharmony_ci 14398c2ecf20Sopenharmony_ci /* Mark device as disabled */ 14408c2ecf20Sopenharmony_ci hdev->disabled = true; 14418c2ecf20Sopenharmony_ci 14428c2ecf20Sopenharmony_ci /* Flush anyone that is inside the critical section of enqueue 14438c2ecf20Sopenharmony_ci * jobs to the H/W 14448c2ecf20Sopenharmony_ci */ 14458c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_queues_lock(hdev); 14468c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_queues_unlock(hdev); 14478c2ecf20Sopenharmony_ci 14488c2ecf20Sopenharmony_ci /* Flush anyone that is inside device open */ 14498c2ecf20Sopenharmony_ci mutex_lock(&hdev->fpriv_list_lock); 14508c2ecf20Sopenharmony_ci mutex_unlock(&hdev->fpriv_list_lock); 14518c2ecf20Sopenharmony_ci 14528c2ecf20Sopenharmony_ci hdev->hard_reset_pending = true; 14538c2ecf20Sopenharmony_ci 14548c2ecf20Sopenharmony_ci hl_hwmon_fini(hdev); 14558c2ecf20Sopenharmony_ci 14568c2ecf20Sopenharmony_ci device_late_fini(hdev); 14578c2ecf20Sopenharmony_ci 14588c2ecf20Sopenharmony_ci hl_debugfs_remove_device(hdev); 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ci /* 14618c2ecf20Sopenharmony_ci * Halt the engines and disable interrupts so we won't get any more 14628c2ecf20Sopenharmony_ci * completions from H/W and we won't have any accesses from the 14638c2ecf20Sopenharmony_ci * H/W to the host machine 14648c2ecf20Sopenharmony_ci */ 14658c2ecf20Sopenharmony_ci hdev->asic_funcs->halt_engines(hdev, true); 14668c2ecf20Sopenharmony_ci 14678c2ecf20Sopenharmony_ci /* Go over all the queues, release all CS and their jobs */ 14688c2ecf20Sopenharmony_ci hl_cs_rollback_all(hdev); 14698c2ecf20Sopenharmony_ci 14708c2ecf20Sopenharmony_ci /* Kill processes here after CS rollback. This is because the process 14718c2ecf20Sopenharmony_ci * can't really exit until all its CSs are done, which is what we 14728c2ecf20Sopenharmony_ci * do in cs rollback 14738c2ecf20Sopenharmony_ci */ 14748c2ecf20Sopenharmony_ci rc = device_kill_open_processes(hdev); 14758c2ecf20Sopenharmony_ci if (rc) 14768c2ecf20Sopenharmony_ci dev_crit(hdev->dev, "Failed to kill all open processes\n"); 14778c2ecf20Sopenharmony_ci 14788c2ecf20Sopenharmony_ci hl_cb_pool_fini(hdev); 14798c2ecf20Sopenharmony_ci 14808c2ecf20Sopenharmony_ci /* Reset the H/W. It will be in idle state after this returns */ 14818c2ecf20Sopenharmony_ci hdev->asic_funcs->hw_fini(hdev, true); 14828c2ecf20Sopenharmony_ci 14838c2ecf20Sopenharmony_ci /* Release kernel context */ 14848c2ecf20Sopenharmony_ci if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) 14858c2ecf20Sopenharmony_ci dev_err(hdev->dev, "kernel ctx is still alive\n"); 14868c2ecf20Sopenharmony_ci 14878c2ecf20Sopenharmony_ci hl_vm_fini(hdev); 14888c2ecf20Sopenharmony_ci 14898c2ecf20Sopenharmony_ci hl_mmu_fini(hdev); 14908c2ecf20Sopenharmony_ci 14918c2ecf20Sopenharmony_ci hl_eq_fini(hdev, &hdev->event_queue); 14928c2ecf20Sopenharmony_ci 14938c2ecf20Sopenharmony_ci for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) 14948c2ecf20Sopenharmony_ci hl_cq_fini(hdev, &hdev->completion_queue[i]); 14958c2ecf20Sopenharmony_ci kfree(hdev->completion_queue); 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_ci hl_hw_queues_destroy(hdev); 14988c2ecf20Sopenharmony_ci 14998c2ecf20Sopenharmony_ci /* Call ASIC S/W finalize function */ 15008c2ecf20Sopenharmony_ci hdev->asic_funcs->sw_fini(hdev); 15018c2ecf20Sopenharmony_ci 15028c2ecf20Sopenharmony_ci device_early_fini(hdev); 15038c2ecf20Sopenharmony_ci 15048c2ecf20Sopenharmony_ci /* Hide devices and sysfs nodes from user */ 15058c2ecf20Sopenharmony_ci device_cdev_sysfs_del(hdev); 15068c2ecf20Sopenharmony_ci 15078c2ecf20Sopenharmony_ci pr_info("removed device successfully\n"); 15088c2ecf20Sopenharmony_ci} 15098c2ecf20Sopenharmony_ci 15108c2ecf20Sopenharmony_ci/* 15118c2ecf20Sopenharmony_ci * MMIO register access helper functions. 15128c2ecf20Sopenharmony_ci */ 15138c2ecf20Sopenharmony_ci 15148c2ecf20Sopenharmony_ci/* 15158c2ecf20Sopenharmony_ci * hl_rreg - Read an MMIO register 15168c2ecf20Sopenharmony_ci * 15178c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 15188c2ecf20Sopenharmony_ci * @reg: MMIO register offset (in bytes) 15198c2ecf20Sopenharmony_ci * 15208c2ecf20Sopenharmony_ci * Returns the value of the MMIO register we are asked to read 15218c2ecf20Sopenharmony_ci * 15228c2ecf20Sopenharmony_ci */ 15238c2ecf20Sopenharmony_ciinline u32 hl_rreg(struct hl_device *hdev, u32 reg) 15248c2ecf20Sopenharmony_ci{ 15258c2ecf20Sopenharmony_ci return readl(hdev->rmmio + reg); 15268c2ecf20Sopenharmony_ci} 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_ci/* 15298c2ecf20Sopenharmony_ci * hl_wreg - Write to an MMIO register 15308c2ecf20Sopenharmony_ci * 15318c2ecf20Sopenharmony_ci * @hdev: pointer to habanalabs device structure 15328c2ecf20Sopenharmony_ci * @reg: MMIO register offset (in bytes) 15338c2ecf20Sopenharmony_ci * @val: 32-bit value 15348c2ecf20Sopenharmony_ci * 15358c2ecf20Sopenharmony_ci * Writes the 32-bit value into the MMIO register 15368c2ecf20Sopenharmony_ci * 15378c2ecf20Sopenharmony_ci */ 15388c2ecf20Sopenharmony_ciinline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) 15398c2ecf20Sopenharmony_ci{ 15408c2ecf20Sopenharmony_ci writel(val, hdev->rmmio + reg); 15418c2ecf20Sopenharmony_ci} 1542