1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Copyright 2016-2019 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 *
7 */
8
9#define pr_fmt(fmt)		"habanalabs: " fmt
10
11#include "habanalabs.h"
12
13#include <linux/pci.h>
14#include <linux/aer.h>
15#include <linux/module.h>
16
17#define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
18
19#define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
20
21MODULE_AUTHOR(HL_DRIVER_AUTHOR);
22MODULE_DESCRIPTION(HL_DRIVER_DESC);
23MODULE_LICENSE("GPL v2");
24
25static int hl_major;
26static struct class *hl_class;
27static DEFINE_IDR(hl_devs_idr);
28static DEFINE_MUTEX(hl_devs_idr_lock);
29
30static int timeout_locked = 5;
31static int reset_on_lockup = 1;
32
33module_param(timeout_locked, int, 0444);
34MODULE_PARM_DESC(timeout_locked,
35	"Device lockup timeout in seconds (0 = disabled, default 5s)");
36
37module_param(reset_on_lockup, int, 0444);
38MODULE_PARM_DESC(reset_on_lockup,
39	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
40
41#define PCI_VENDOR_ID_HABANALABS	0x1da3
42
43#define PCI_IDS_GOYA			0x0001
44#define PCI_IDS_GAUDI			0x1000
45
46static const struct pci_device_id ids[] = {
47	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
48	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
49	{ 0, }
50};
51MODULE_DEVICE_TABLE(pci, ids);
52
53/*
54 * get_asic_type - translate device id to asic type
55 *
56 * @device: id of the PCI device
57 *
58 * Translate device id to asic type.
59 * In case of unidentified device, return -1
60 */
61static enum hl_asic_type get_asic_type(u16 device)
62{
63	enum hl_asic_type asic_type;
64
65	switch (device) {
66	case PCI_IDS_GOYA:
67		asic_type = ASIC_GOYA;
68		break;
69	case PCI_IDS_GAUDI:
70		asic_type = ASIC_GAUDI;
71		break;
72	default:
73		asic_type = ASIC_INVALID;
74		break;
75	}
76
77	return asic_type;
78}
79
80/*
81 * hl_device_open - open function for habanalabs device
82 *
83 * @inode: pointer to inode structure
84 * @filp: pointer to file structure
85 *
86 * Called when process opens an habanalabs device.
87 */
88int hl_device_open(struct inode *inode, struct file *filp)
89{
90	struct hl_device *hdev;
91	struct hl_fpriv *hpriv;
92	int rc;
93
94	mutex_lock(&hl_devs_idr_lock);
95	hdev = idr_find(&hl_devs_idr, iminor(inode));
96	mutex_unlock(&hl_devs_idr_lock);
97
98	if (!hdev) {
99		pr_err("Couldn't find device %d:%d\n",
100			imajor(inode), iminor(inode));
101		return -ENXIO;
102	}
103
104	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
105	if (!hpriv)
106		return -ENOMEM;
107
108	hpriv->hdev = hdev;
109	filp->private_data = hpriv;
110	hpriv->filp = filp;
111	mutex_init(&hpriv->restore_phase_mutex);
112	kref_init(&hpriv->refcount);
113	nonseekable_open(inode, filp);
114
115	hl_cb_mgr_init(&hpriv->cb_mgr);
116	hl_ctx_mgr_init(&hpriv->ctx_mgr);
117
118	hpriv->taskpid = find_get_pid(current->pid);
119
120	mutex_lock(&hdev->fpriv_list_lock);
121
122	if (hl_device_disabled_or_in_reset(hdev)) {
123		dev_err_ratelimited(hdev->dev,
124			"Can't open %s because it is disabled or in reset\n",
125			dev_name(hdev->dev));
126		rc = -EPERM;
127		goto out_err;
128	}
129
130	if (hdev->in_debug) {
131		dev_err_ratelimited(hdev->dev,
132			"Can't open %s because it is being debugged by another user\n",
133			dev_name(hdev->dev));
134		rc = -EPERM;
135		goto out_err;
136	}
137
138	if (hdev->compute_ctx) {
139		dev_dbg_ratelimited(hdev->dev,
140			"Can't open %s because another user is working on it\n",
141			dev_name(hdev->dev));
142		rc = -EBUSY;
143		goto out_err;
144	}
145
146	rc = hl_ctx_create(hdev, hpriv);
147	if (rc) {
148		dev_err(hdev->dev, "Failed to create context %d\n", rc);
149		goto out_err;
150	}
151
152	/* Device is IDLE at this point so it is legal to change PLLs.
153	 * There is no need to check anything because if the PLL is
154	 * already HIGH, the set function will return without doing
155	 * anything
156	 */
157	hl_device_set_frequency(hdev, PLL_HIGH);
158
159	list_add(&hpriv->dev_node, &hdev->fpriv_list);
160	mutex_unlock(&hdev->fpriv_list_lock);
161
162	hl_debugfs_add_file(hpriv);
163
164	return 0;
165
166out_err:
167	mutex_unlock(&hdev->fpriv_list_lock);
168
169	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
170	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
171	filp->private_data = NULL;
172	mutex_destroy(&hpriv->restore_phase_mutex);
173	put_pid(hpriv->taskpid);
174
175	kfree(hpriv);
176
177	return rc;
178}
179
180int hl_device_open_ctrl(struct inode *inode, struct file *filp)
181{
182	struct hl_device *hdev;
183	struct hl_fpriv *hpriv;
184	int rc;
185
186	mutex_lock(&hl_devs_idr_lock);
187	hdev = idr_find(&hl_devs_idr, iminor(inode));
188	mutex_unlock(&hl_devs_idr_lock);
189
190	if (!hdev) {
191		pr_err("Couldn't find device %d:%d\n",
192			imajor(inode), iminor(inode));
193		return -ENXIO;
194	}
195
196	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
197	if (!hpriv)
198		return -ENOMEM;
199
200	mutex_lock(&hdev->fpriv_list_lock);
201
202	if (hl_device_disabled_or_in_reset(hdev)) {
203		dev_err_ratelimited(hdev->dev_ctrl,
204			"Can't open %s because it is disabled or in reset\n",
205			dev_name(hdev->dev_ctrl));
206		rc = -EPERM;
207		goto out_err;
208	}
209
210	list_add(&hpriv->dev_node, &hdev->fpriv_list);
211	mutex_unlock(&hdev->fpriv_list_lock);
212
213	hpriv->hdev = hdev;
214	filp->private_data = hpriv;
215	hpriv->filp = filp;
216	hpriv->is_control = true;
217	nonseekable_open(inode, filp);
218
219	hpriv->taskpid = find_get_pid(current->pid);
220
221	return 0;
222
223out_err:
224	mutex_unlock(&hdev->fpriv_list_lock);
225	kfree(hpriv);
226	return rc;
227}
228
229static void set_driver_behavior_per_device(struct hl_device *hdev)
230{
231	hdev->mmu_enable = 1;
232	hdev->cpu_enable = 1;
233	hdev->fw_loading = 1;
234	hdev->cpu_queues_enable = 1;
235	hdev->heartbeat = 1;
236	hdev->clock_gating_mask = ULONG_MAX;
237
238	hdev->reset_pcilink = 0;
239	hdev->axi_drain = 0;
240	hdev->sram_scrambler_enable = 1;
241	hdev->dram_scrambler_enable = 1;
242	hdev->bmc_enable = 1;
243	hdev->hard_reset_on_fw_events = 1;
244}
245
246/*
247 * create_hdev - create habanalabs device instance
248 *
249 * @dev: will hold the pointer to the new habanalabs device structure
250 * @pdev: pointer to the pci device
251 * @asic_type: in case of simulator device, which device is it
252 * @minor: in case of simulator device, the minor of the device
253 *
254 * Allocate memory for habanalabs device and initialize basic fields
255 * Identify the ASIC type
256 * Allocate ID (minor) for the device (only for real devices)
257 */
258int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
259		enum hl_asic_type asic_type, int minor)
260{
261	struct hl_device *hdev;
262	int rc, main_id, ctrl_id = 0;
263
264	*dev = NULL;
265
266	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
267	if (!hdev)
268		return -ENOMEM;
269
270	/* First, we must find out which ASIC are we handling. This is needed
271	 * to configure the behavior of the driver (kernel parameters)
272	 */
273	if (pdev) {
274		hdev->asic_type = get_asic_type(pdev->device);
275		if (hdev->asic_type == ASIC_INVALID) {
276			dev_err(&pdev->dev, "Unsupported ASIC\n");
277			rc = -ENODEV;
278			goto free_hdev;
279		}
280	} else {
281		hdev->asic_type = asic_type;
282	}
283
284	hdev->major = hl_major;
285	hdev->reset_on_lockup = reset_on_lockup;
286	hdev->pldm = 0;
287
288	set_driver_behavior_per_device(hdev);
289
290	if (timeout_locked)
291		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
292	else
293		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
294
295	hdev->disabled = true;
296	hdev->pdev = pdev; /* can be NULL in case of simulator device */
297
298	/* Set default DMA mask to 32 bits */
299	hdev->dma_mask = 32;
300
301	mutex_lock(&hl_devs_idr_lock);
302
303	/* Always save 2 numbers, 1 for main device and 1 for control.
304	 * They must be consecutive
305	 */
306	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
307				GFP_KERNEL);
308
309	if (main_id >= 0)
310		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
311					main_id + 2, GFP_KERNEL);
312
313	mutex_unlock(&hl_devs_idr_lock);
314
315	if ((main_id < 0) || (ctrl_id < 0)) {
316		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
317			pr_err("too many devices in the system\n");
318
319		if (main_id >= 0) {
320			mutex_lock(&hl_devs_idr_lock);
321			idr_remove(&hl_devs_idr, main_id);
322			mutex_unlock(&hl_devs_idr_lock);
323		}
324
325		rc = -EBUSY;
326		goto free_hdev;
327	}
328
329	hdev->id = main_id;
330	hdev->id_control = ctrl_id;
331
332	*dev = hdev;
333
334	return 0;
335
336free_hdev:
337	kfree(hdev);
338	return rc;
339}
340
341/*
342 * destroy_hdev - destroy habanalabs device instance
343 *
344 * @dev: pointer to the habanalabs device structure
345 *
346 */
347void destroy_hdev(struct hl_device *hdev)
348{
349	/* Remove device from the device list */
350	mutex_lock(&hl_devs_idr_lock);
351	idr_remove(&hl_devs_idr, hdev->id);
352	idr_remove(&hl_devs_idr, hdev->id_control);
353	mutex_unlock(&hl_devs_idr_lock);
354
355	kfree(hdev);
356}
357
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: pointer to the device structure
 *
 * Returns the result of hl_device_suspend(), or 0 if no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
371
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: pointer to the device structure
 *
 * Returns the result of hl_device_resume(), or 0 if no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
385
386/*
387 * hl_pci_probe - probe PCI habanalabs devices
388 *
389 * @pdev: pointer to pci device
390 * @id: pointer to pci device id structure
391 *
392 * Standard PCI probe function for habanalabs device.
393 * Create a new habanalabs device and initialize it according to the
394 * device's type
395 */
396static int hl_pci_probe(struct pci_dev *pdev,
397				const struct pci_device_id *id)
398{
399	struct hl_device *hdev;
400	int rc;
401
402	dev_info(&pdev->dev, HL_NAME
403		 " device found [%04x:%04x] (rev %x)\n",
404		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
405
406	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
407	if (rc)
408		return rc;
409
410	pci_set_drvdata(pdev, hdev);
411
412	pci_enable_pcie_error_reporting(pdev);
413
414	rc = hl_device_init(hdev, hl_class);
415	if (rc) {
416		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
417		rc = -ENODEV;
418		goto disable_device;
419	}
420
421	return 0;
422
423disable_device:
424	pci_disable_pcie_error_reporting(pdev);
425	pci_set_drvdata(pdev, NULL);
426	destroy_hdev(hdev);
427
428	return rc;
429}
430
431/*
432 * hl_pci_remove - remove PCI habanalabs devices
433 *
434 * @pdev: pointer to pci device
435 *
436 * Standard PCI remove function for habanalabs device
437 */
438static void hl_pci_remove(struct pci_dev *pdev)
439{
440	struct hl_device *hdev;
441
442	hdev = pci_get_drvdata(pdev);
443	if (!hdev)
444		return;
445
446	hl_device_fini(hdev);
447	pci_disable_pcie_error_reporting(pdev);
448	pci_set_drvdata(pdev, NULL);
449	destroy_hdev(hdev);
450}
451
452/**
453 * hl_pci_err_detected - a PCI bus error detected on this device
454 *
455 * @pdev: pointer to pci device
456 * @state: PCI error type
457 *
458 * Called by the PCI subsystem whenever a non-correctable
459 * PCI bus error is detected
460 */
461static pci_ers_result_t
462hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
463{
464	struct hl_device *hdev = pci_get_drvdata(pdev);
465	enum pci_ers_result result;
466
467	switch (state) {
468	case pci_channel_io_normal:
469		return PCI_ERS_RESULT_CAN_RECOVER;
470
471	case pci_channel_io_frozen:
472		dev_warn(hdev->dev, "frozen state error detected\n");
473		result = PCI_ERS_RESULT_NEED_RESET;
474		break;
475
476	case pci_channel_io_perm_failure:
477		dev_warn(hdev->dev, "failure state error detected\n");
478		result = PCI_ERS_RESULT_DISCONNECT;
479		break;
480
481	default:
482		result = PCI_ERS_RESULT_NONE;
483	}
484
485	hdev->asic_funcs->halt_engines(hdev, true);
486
487	return result;
488}
489
490/**
491 * hl_pci_err_resume - resume after a PCI slot reset
492 *
493 * @pdev: pointer to pci device
494 *
495 */
496static void hl_pci_err_resume(struct pci_dev *pdev)
497{
498	struct hl_device *hdev = pci_get_drvdata(pdev);
499
500	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
501	hl_device_resume(hdev);
502}
503
/**
 * hl_pci_err_slot_reset - a PCI slot reset has just happened
 *
 * @pdev: pointer to pci device (not used)
 *
 * Always reports PCI_ERS_RESULT_RECOVERED; the device state is not
 * inspected here.
 */
static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{
	return PCI_ERS_RESULT_RECOVERED;
}
515
516static const struct dev_pm_ops hl_pm_ops = {
517	.suspend = hl_pmops_suspend,
518	.resume = hl_pmops_resume,
519};
520
521static const struct pci_error_handlers hl_pci_err_handler = {
522	.error_detected = hl_pci_err_detected,
523	.slot_reset = hl_pci_err_slot_reset,
524	.resume = hl_pci_err_resume,
525};
526
527static struct pci_driver hl_pci_driver = {
528	.name = HL_NAME,
529	.id_table = ids,
530	.probe = hl_pci_probe,
531	.remove = hl_pci_remove,
532	.shutdown = hl_pci_remove,
533	.driver.pm = &hl_pm_ops,
534	.err_handler = &hl_pci_err_handler,
535};
536
537/*
538 * hl_init - Initialize the habanalabs kernel driver
539 */
540static int __init hl_init(void)
541{
542	int rc;
543	dev_t dev;
544
545	pr_info("loading driver\n");
546
547	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
548	if (rc < 0) {
549		pr_err("unable to get major\n");
550		return rc;
551	}
552
553	hl_major = MAJOR(dev);
554
555	hl_class = class_create(THIS_MODULE, HL_NAME);
556	if (IS_ERR(hl_class)) {
557		pr_err("failed to allocate class\n");
558		rc = PTR_ERR(hl_class);
559		goto remove_major;
560	}
561
562	hl_debugfs_init();
563
564	rc = pci_register_driver(&hl_pci_driver);
565	if (rc) {
566		pr_err("failed to register pci device\n");
567		goto remove_debugfs;
568	}
569
570	pr_debug("driver loaded\n");
571
572	return 0;
573
574remove_debugfs:
575	hl_debugfs_fini();
576	class_destroy(hl_class);
577remove_major:
578	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
579	return rc;
580}
581
/*
 * hl_exit - Release all resources of the habanalabs kernel driver
 *
 * Unregisters the PCI driver first and then releases what hl_init() set up:
 * debugfs, the device class and the char device region. The IDR of the
 * devices is destroyed last.
 */
static void __exit hl_exit(void)
{
	pci_unregister_driver(&hl_pci_driver);

	/*
	 * Removing debugfs must be after all devices or simulator devices
	 * have been removed because otherwise we get a bug in the
	 * debugfs module for referencing NULL objects
	 */
	hl_debugfs_fini();

	/* Same class and region that were set up in hl_init() */
	class_destroy(hl_class);
	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);

	idr_destroy(&hl_devs_idr);

	pr_debug("driver removed\n");
}
603
604module_init(hl_init);
605module_exit(hl_exit);
606