/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
*/

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>	/* memset()/memcmp() */
#include <stdbool.h>	/* bool */
#include <inttypes.h>

#include "CUnit/Basic.h"

#include "amdgpu_test.h"
#include "amdgpu_drm.h"
#include "amdgpu_internal.h"

#define IB_SIZE 4096
#define MAX_RESOURCES 8

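/*
 * The copy is one byte longer than a 4 KiB page, presumably so the transfer
 * size is not page-aligned and the partial tail of the copy is exercised.
 * The fill byte is non-zero so a completed DMA is distinguishable from
 * freshly zeroed memory (see the _Static_assert below).
 */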
#define DMA_SIZE 4097
#define DMA_DATA_BYTE 0xea

static bool do_p2p;

static amdgpu_device_handle executing_device_handle;
static uint32_t executing_device_major_version;
static uint32_t executing_device_minor_version;

static amdgpu_device_handle peer_exporting_device_handle;
static uint32_t peer_exporting_device_major_version;
static uint32_t peer_exporting_device_minor_version;

static amdgpu_context_handle context_handle;
static amdgpu_bo_handle ib_handle;
static uint32_t *ib_cpu;
static uint64_t ib_mc_address;
static amdgpu_va_handle ib_va_handle;
static uint32_t num_dword;

static amdgpu_bo_handle resources[MAX_RESOURCES];
static unsigned num_resources;

static uint8_t *reference_data;

static void amdgpu_cp_dma_host_to_vram(void);
static void amdgpu_cp_dma_vram_to_host(void);
static void amdgpu_cp_dma_p2p_vram_to_vram(void);
static void amdgpu_cp_dma_p2p_host_to_vram(void);
static void amdgpu_cp_dma_p2p_vram_to_host(void);

/**
 * Tests in cp dma test suite
 */
CU_TestInfo cp_dma_tests[] = {
	{ "CP DMA write Host to VRAM",  amdgpu_cp_dma_host_to_vram },
	{ "CP DMA write VRAM to Host",  amdgpu_cp_dma_vram_to_host },

	{ "Peer to Peer CP DMA write VRAM to VRAM",  amdgpu_cp_dma_p2p_vram_to_vram },
	{ "Peer to Peer CP DMA write Host to VRAM",  amdgpu_cp_dma_p2p_host_to_vram },
	{ "Peer to Peer CP DMA write VRAM to Host",  amdgpu_cp_dma_p2p_vram_to_host },
	CU_TEST_INFO_NULL,
};

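/*
 * A buffer object together with its GPU VM mapping: the BO handle, the VA
 * range backing the mapping, the resulting GPU virtual address, and the
 * allocation size.
 */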
struct amdgpu_cp_dma_bo {
	amdgpu_bo_handle buf_handle;
	amdgpu_va_handle va_handle;
	uint64_t gpu_va;
	uint64_t size;
};

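/*
 * Allocate a BO of @size bytes in @heap on @dev and map it into the GPU VM
 * with read/write/execute permissions. On success, the handle, VA range,
 * GPU VA and size are returned through @bo.
 */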
static int allocate_bo_and_va(amdgpu_device_handle dev,
		uint64_t size, uint64_t alignment,
		uint32_t heap, uint64_t alloc_flags,
		struct amdgpu_cp_dma_bo *bo) {
	struct amdgpu_bo_alloc_request request = {};
	amdgpu_bo_handle buf_handle;
	amdgpu_va_handle va_handle;
	uint64_t vmc_addr;
	int r;

	request.alloc_size = size;
	request.phys_alignment = alignment;
	request.preferred_heap = heap;
	request.flags = alloc_flags;

	r = amdgpu_bo_alloc(dev, &request, &buf_handle);
	if (r)
		return r;

	r = amdgpu_va_range_alloc(dev, amdgpu_gpu_va_range_general,
			size, alignment, 0,
			&vmc_addr, &va_handle, 0);
	if (r)
		goto error_va_alloc;

	r = amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr,
						AMDGPU_VM_PAGE_READABLE |
							AMDGPU_VM_PAGE_WRITEABLE |
							AMDGPU_VM_PAGE_EXECUTABLE,
						AMDGPU_VA_OP_MAP);
	if (r)
		goto error_va_map;

	bo->buf_handle = buf_handle;
	bo->va_handle = va_handle;
	bo->gpu_va = vmc_addr;
	bo->size = size;

	return 0;

error_va_map:
	amdgpu_va_range_free(va_handle);

error_va_alloc:
	amdgpu_bo_free(buf_handle);

	return r;
}

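/*
 * Import a dma-buf fd exported by another device into @dev and map the
 * resulting BO into @dev's GPU VM. The BO handle, VA mapping and size are
 * returned through @bo.
 */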
static int import_dma_buf_to_bo(amdgpu_device_handle dev,
		int dmabuf_fd, struct amdgpu_cp_dma_bo *bo) {
	amdgpu_va_handle va_handle;
	uint64_t vmc_addr;
	int r;
	struct amdgpu_bo_import_result bo_import_result = {};

	r = amdgpu_bo_import(dev, amdgpu_bo_handle_type_dma_buf_fd,
			dmabuf_fd, &bo_import_result);
	if (r)
		return r;

	r = amdgpu_va_range_alloc(dev, amdgpu_gpu_va_range_general,
				bo_import_result.alloc_size, 0, 0,
				&vmc_addr, &va_handle, 0);
	if (r)
		goto error_va_alloc;

	r = amdgpu_bo_va_op(bo_import_result.buf_handle, 0,
			bo_import_result.alloc_size, vmc_addr,
			AMDGPU_VM_PAGE_READABLE |
				AMDGPU_VM_PAGE_WRITEABLE |
				AMDGPU_VM_PAGE_EXECUTABLE,
			AMDGPU_VA_OP_MAP);
	if (r)
		goto error_va_map;

	bo->buf_handle = bo_import_result.buf_handle;
	bo->va_handle = va_handle;
	bo->gpu_va = vmc_addr;
	bo->size = bo_import_result.alloc_size;

	return 0;

error_va_map:
	amdgpu_va_range_free(va_handle);

error_va_alloc:
	amdgpu_bo_free(bo_import_result.buf_handle);

	return r;
}

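/* Unmap the BO from the GPU VM, release its VA range, then free the BO. */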
static int free_bo(struct amdgpu_cp_dma_bo bo) {
	int r;
	r = amdgpu_bo_va_op(bo.buf_handle, 0,
			bo.size, bo.gpu_va, 0, AMDGPU_VA_OP_UNMAP);
	if (r)
		return r;

	r = amdgpu_va_range_free(bo.va_handle);
	if (r)
		return r;

	r = amdgpu_bo_free(bo.buf_handle);
	if (r)
		return r;

	return 0;
}

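/*
 * Build a BO list from resources[], submit the recorded IB on the GFX ring
 * (or on the compute ring for ASICs whose graphics pipe is removed), and
 * block until the submission's fence signals.
 */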
static int submit_and_sync(void) {
	struct amdgpu_cs_request ibs_request = {0};
	struct amdgpu_cs_ib_info ib_info = {0};
	struct amdgpu_cs_fence fence_status = {0};
	uint32_t expired;
	uint32_t family_id, chip_id, chip_rev;
	unsigned gc_ip_type;
	int r;

	r = amdgpu_bo_list_create(executing_device_handle,
			num_resources, resources,
			NULL, &ibs_request.resources);
	if (r)
		return r;

	family_id = executing_device_handle->info.family_id;
	chip_id = executing_device_handle->info.chip_external_rev;
	chip_rev = executing_device_handle->info.chip_rev;

	gc_ip_type = asic_is_gfx_pipe_removed(family_id, chip_id, chip_rev) ?
		AMDGPU_HW_IP_COMPUTE : AMDGPU_HW_IP_GFX;

	ib_info.ib_mc_address = ib_mc_address;
	ib_info.size = num_dword;

	ibs_request.ip_type = gc_ip_type;
	ibs_request.number_of_ibs = 1;
	ibs_request.ibs = &ib_info;
	ibs_request.fence_info.handle = NULL;

	r = amdgpu_cs_submit(context_handle, 0, &ibs_request, 1);
	if (r)
		return r;

	r = amdgpu_bo_list_destroy(ibs_request.resources);
	if (r)
		return r;

	fence_status.context = context_handle;
	fence_status.ip_type = gc_ip_type;
	fence_status.fence = ibs_request.seq_no;

	r = amdgpu_cs_query_fence_status(&fence_status,
			AMDGPU_TIMEOUT_INFINITE,
			0, &expired);
	if (r)
		return r;

	return 0;
}

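/*
 * Record a PM4 DMA_DATA packet that copies DMA_SIZE bytes from src_bo to
 * dst_bo. A sketch of the constants below, following the usual PACKET3
 * header layout (type 3 in bits [31:30], count in [29:16], opcode in [15:8]):
 *   0xc0055000  PACKET3(DMA_DATA = 0x50, count = 5)
 *   0x80000000  DMA_DATA control word with bit 31 (CP_SYNC) set; the source
 *               and destination selects are left at 0, i.e. both operands
 *               are GPU virtual addresses
 *   0xffff1000  PACKET3(NOP, 0x3fff), conventional IB padding
 */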
static void cp_dma_cmd(struct amdgpu_cp_dma_bo src_bo,
		struct amdgpu_cp_dma_bo dst_bo) {
	_Static_assert(DMA_SIZE < (1 << 26), "DMA size exceeds CP DMA maximum!");

	ib_cpu[0] = 0xc0055000;
	ib_cpu[1] = 0x80000000;
	ib_cpu[2] = src_bo.gpu_va & 0x00000000ffffffff;
	ib_cpu[3] = (src_bo.gpu_va & 0xffffffff00000000) >> 32;
	ib_cpu[4] = dst_bo.gpu_va & 0x00000000ffffffff;
	ib_cpu[5] = (dst_bo.gpu_va & 0xffffffff00000000) >> 32;
	/* the size is read from the lower 26 bits */
	ib_cpu[6] = ((1 << 26) - 1) & DMA_SIZE;
	ib_cpu[7] = 0xffff1000;

	num_dword = 8;

	resources[0] = src_bo.buf_handle;
	resources[1] = dst_bo.buf_handle;
	resources[2] = ib_handle;
	num_resources = 3;
}

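/*
 * Single-device test body: fill a source BO in @src_heap with
 * DMA_DATA_BYTE, zero a destination BO in @dst_heap, copy via CP DMA, and
 * verify the destination against reference_data.
 */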
static void amdgpu_cp_dma(uint32_t src_heap, uint32_t dst_heap) {
	int r;
	struct amdgpu_cp_dma_bo src_bo = {0};
	struct amdgpu_cp_dma_bo dst_bo = {0};
	void *src_bo_cpu;
	void *dst_bo_cpu;

	/* allocate the src bo, set its data to DMA_DATA_BYTE */
	r = allocate_bo_and_va(executing_device_handle, DMA_SIZE, 4096,
			src_heap, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &src_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = amdgpu_bo_cpu_map(src_bo.buf_handle, &src_bo_cpu);
	CU_ASSERT_EQUAL(r, 0);
	memset(src_bo_cpu, DMA_DATA_BYTE, DMA_SIZE);

	r = amdgpu_bo_cpu_unmap(src_bo.buf_handle);
	CU_ASSERT_EQUAL(r, 0);

	/* allocate the dst bo and clear its content to all 0 */
	r = allocate_bo_and_va(executing_device_handle, DMA_SIZE, 4096,
			dst_heap, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &dst_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = amdgpu_bo_cpu_map(dst_bo.buf_handle, &dst_bo_cpu);
	CU_ASSERT_EQUAL(r, 0);

	_Static_assert(DMA_DATA_BYTE != 0, "Initialization data should be different from DMA data!");
	memset(dst_bo_cpu, 0, DMA_SIZE);

	/* record CP DMA command and dispatch the command */
	cp_dma_cmd(src_bo, dst_bo);

	r = submit_and_sync();
	CU_ASSERT_EQUAL(r, 0);

	/* verify the dst bo is filled with DMA_DATA_BYTE */
	CU_ASSERT_EQUAL(memcmp(dst_bo_cpu, reference_data, DMA_SIZE), 0);

	r = amdgpu_bo_cpu_unmap(dst_bo.buf_handle);
	CU_ASSERT_EQUAL(r, 0);

	r = free_bo(src_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = free_bo(dst_bo);
	CU_ASSERT_EQUAL(r, 0);
}

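/*
 * Peer-to-peer test body: allocate the DMA destination in @dst_heap on the
 * peer device and export it as a dma-buf, import it on the executing
 * device, fill a local source BO in @src_heap with DMA_DATA_BYTE, CP-DMA
 * into the imported BO, then verify the result through the peer device's
 * CPU mapping of the exported BO.
 */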
static void amdgpu_cp_dma_p2p(uint32_t src_heap, uint32_t dst_heap) {
	int r;
	struct amdgpu_cp_dma_bo exported_bo = {0};
	int dma_buf_fd;
	int dma_buf_fd_dup;
	struct amdgpu_cp_dma_bo src_bo = {0};
	struct amdgpu_cp_dma_bo imported_dst_bo = {0};
	void *exported_bo_cpu;
	void *src_bo_cpu;

	/* allocate a bo on the peer device and export it to a dma-buf */
	r = allocate_bo_and_va(peer_exporting_device_handle, DMA_SIZE, 4096,
			dst_heap, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &exported_bo);
	CU_ASSERT_EQUAL(r, 0);

	/* map the exported bo and clear its content to 0 */
	_Static_assert(DMA_DATA_BYTE != 0, "Initialization data should be different from DMA data!");
	r = amdgpu_bo_cpu_map(exported_bo.buf_handle, &exported_bo_cpu);
	CU_ASSERT_EQUAL(r, 0);
	memset(exported_bo_cpu, 0, DMA_SIZE);

	r = amdgpu_bo_export(exported_bo.buf_handle,
			amdgpu_bo_handle_type_dma_buf_fd, (uint32_t*)&dma_buf_fd);
	CU_ASSERT_EQUAL(r, 0);

	/*
	 * According to amdgpu_drm: "Buffer must be "imported" only using new
	 * "fd" (different from one used by "exporter")"
	 */
	dma_buf_fd_dup = dup(dma_buf_fd);
	CU_ASSERT(dma_buf_fd_dup >= 0);
	r = close(dma_buf_fd);
	CU_ASSERT_EQUAL(r, 0);

	/* import the dma-buf to the executing device; the imported bo is the DMA destination */
	r = import_dma_buf_to_bo(
			executing_device_handle, dma_buf_fd_dup, &imported_dst_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = close(dma_buf_fd_dup);
	CU_ASSERT_EQUAL(r, 0);

	/* allocate the src bo and set its content to DMA_DATA_BYTE */
	r = allocate_bo_and_va(executing_device_handle, DMA_SIZE, 4096,
			src_heap, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &src_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = amdgpu_bo_cpu_map(src_bo.buf_handle, &src_bo_cpu);
	CU_ASSERT_EQUAL(r, 0);

	memset(src_bo_cpu, DMA_DATA_BYTE, DMA_SIZE);

	r = amdgpu_bo_cpu_unmap(src_bo.buf_handle);
	CU_ASSERT_EQUAL(r, 0);

	/* record CP DMA command and dispatch the command */
	cp_dma_cmd(src_bo, imported_dst_bo);

	r = submit_and_sync();
	CU_ASSERT_EQUAL(r, 0);

	/* verify the bo from the peer device is filled with DMA_DATA_BYTE */
	CU_ASSERT_EQUAL(memcmp(exported_bo_cpu, reference_data, DMA_SIZE), 0);

	r = amdgpu_bo_cpu_unmap(exported_bo.buf_handle);
	CU_ASSERT_EQUAL(r, 0);

	r = free_bo(exported_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = free_bo(imported_dst_bo);
	CU_ASSERT_EQUAL(r, 0);

	r = free_bo(src_bo);
	CU_ASSERT_EQUAL(r, 0);
}

static void amdgpu_cp_dma_host_to_vram(void) {
	amdgpu_cp_dma(AMDGPU_GEM_DOMAIN_GTT, AMDGPU_GEM_DOMAIN_VRAM);
}

static void amdgpu_cp_dma_vram_to_host(void) {
	amdgpu_cp_dma(AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_DOMAIN_GTT);
}

static void amdgpu_cp_dma_p2p_vram_to_vram(void) {
	amdgpu_cp_dma_p2p(AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_DOMAIN_VRAM);
}

static void amdgpu_cp_dma_p2p_host_to_vram(void) {
	amdgpu_cp_dma_p2p(AMDGPU_GEM_DOMAIN_GTT, AMDGPU_GEM_DOMAIN_VRAM);
}

static void amdgpu_cp_dma_p2p_vram_to_host(void) {
	amdgpu_cp_dma_p2p(AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_DOMAIN_GTT);
}

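/*
 * Suite setup: open the executing device, create a submission context and
 * the shared IB, open the peer device when peer-to-peer is enabled, and
 * build the reference pattern the tests compare against.
 */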
int suite_cp_dma_tests_init(void) {
	int r;

	r = amdgpu_device_initialize(drm_amdgpu[0],
			&executing_device_major_version,
			&executing_device_minor_version,
			&executing_device_handle);
	if (r)
		return CUE_SINIT_FAILED;

	r = amdgpu_cs_ctx_create(executing_device_handle, &context_handle);
	if (r)
		return CUE_SINIT_FAILED;

	r = amdgpu_bo_alloc_and_map(executing_device_handle, IB_SIZE, 4096,
					AMDGPU_GEM_DOMAIN_GTT, 0,
					&ib_handle, (void**)&ib_cpu,
					&ib_mc_address, &ib_va_handle);
	if (r)
		return CUE_SINIT_FAILED;

	if (do_p2p) {
		r = amdgpu_device_initialize(drm_amdgpu[1],
				&peer_exporting_device_major_version,
				&peer_exporting_device_minor_version,
				&peer_exporting_device_handle);

		if (r)
			return CUE_SINIT_FAILED;
	}

	reference_data = (uint8_t *)malloc(DMA_SIZE);
	if (!reference_data)
		return CUE_SINIT_FAILED;
	memset(reference_data, DMA_DATA_BYTE, DMA_SIZE);

	return CUE_SUCCESS;
}

int suite_cp_dma_tests_clean(void) {
	int r;

	free(reference_data);

	r = amdgpu_bo_unmap_and_free(ib_handle, ib_va_handle,
				 ib_mc_address, IB_SIZE);
	if (r)
		return CUE_SCLEAN_FAILED;

	r = amdgpu_cs_ctx_free(context_handle);
	if (r)
		return CUE_SCLEAN_FAILED;

	r = amdgpu_device_deinitialize(executing_device_handle);
	if (r)
		return CUE_SCLEAN_FAILED;

	if (do_p2p) {
		r = amdgpu_device_deinitialize(peer_exporting_device_handle);
		if (r)
			return CUE_SCLEAN_FAILED;
	}

	return CUE_SUCCESS;
}

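/*
 * Decide whether the suite runs at all: the executing device's family must
 * fall between AMDGPU_FAMILY_AI and AMDGPU_FAMILY_NV. The peer-to-peer
 * tests stay active only when a second, equally supported device was
 * opened.
 */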
CU_BOOL suite_cp_dma_tests_enable(void) {
	int r = 0;

	if (amdgpu_device_initialize(drm_amdgpu[0],
			&executing_device_major_version,
			&executing_device_minor_version,
			&executing_device_handle))
		return CU_FALSE;

	if (!(executing_device_handle->info.family_id >= AMDGPU_FAMILY_AI &&
			executing_device_handle->info.family_id <= AMDGPU_FAMILY_NV)) {
		printf("Testing device has an ASIC that is not supported by the CP-DMA test suite!\n");
		amdgpu_device_deinitialize(executing_device_handle);
		return CU_FALSE;
	}

	if (amdgpu_device_deinitialize(executing_device_handle))
		return CU_FALSE;

	if (drm_amdgpu[1] >= 0) {
		r = amdgpu_device_initialize(drm_amdgpu[1],
				&peer_exporting_device_major_version,
				&peer_exporting_device_minor_version,
				&peer_exporting_device_handle);

		if (r == 0 && (peer_exporting_device_handle->info.family_id >= AMDGPU_FAMILY_AI &&
						peer_exporting_device_handle->info.family_id <= AMDGPU_FAMILY_NV)) {
			do_p2p = true;
		}

		if (r == 0 && amdgpu_device_deinitialize(peer_exporting_device_handle) != 0) {
			printf("Deinitializing peer_exporting_device_handle failed!\n");
			return CU_FALSE;
		}
	}

	if (!do_p2p) {
		amdgpu_set_test_active("CP DMA Tests", "Peer to Peer CP DMA write VRAM to VRAM", CU_FALSE);
		amdgpu_set_test_active("CP DMA Tests", "Peer to Peer CP DMA write Host to VRAM", CU_FALSE);
		amdgpu_set_test_active("CP DMA Tests", "Peer to Peer CP DMA write VRAM to Host", CU_FALSE);
		printf("Peer device is not opened or has an ASIC not supported by the suite; skipping all Peer to Peer tests.\n");
	}

	return CU_TRUE;
}