1/*
2 * Copyright 2021 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22*/
23
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>

#include "drm.h"
#include "xf86drmMode.h"
#include "xf86drm.h"
#include "amdgpu.h"
#include "amdgpu_drm.h"
#include "amdgpu_internal.h"
40
/* Number of slots handed to drmGetDevices2() when enumerating devices. */
#define MAX_CARDS_SUPPORTED	4
/* Capacity of the global buffer-object tables. */
#define NUM_BUFFER_OBJECTS	1024

/*
 * Header dword of an SDMA packet: opcode in bits 7:0, sub-opcode in
 * bits 15:8 and the low 16 bits of "e" in bits 31:16.
 */
#define SDMA_PACKET(op, sub_op, e)			\
	((((e) & 0xFFFF) << 16) |			\
	 (((sub_op) & 0xFF) << 8) |			\
	 (((op) & 0xFF) << 0))

#define SDMA_OPCODE_COPY			1
#define SDMA_COPY_SUB_OPCODE_LINEAR		0

/*
 * Header dword of the packet layout used for AMDGPU_FAMILY_SI: opcode in
 * bits 31:28, single-bit fields "b", "t" and "s" at bits 26, 23 and 22,
 * and a 20-bit count in bits 19:0.
 */
#define SDMA_PACKET_SI(op, b, t, s, cnt)		\
	((((op) & 0xF) << 28) |				\
	 (((b) & 0x1) << 26) |				\
	 (((t) & 0x1) << 23) |				\
	 (((s) & 0x1) << 22) |				\
	 (((cnt) & 0xFFFFF) << 0))

#define SDMA_OPCODE_COPY_SI			3
58
59
/**
 * Help string for command line parameters.
 *
 * Used as a printf-style format; the single %s receives the program name.
 * The operand names in the option descriptions match the synopsis line.
 */
static const char usage[] =
	"Usage: %s [-?h] [-b v|g|vg size] "
	"[-c from to size count]\n"
	"where:\n"
	"\tb - Allocate a BO in VRAM, GTT or VRAM|GTT of size bytes.\n"
	"\t    This flag can be used multiple times. The first bo will\n"
	"\t    have id `1`, the second id `2`, ...\n"
	"\tc - Copy size bytes from BO (from) to BO (to), count times\n"
	"\th - Display this help\n"
	"\n"
	"Sizes can be postfixed with k, m or g for kilo, mega and gigabyte scaling\n";

/** Option string for getopt: -? and -h take no argument, -b and -c take one */
static const char options[]   = "?hb:c:";
75
76/* Open AMD devices.
77 * Returns the fd of the first device it could open.
78 */
79static int amdgpu_open_device(void)
80{
81	drmDevicePtr devices[MAX_CARDS_SUPPORTED];
82	unsigned int i;
83	int drm_count;
84
85	drm_count = drmGetDevices2(0, devices, MAX_CARDS_SUPPORTED);
86	if (drm_count < 0) {
87		fprintf(stderr, "drmGetDevices2() returned an error %d\n",
88			drm_count);
89		return drm_count;
90	}
91
92	for (i = 0; i < drm_count; i++) {
93		drmVersionPtr version;
94		int fd;
95
96		/* If this is not PCI device, skip*/
97		if (devices[i]->bustype != DRM_BUS_PCI)
98			continue;
99
100		/* If this is not AMD GPU vender ID, skip*/
101		if (devices[i]->deviceinfo.pci->vendor_id != 0x1002)
102			continue;
103
104		if (!(devices[i]->available_nodes & 1 << DRM_NODE_RENDER))
105			continue;
106
107		fd = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR | O_CLOEXEC);
108
109		/* This node is not available. */
110		if (fd < 0) continue;
111
112		version = drmGetVersion(fd);
113		if (!version) {
114			fprintf(stderr,
115				"Warning: Cannot get version for %s."
116				"Error is %s\n",
117				devices[i]->nodes[DRM_NODE_RENDER],
118				strerror(errno));
119			close(fd);
120			continue;
121		}
122
123		if (strcmp(version->name, "amdgpu")) {
124			/* This is not AMDGPU driver, skip.*/
125			drmFreeVersion(version);
126			close(fd);
127			continue;
128		}
129
130		drmFreeVersion(version);
131		drmFreeDevices(devices, drm_count);
132		return fd;
133	}
134
135	return -1;
136}
137
138amdgpu_device_handle device_handle;
139amdgpu_context_handle context_handle;
140
141amdgpu_bo_handle resources[NUM_BUFFER_OBJECTS];
142uint64_t virtual[NUM_BUFFER_OBJECTS];
143unsigned int num_buffers;
144uint32_t *pm4;
145
146int alloc_bo(uint32_t domain, uint64_t size)
147{
148	struct amdgpu_bo_alloc_request request = {};
149	amdgpu_bo_handle bo;
150	amdgpu_va_handle va;
151	uint64_t addr;
152	int r;
153
154	if (num_buffers >= NUM_BUFFER_OBJECTS)
155		return -ENOSPC;
156
157	request.alloc_size = size;
158	request.phys_alignment = 0;
159	request.preferred_heap = domain;
160	request.flags = 0;
161	r = amdgpu_bo_alloc(device_handle, &request, &bo);
162	if (r)
163		return r;
164
165	r = amdgpu_va_range_alloc(device_handle, amdgpu_gpu_va_range_general,
166				  size, 0, 0, &addr, &va, 0);
167	if (r)
168		return r;
169
170	r = amdgpu_bo_va_op_raw(device_handle, bo, 0, size, addr,
171				AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
172				AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_MAP);
173	if (r)
174		return r;
175
176	resources[num_buffers] = bo;
177	virtual[num_buffers] = addr;
178	fprintf(stdout, "Allocated BO number %u at 0x%lx, domain 0x%x, size %lu\n",
179		num_buffers++, addr, domain, size);
180	return 0;
181}
182
183int submit_ib(uint32_t from, uint32_t to, uint64_t size, uint32_t count)
184{
185	struct amdgpu_cs_request ibs_request;
186	struct amdgpu_cs_fence fence_status;
187	struct amdgpu_cs_ib_info ib_info;
188	uint64_t copied = size, delta;
189	struct timespec start, stop;
190
191	uint64_t src = virtual[from];
192	uint64_t dst = virtual[to];
193	uint32_t expired;
194	int i, r;
195
196	i = 0;
197	while (size) {
198		uint64_t bytes = size < 0x40000 ? size : 0x40000;
199
200		if (device_handle->info.family_id == AMDGPU_FAMILY_SI) {
201			pm4[i++] = SDMA_PACKET_SI(SDMA_OPCODE_COPY_SI, 0, 0, 0,
202						  bytes);
203			pm4[i++] = 0xffffffff & dst;
204			pm4[i++] = 0xffffffff & src;
205			pm4[i++] = (0xffffffff00000000 & dst) >> 32;
206			pm4[i++] = (0xffffffff00000000 & src) >> 32;
207		} else {
208			pm4[i++] = SDMA_PACKET(SDMA_OPCODE_COPY,
209					       SDMA_COPY_SUB_OPCODE_LINEAR,
210					       0);
211			if ( device_handle->info.family_id >= AMDGPU_FAMILY_AI)
212				pm4[i++] = bytes - 1;
213			else
214				pm4[i++] = bytes;
215			pm4[i++] = 0;
216			pm4[i++] = 0xffffffff & src;
217			pm4[i++] = (0xffffffff00000000 & src) >> 32;
218			pm4[i++] = 0xffffffff & dst;
219			pm4[i++] = (0xffffffff00000000 & dst) >> 32;
220		}
221
222		size -= bytes;
223		src += bytes;
224		dst += bytes;
225	}
226
227	memset(&ib_info, 0, sizeof(ib_info));
228	ib_info.ib_mc_address = virtual[0];
229	ib_info.size = i;
230
231	memset(&ibs_request, 0, sizeof(ibs_request));
232	ibs_request.ip_type = AMDGPU_HW_IP_DMA;
233	ibs_request.ring = 0;
234	ibs_request.number_of_ibs = 1;
235	ibs_request.ibs = &ib_info;
236	ibs_request.fence_info.handle = NULL;
237
238	r = clock_gettime(CLOCK_MONOTONIC, &start);
239	if (r)
240		return errno;
241
242	r = amdgpu_bo_list_create(device_handle, num_buffers, resources, NULL,
243				  &ibs_request.resources);
244	if (r)
245		return r;
246
247	for (i = 0; i < count; ++i) {
248		r = amdgpu_cs_submit(context_handle, 0, &ibs_request, 1);
249		if (r)
250			return r;
251	}
252
253	r = amdgpu_bo_list_destroy(ibs_request.resources);
254	if (r)
255		return r;
256
257	memset(&fence_status, 0, sizeof(fence_status));
258	fence_status.ip_type = ibs_request.ip_type;
259	fence_status.ip_instance = 0;
260	fence_status.ring = ibs_request.ring;
261	fence_status.context = context_handle;
262	fence_status.fence = ibs_request.seq_no;
263	r = amdgpu_cs_query_fence_status(&fence_status,
264					 AMDGPU_TIMEOUT_INFINITE,
265					 0, &expired);
266	if (r)
267		return r;
268
269	r = clock_gettime(CLOCK_MONOTONIC, &stop);
270	if (r)
271		return errno;
272
273	delta = stop.tv_nsec + stop.tv_sec * 1000000000UL;
274	delta -= start.tv_nsec + start.tv_sec * 1000000000UL;
275
276	fprintf(stdout, "Submitted %u IBs to copy from %u(%lx) to %u(%lx) %lu bytes took %lu usec\n",
277		count, from, virtual[from], to, virtual[to], copied, delta / 1000);
278	return 0;
279}
280
/*
 * Consume the next command line word as an extra operand of the current
 * option: optarg is pointed at argv[optind] and optind is advanced by one.
 *
 * If the command line is exhausted, or the word starts with '-', msg is
 * printed to stderr followed by a newline and the process exits with
 * EXIT_FAILURE.
 */
void next_arg(int argc, char **argv, const char *msg)
{
	const int idx = optind;

	optind = idx + 1;
	optarg = argv[idx];

	/* Short-circuit keeps optarg from being read past the last word. */
	if (idx < argc && optarg[0] != '-')
		return;

	fprintf(stderr, "%s\n", msg);
	exit(EXIT_FAILURE);
}
289
/*
 * Parse the size held in optarg.
 *
 * The number may be decimal, octal (leading 0) or hexadecimal (leading
 * 0x) and may be followed by one of k/K, m/M or g/G, which scale it by
 * 2^10, 2^20 or 2^30.  Anything after that is ignored.
 *
 * Returns the size in bytes.  Prints a message to stderr and exits with
 * EXIT_FAILURE when no number can be parsed, when the number is negative,
 * or when the (scaled) value does not fit into 64 bits.
 */
uint64_t parse_size(void)
{
	const char *text;
	unsigned long long value;
	unsigned int shift = 0;
	char *end;

	/* strtoull() silently negates "-N", so look at the sign ourselves. */
	text = optarg + strspn(optarg, " \t\n\v\f\r");

	errno = 0;
	value = strtoull(text, &end, 0);
	if (*text == '-' || end == text || errno == ERANGE) {
		fprintf(stderr, "Can't parse size arg: %s\n", optarg);
		exit(EXIT_FAILURE);
	}

	switch (*end) {
	case 'k':
	case 'K':
		shift = 10;
		break;
	case 'm':
	case 'M':
		shift = 20;
		break;
	case 'g':
	case 'G':
		shift = 30;
		break;
	default:
		break;
	}

	/* Refuse values that would wrap around once scaled. */
	if (value > (UINT64_MAX >> shift)) {
		fprintf(stderr, "Size arg out of range: %s\n", optarg);
		exit(EXIT_FAILURE);
	}

	return (uint64_t)value << shift;
}
318
319int main(int argc, char **argv)
320{
321	uint32_t major_version, minor_version;
322	uint32_t domain, from, to, count;
323       	uint64_t size;
324	int fd, r, c;
325
326	fd = amdgpu_open_device();
327       	if (fd < 0) {
328		perror("Cannot open AMDGPU device");
329		exit(EXIT_FAILURE);
330	}
331
332	r = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle);
333	if (r) {
334		fprintf(stderr, "amdgpu_device_initialize returned %d\n", r);
335		exit(EXIT_FAILURE);
336	}
337
338	r = amdgpu_cs_ctx_create(device_handle, &context_handle);
339	if (r) {
340		fprintf(stderr, "amdgpu_cs_ctx_create returned %d\n", r);
341		exit(EXIT_FAILURE);
342	}
343
344	if (argc == 1) {
345		fprintf(stderr, usage, argv[0]);
346		exit(EXIT_FAILURE);
347	}
348
349	r = alloc_bo(AMDGPU_GEM_DOMAIN_GTT, 2ULL * 1024 * 1024);
350	if (r) {
351		fprintf(stderr, "Buffer allocation failed with %d\n", r);
352		exit(EXIT_FAILURE);
353	}
354
355	r = amdgpu_bo_cpu_map(resources[0], (void **)&pm4);
356	if (r) {
357		fprintf(stderr, "Buffer mapping failed with %d\n", r);
358		exit(EXIT_FAILURE);
359	}
360
361	opterr = 0;
362	while ((c = getopt(argc, argv, options)) != -1) {
363		switch (c) {
364		case 'b':
365			if (!strcmp(optarg, "v"))
366				domain = AMDGPU_GEM_DOMAIN_VRAM;
367			else if (!strcmp(optarg, "g"))
368				domain = AMDGPU_GEM_DOMAIN_GTT;
369			else if (!strcmp(optarg, "vg"))
370				domain = AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT;
371			else {
372				fprintf(stderr, "Invalid domain: %s\n", optarg);
373				exit(EXIT_FAILURE);
374			}
375			next_arg(argc, argv, "Missing buffer size");
376			size = parse_size();
377			if (size < getpagesize()) {
378				fprintf(stderr, "Buffer size to small %lu\n", size);
379				exit(EXIT_FAILURE);
380			}
381			r = alloc_bo(domain, size);
382			if (r) {
383				fprintf(stderr, "Buffer allocation failed with %d\n", r);
384				exit(EXIT_FAILURE);
385			}
386			break;
387		case 'c':
388			if (sscanf(optarg, "%u", &from) != 1) {
389				fprintf(stderr, "Can't parse from buffer: %s\n", optarg);
390				exit(EXIT_FAILURE);
391			}
392			next_arg(argc, argv, "Missing to buffer");
393			if (sscanf(optarg, "%u", &to) != 1) {
394				fprintf(stderr, "Can't parse to buffer: %s\n", optarg);
395				exit(EXIT_FAILURE);
396			}
397			next_arg(argc, argv, "Missing size");
398			size = parse_size();
399			next_arg(argc, argv, "Missing count");
400			count = parse_size();
401			r = submit_ib(from, to, size, count);
402			if (r) {
403				fprintf(stderr, "IB submission failed with %d\n", r);
404				exit(EXIT_FAILURE);
405			}
406			break;
407		case '?':
408		case 'h':
409			fprintf(stderr, usage, argv[0]);
410			exit(EXIT_SUCCESS);
411		default:
412			fprintf(stderr, usage, argv[0]);
413			exit(EXIT_FAILURE);
414		}
415	}
416
417	return EXIT_SUCCESS;
418}
419