// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) International Business Machines  Corp., 2007, 2008
 *
 * Authors: Darren Hart <dvhltc@us.ibm.com>
 *          Dinakar Guniguntala <dino@in.ibm.com>
 */
/*\
 * [Description]
 *
 * Compare running sequential matrix multiplication routines
 * to running them in parallel to judge multiprocessor
 * performance
 */
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <math.h>
19#include <librttest.h>
20#include <libstats.h>
21
/* Upper bound on CPU ids probed by set_affinity() */
#define MAX_CPUS	8192
/* SCHED_FIFO priority used for the main thread and all workers */
#define PRIO		43
/* Square matrix dimension for each multiplication */
#define MATRIX_SIZE	100
#define DEF_OPS		8	/* the higher the number, the more CPU intensive */
					/* (and therefore SMP performance goes up) */
#define PASS_CRITERIA	0.75	/* Avg concurrent time * pass criteria < avg seq time - */
					/* for every addition of a cpu */
#define ITERATIONS	128
#define HIST_BUCKETS	100

/* Worker state constants — not referenced in the code shown; TODO confirm they are still needed */
#define THREAD_WAIT	1
#define THREAD_WORK	2
#define THREAD_DONE	3

#define THREAD_SLEEP	1 * NS_PER_US

/* Multiplications per recorded iteration (-l option) */
static int ops = DEF_OPS;
/* Number of online CPUs, set in main() */
static int numcpus;
/* Minimum average concurrent-vs-sequential multiplier required to pass */
static float criteria;
/* Thread ids returned by create_fifo_thread(), one slot per CPU */
static int *tids;
/* Next CPU id to claim; advanced under set_affinity()'s internal mutex */
static int online_cpu_id = -1;
/* Total recorded iterations (-i option); rounded up to a multiple of numcpus in main() */
static int iterations = ITERATIONS;
/* iterations / numcpus — each worker records this many samples */
static int iterations_percpu;

/* Timing samples for the sequential and concurrent runs; curdat selects the active one */
stats_container_t sdat, cdat, *curdat;
/* Histogram views of sdat and cdat */
stats_container_t shist, chist;
/* Releases all workers plus the main thread simultaneously */
static pthread_barrier_t mult_start;

/* Per-thread operands A, B and result C so workers never share data */
struct matrices {
	double A[MATRIX_SIZE][MATRIX_SIZE];
	double B[MATRIX_SIZE][MATRIX_SIZE];
	double C[MATRIX_SIZE][MATRIX_SIZE];
};
55
/* Print the generic librttest help followed by this test's option summary. */
static void usage(void)
{
	rt_help();
	printf("matrix_mult specific options:\n");
	printf("  -l#	   #: number of multiplications per iteration (load)\n");
	printf("  -i#	   #: number of iterations\n");
}
64
65static int parse_args(int c, char *v)
66{
67	int handled = 1;
68	switch (c) {
69	case 'i':
70		iterations = atoi(v);
71		break;
72	case 'l':
73		ops = atoi(v);
74		break;
75	case 'h':
76		usage();
77		exit(0);
78	default:
79		handled = 0;
80		break;
81	}
82	return handled;
83}
84
85static void matrix_init(double A[MATRIX_SIZE][MATRIX_SIZE],
86		 double B[MATRIX_SIZE][MATRIX_SIZE])
87{
88	int i, j;
89	for (i = 0; i < MATRIX_SIZE; i++) {
90		for (j = 0; j < MATRIX_SIZE; j++) {
91			A[i][j] = (double)(i * j);
92			B[i][j] = (double)((i * j) % 10);
93		}
94	}
95}
96
/*
 * Perform one full matrix multiplication as CPU load.
 * Inputs are re-initialized on every call so each invocation does
 * identical work; result lands in matrices->C.
 */
static void matrix_mult(struct matrices *matrices)
{
	int i, j, k;

	matrix_init(matrices->A, matrices->B);
	for (i = 0; i < MATRIX_SIZE; i++) {
		/* i_m walks the rows of A in reverse order */
		int i_m = MATRIX_SIZE - i - 1;
		for (j = 0; j < MATRIX_SIZE; j++) {
			/* NOTE(review): sum is seeded with A[i_m][j]*B[j][i] rather
			 * than 0, so C is not a plain product — presumably extra
			 * deliberate load for this benchmark; confirm before "fixing" */
			double sum = matrices->A[i_m][j] *  matrices->B[j][i];
			for (k = 0; k < MATRIX_SIZE; k++)
				sum +=  matrices->A[i_m][k] *  matrices->B[k][j];
			 matrices->C[i][j] = sum;
		}
	}
}
112
113static void matrix_mult_record(struct matrices *matrices, int index)
114{
115	nsec_t start, end, delta;
116	int i;
117
118	start = rt_gettime();
119	for (i = 0; i < ops; i++)
120		matrix_mult(matrices);
121	end = rt_gettime();
122	delta = (long)((end - start) / NS_PER_US);
123	curdat->records[index].x = index;
124	curdat->records[index].y = delta;
125}
126
127static int set_affinity(void)
128{
129	static pthread_mutex_t mutex_cpu = PTHREAD_MUTEX_INITIALIZER;
130	cpu_set_t mask;
131	int cpuid;
132
133	pthread_mutex_lock(&mutex_cpu);
134	do {
135		++online_cpu_id;
136		CPU_ZERO(&mask);
137		CPU_SET(online_cpu_id, &mask);
138
139		if (!sched_setaffinity(0, sizeof(mask), &mask)) {
140			cpuid = online_cpu_id;	/* Save this value before unlocking mutex */
141			pthread_mutex_unlock(&mutex_cpu);
142			return cpuid;
143		}
144	} while (online_cpu_id < MAX_CPUS);
145	pthread_mutex_unlock(&mutex_cpu);
146	return -1;
147}
148
149static void *concurrent_thread(void *thread)
150{
151	struct thread *t = (struct thread *)thread;
152	struct matrices *matrices = (struct matrices *) t->arg;
153	int thread_id = (intptr_t) t->id;
154	int cpuid;
155	int i;
156	int index;
157
158	cpuid = set_affinity();
159	if (cpuid == -1) {
160		fprintf(stderr, "Thread %d: Can't set affinity.\n", thread_id);
161		exit(1);
162	}
163
164	index = iterations_percpu * thread_id;	/* To avoid stats overlapping */
165	pthread_barrier_wait(&mult_start);
166	for (i = 0; i < iterations_percpu; i++)
167		matrix_mult_record(matrices, index++);
168
169	return NULL;
170}
171
172static int main_thread(void)
173{
174	int ret, i, j;
175	nsec_t start, end;
176	long smin = 0, smax = 0, cmin = 0, cmax = 0, delta = 0;
177	float savg, cavg;
178	int cpuid;
179	struct matrices *matrices[numcpus];
180
181	for (i = 0; i < numcpus; ++i)
182		matrices[i] = malloc(sizeof(struct matrices));
183
184	if (stats_container_init(&sdat, iterations) ||
185	    stats_container_init(&shist, HIST_BUCKETS) ||
186	    stats_container_init(&cdat, iterations) ||
187	    stats_container_init(&chist, HIST_BUCKETS)
188	    ) {
189		fprintf(stderr, "Cannot init stats container\n");
190		exit(1);
191	}
192
193	tids = calloc(numcpus, sizeof(int));
194	if (!tids) {
195		perror("malloc");
196		exit(1);
197	}
198
199	cpuid = set_affinity();
200	if (cpuid == -1) {
201		fprintf(stderr, "Main thread: Can't set affinity.\n");
202		exit(1);
203	}
204
205	/* run matrix mult operation sequentially */
206	curdat = &sdat;
207	curdat->index = iterations - 1;
208	printf("\nRunning sequential operations\n");
209	start = rt_gettime();
210	for (i = 0; i < iterations; i++)
211		matrix_mult_record(matrices[0], i);
212	end = rt_gettime();
213	delta = (long)((end - start) / NS_PER_US);
214
215	savg = delta / iterations;	/* don't use the stats record, use the total time recorded */
216	smin = stats_min(&sdat);
217	smax = stats_max(&sdat);
218
219	printf("Min: %ld us\n", smin);
220	printf("Max: %ld us\n", smax);
221	printf("Avg: %.4f us\n", savg);
222	printf("StdDev: %.4f us\n", stats_stddev(&sdat));
223
224	if (stats_hist(&shist, &sdat) ||
225	    stats_container_save("sequential",
226				 "Matrix Multiplication Sequential Execution Runtime Scatter Plot",
227				 "Iteration", "Runtime (us)", &sdat, "points")
228	    || stats_container_save("sequential_hist",
229				    "Matrix Multiplicatoin Sequential Execution Runtime Histogram",
230				    "Runtime (us)", "Samples", &shist, "steps")
231	    ) {
232		fprintf(stderr,
233			"Warning: could not save sequential mults stats\n");
234	}
235
236	pthread_barrier_init(&mult_start, NULL, numcpus + 1);
237	set_priority(PRIO);
238	curdat = &cdat;
239	curdat->index = iterations - 1;
240	online_cpu_id = -1;	/* Redispatch cpus */
241	/* Create numcpus-1 concurrent threads */
242	for (j = 0; j < numcpus; j++) {
243		tids[j] = create_fifo_thread(concurrent_thread, matrices[j], PRIO);
244		if (tids[j] == -1) {
245			printf
246			    ("Thread creation failed (max threads exceeded?)\n");
247			exit(1);
248		}
249	}
250
251	/* run matrix mult operation concurrently */
252	printf("\nRunning concurrent operations\n");
253	pthread_barrier_wait(&mult_start);
254	start = rt_gettime();
255	join_threads();
256	end = rt_gettime();
257
258	delta = (long)((end - start) / NS_PER_US);
259
260	cavg = delta / iterations;	/* don't use the stats record, use the total time recorded */
261	cmin = stats_min(&cdat);
262	cmax = stats_max(&cdat);
263
264	printf("Min: %ld us\n", cmin);
265	printf("Max: %ld us\n", cmax);
266	printf("Avg: %.4f us\n", cavg);
267	printf("StdDev: %.4f us\n", stats_stddev(&cdat));
268
269	if (stats_hist(&chist, &cdat) ||
270	    stats_container_save("concurrent",
271				 "Matrix Multiplication Concurrent Execution Runtime Scatter Plot",
272				 "Iteration", "Runtime (us)", &cdat, "points")
273	    || stats_container_save("concurrent_hist",
274				    "Matrix Multiplication Concurrent Execution Runtime Histogram",
275				    "Iteration", "Runtime (us)", &chist,
276				    "steps")
277	    ) {
278		fprintf(stderr,
279			"Warning: could not save concurrent mults stats\n");
280	}
281
282	printf("\nConcurrent Multipliers:\n");
283	printf("Min: %.4f\n", (float)smin / cmin);
284	printf("Max: %.4f\n", (float)smax / cmax);
285	printf("Avg: %.4f\n", (float)savg / cavg);
286
287	ret = 1;
288	if (savg > (cavg * criteria))
289		ret = 0;
290	printf
291	    ("\nCriteria: %.2f * average concurrent time < average sequential time\n",
292	     criteria);
293	printf("Result: %s\n", ret ? "FAIL" : "PASS");
294
295	for (i = 0; i < numcpus; i++)
296		free(matrices[i]);
297
298	return ret;
299}
300
301int main(int argc, char *argv[])
302{
303	setup();
304	pass_criteria = PASS_CRITERIA;
305	rt_init("l:i:h", parse_args, argc, argv);
306	numcpus = sysconf(_SC_NPROCESSORS_ONLN);
307	/* the minimum avg concurrent multiplier to pass */
308	criteria = pass_criteria * numcpus;
309	int new_iterations, ret;
310
311	if (iterations <= 0) {
312		fprintf(stderr, "iterations must be greater than zero\n");
313		exit(1);
314	}
315
316	printf("\n---------------------------------------\n");
317	printf("Matrix Multiplication (SMP Performance)\n");
318	printf("---------------------------------------\n\n");
319
320	/* Line below rounds up iterations to a multiple of numcpus.
321	 * Without this, having iterations not a mutiple of numcpus causes
322	 * stats to segfault (overflow stats array).
323	 */
324	new_iterations = (int)((iterations + numcpus - 1) / numcpus) * numcpus;
325	if (new_iterations != iterations)
326		printf
327		    ("Rounding up iterations value to nearest multiple of total online CPUs\n");
328
329	iterations = new_iterations;
330	iterations_percpu = iterations / numcpus;
331
332	printf("Running %d iterations\n", iterations);
333	printf("Matrix Dimensions: %dx%d\n", MATRIX_SIZE, MATRIX_SIZE);
334	printf("Calculations per iteration: %d\n", ops);
335	printf("Number of CPUs: %u\n", numcpus);
336
337	set_priority(PRIO);
338	ret = main_thread();
339
340	return ret;
341}
342