1#include <stdlib.h> 2#include <stdio.h> 3#include <string.h> 4#include <unistd.h> 5#include <sched.h> 6#include <time.h> 7 8#define rdtscll(val) \ 9 __asm__ __volatile__("rdtsc" : "=A" (val)) 10 11#define likely(x) __builtin_expect((x),1) 12#define unlikely(x) __builtin_expect((x),0) 13 14typedef short int s16; 15typedef int s32; 16 17#if 0 18#define CONFIG_SMP 19#endif 20 21#ifdef CONFIG_SMP 22#define LOCK_PREFIX "lock ; " 23#else 24#define LOCK_PREFIX "" 25#endif 26 27struct __xchg_dummy { unsigned long a[100]; }; 28#define __xg(x) ((struct __xchg_dummy *)(x)) 29 30static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 31 unsigned long new, int size) 32{ 33 unsigned long prev; 34 switch (size) { 35 case 1: 36 __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" 37 : "=a"(prev) 38 : "q"(new), "m"(*__xg(ptr)), "0"(old) 39 : "memory"); 40 return prev; 41 case 2: 42 __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" 43 : "=a"(prev) 44 : "q"(new), "m"(*__xg(ptr)), "0"(old) 45 : "memory"); 46 return prev; 47 case 4: 48 __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" 49 : "=a"(prev) 50 : "q"(new), "m"(*__xg(ptr)), "0"(old) 51 : "memory"); 52 return prev; 53 } 54 return old; 55} 56 57#define cmpxchg(ptr,o,n)\ 58 ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ 59 (unsigned long)(n),sizeof(*(ptr)))) 60 61static inline void atomic_add(volatile int *dst, int v) 62{ 63 __asm__ __volatile__( 64 LOCK_PREFIX "addl %1,%0" 65 :"=m" (*dst) 66 :"ir" (v), "m" (*dst)); 67} 68 69static double detect_cpu_clock() 70{ 71 struct timespec tm_begin, tm_end; 72 unsigned long long tsc_begin, tsc_end; 73 74 /* Warm cache */ 75 clock_gettime(CLOCK_MONOTONIC, &tm_begin); 76 77 rdtscll(tsc_begin); 78 clock_gettime(CLOCK_MONOTONIC, &tm_begin); 79 80 usleep(1000000); 81 82 rdtscll(tsc_end); 83 clock_gettime(CLOCK_MONOTONIC, &tm_end); 84 85 return (tsc_end - tsc_begin) / (tm_end.tv_sec - tm_begin.tv_sec + (tm_end.tv_nsec - tm_begin.tv_nsec) / 1e9); 86} 87 88void mix_areas_srv(unsigned int size, 89 const s16 *src, 90 volatile s32 *sum, 91 unsigned int src_step, unsigned int sum_step) 92{ 93 src_step /= sizeof(*src); 94 sum_step /= sizeof(*sum); 95 while (size-- > 0) { 96 atomic_add(sum, *src); 97 src += src_step; 98 sum += sum_step; 99 } 100} 101 102void saturate(unsigned int size, 103 s16 *dst, const s32 *sum, 104 unsigned int dst_step, unsigned int sum_step) 105{ 106 dst_step /= sizeof(*dst); 107 sum_step /= sizeof(*sum); 108 while (size-- > 0) { 109 s32 sample = *sum; 110 if (unlikely(sample < -0x8000)) 111 *dst = -0x8000; 112 else if (unlikely(sample > 0x7fff)) 113 *dst = 0x7fff; 114 else 115 *dst = sample; 116 dst += dst_step; 117 sum += sum_step; 118 } 119} 120 121void mix_areas0(unsigned int size, 122 volatile s16 *dst, s16 *src, 123 volatile s32 *sum, 124 unsigned int dst_step, 125 unsigned int src_step, 126 unsigned int sum_step) 127{ 128 dst_step /= sizeof(*dst); 129 src_step /= sizeof(*src); 130 sum_step /= sizeof(*sum); 131 while (size-- > 0) { 132 s32 sample = *dst + *src; 133 if (unlikely(sample < -0x8000)) 134 *dst = -0x8000; 135 else if (unlikely(sample > 0x7fff)) 136 *dst = 0x7fff; 137 else 138 *dst = sample; 139 dst += dst_step; 140 src += src_step; 141 sum += sum_step; 142 } 143} 144 145#define MIX_AREAS_16 mix_areas1 146#define MIX_AREAS_16_MMX mix_areas1_mmx 147#define MIX_AREAS_32 mix_areas1_32 148#define MIX_AREAS_24 mix_areas1_24 149#define MIX_AREAS_24_CMOV mix_areas1_24_cmov 150#define XADD "addl" 151#define XSUB "subl" 152#include "../src/pcm/pcm_dmix_i386.h" 153static void *ptr_mix_areas1_32 __attribute__((unused)) = &mix_areas1_32; 154static void *ptr_mix_areas1_24 __attribute__((unused)) = &mix_areas1_24; 155static void *ptr_mix_areas1_24_cmov __attribute__((unused)) = &mix_areas1_24_cmov; 156 157void mix_areas2(unsigned int size, 158 volatile s16 *dst, const s16 *src, 159 volatile s32 *sum, 160 unsigned int dst_step, 161 unsigned int src_step) 162{ 163 dst_step /= sizeof(*dst); 164 src_step /= sizeof(*src); 165 while (size-- > 0) { 166 s32 sample = *src; 167 s32 old_sample = *sum; 168 if (cmpxchg(dst, 0, 1) == 0) 169 sample -= old_sample; 170 atomic_add(sum, sample); 171 do { 172 sample = *sum; 173 if (unlikely(sample < -0x8000)) 174 *dst = -0x8000; 175 else if (unlikely(sample > 0x7fff)) 176 *dst = 0x7fff; 177 else 178 *dst = sample; 179 } while (unlikely(sample != *sum)); 180 sum++; 181 dst += dst_step; 182 src += src_step; 183 } 184} 185 186void setscheduler(void) 187{ 188 struct sched_param sched_param; 189 190 if (sched_getparam(0, &sched_param) < 0) { 191 printf("Scheduler getparam failed...\n"); 192 return; 193 } 194 sched_param.sched_priority = sched_get_priority_max(SCHED_RR); 195 if (!sched_setscheduler(0, SCHED_RR, &sched_param)) { 196 printf("Scheduler set to Round Robin with priority %i...\n", sched_param.sched_priority); 197 fflush(stdout); 198 return; 199 } 200 printf("!!!Scheduler set to Round Robin with priority %i FAILED!!!\n", sched_param.sched_priority); 201} 202 203int cache_size = 1024*1024; 204 205void init(s16 *dst, s32 *sum, int size) 206{ 207 int count; 208 char *a; 209 210 for (count = size - 1; count >= 0; count--) 211 *sum++ = 0; 212 for (count = size - 1; count >= 0; count--) 213 *dst++ = 0; 214 a = malloc(cache_size); 215 for (count = cache_size - 1; count >= 0; count--) { 216 a[count] = count & 0xff; 217 a[count] ^= 0x55; 218 a[count] ^= 0xaa; 219 } 220 free(a); 221} 222 223int main(int argc, char **argv) 224{ 225 int size = 2048, n = 4, max = 32267; 226 int LOOP = 100; 227 int i, t; 228 unsigned long long begin, end, diff, diffS, diff0, diff1, diff1_mmx, diff2; 229 double cpu_clock = detect_cpu_clock(); 230 s16 *dst = malloc(sizeof(*dst) * size); 231 s32 *sum = calloc(size, sizeof(*sum)); 232 s16 **srcs = malloc(sizeof(*srcs) * n); 233 234 setscheduler(); 235#ifndef CONFIG_SMP 236 printf("CPU clock: %fMhz (UP)\n\n", cpu_clock / 10e5); 237#else 238 printf("CPU clock: %fMhz (SMP)\n\n", cpu_clock / 10e5); 239#endif 240 if (argc > 3) { 241 size = atoi(argv[1]); 242 n = atoi(argv[2]); 243 max = atoi(argv[3]); 244 } 245 if (argc > 4) 246 cache_size = atoi(argv[4]) * 1024; 247 for (i = 0; i < n; i++) { 248 int k; 249 s16 *s; 250 srcs[i] = s = malloc(sizeof(s16) * size); 251 for (k = 0; k < size; ++k, ++s) { 252 *s = (rand() % (max * 2)) - max; 253 } 254 } 255 256 for (t = 0, diffS = -1; t < LOOP; t++) { 257 init(dst, sum, size); 258 rdtscll(begin); 259 for (i = 0; i < n; i++) { 260 mix_areas_srv(size, srcs[i], sum, 2, 4); 261 } 262 saturate(size, dst, sum, 2, 4); 263 rdtscll(end); 264 diff = end - begin; 265 if (diff < diffS) 266 diffS = diff; 267 printf("mix_areas_srv : %llu \r", diff); fflush(stdout); 268 } 269 270 for (t = 0, diff0 = -1; t < LOOP; t++) { 271 init(dst, sum, size); 272 rdtscll(begin); 273 for (i = 0; i < n; i++) { 274 mix_areas0(size, dst, srcs[i], sum, 2, 2, 4); 275 } 276 rdtscll(end); 277 diff = end - begin; 278 if (diff < diff0) 279 diff0 = diff; 280 printf("mix_areas0 : %llu \r", diff); fflush(stdout); 281 } 282 283 for (t = 0, diff1 = -1; t < LOOP; t++) { 284 init(dst, sum, size); 285 rdtscll(begin); 286 for (i = 0; i < n; i++) { 287 mix_areas1(size, dst, srcs[i], sum, 2, 2, 4); 288 } 289 rdtscll(end); 290 diff = end - begin; 291 if (diff < diff1) 292 diff1 = diff; 293 printf("mix_areas1 : %llu \r", diff); fflush(stdout); 294 } 295 296 for (t = 0, diff1_mmx = -1; t < LOOP; t++) { 297 init(dst, sum, size); 298 rdtscll(begin); 299 for (i = 0; i < n; i++) { 300 mix_areas1_mmx(size, dst, srcs[i], sum, 2, 2, 4); 301 } 302 rdtscll(end); 303 diff = end - begin; 304 if (diff < diff1_mmx) 305 diff1_mmx = diff; 306 printf("mix_areas1_mmx: %llu \r", diff); fflush(stdout); 307 } 308 309 for (t = 0, diff2 = -1; t < LOOP; t++) { 310 init(dst, sum, size); 311 rdtscll(begin); 312 for (i = 0; i < n; i++) { 313 mix_areas2(size, dst, srcs[i], sum, 2, 2); 314 } 315 rdtscll(end); 316 diff = end - begin; 317 if (diff < diff2) 318 diff2 = diff; 319 printf("mix_areas2 : %llu \r", diff); fflush(stdout); 320 } 321 322 printf(" \r"); 323 printf("Summary (the best times):\n"); 324 printf("mix_areas_srv : %8llu %f%%\n", diffS, 100*2*44100.0*diffS/(size*n*cpu_clock)); 325 printf("mix_areas0 : %8llu %f%%\n", diff0, 100*2*44100.0*diff0/(size*n*cpu_clock)); 326 printf("mix_areas1 : %8llu %f%%\n", diff1, 100*2*44100.0*diff1/(size*n*cpu_clock)); 327 printf("mix_areas1_mmx : %8llu %f%%\n", diff1_mmx, 100*2*44100.0*diff1_mmx/(size*n*cpu_clock)); 328 printf("mix_areas2 : %8llu %f%%\n", diff2, 100*2*44100.0*diff2/(size*n*cpu_clock)); 329 330 printf("\n"); 331 printf("areas1/srv ratio : %f\n", (double)diff1 / diffS); 332 printf("areas1_mmx/srv ratio : %f\n", (double)diff1_mmx / diffS); 333 334 return 0; 335} 336