1/* 2 * Copyright (c) 2022 Ben Avison 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License along 17 * with FFmpeg; if not, write to the Free Software Foundation, Inc., 18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 19 */ 20 21#include <string.h> 22 23#include "checkasm.h" 24 25#include "libavcodec/vc1dsp.h" 26 27#include "libavutil/common.h" 28#include "libavutil/internal.h" 29#include "libavutil/intreadwrite.h" 30#include "libavutil/mem_internal.h" 31 32#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, 33#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height }, 34 35typedef struct { 36 const char *name; 37 size_t offset; 38 int width; 39 int height; 40} test; 41 42typedef struct matrix { 43 size_t width; 44 size_t height; 45 float d[]; 46} matrix; 47 48static const matrix T8 = { 8, 8, { 49 12, 12, 12, 12, 12, 12, 12, 12, 50 16, 15, 9, 4, -4, -9, -15, -16, 51 16, 6, -6, -16, -16, -6, 6, 16, 52 15, -4, -16, -9, 9, 16, 4, -15, 53 12, -12, -12, 12, 12, -12, -12, 12, 54 9, -16, 4, 15, -15, -4, 16, -9, 55 6, -16, 16, -6, -6, 16, -16, 6, 56 4, -9, 15, -16, 16, -15, 9, -4 57} }; 58 59static const matrix T4 = { 4, 4, { 60 17, 17, 17, 17, 61 22, 10, -10, -22, 62 17, -17, -17, 17, 63 10, -22, 22, -10 64} }; 65 66static const matrix T8t = { 8, 8, { 67 12, 16, 16, 15, 12, 9, 6, 4, 68 12, 15, 6, -4, -12, -16, -16, -9, 69 12, 9, -6, -16, -12, 4, 16, 15, 70 12, 4, -16, -9, 12, 15, -6, -16, 71 12, -4, -16, 9, 12, -15, -6, 16, 72 12, -9, -6, 16, -12, -4, 16, -15, 73 12, -15, 6, 4, -12, 16, -16, 9, 74 12, -16, 16, -15, 12, -9, 6, -4 75} }; 76 77static const matrix T4t = { 4, 4, { 78 17, 22, 17, 10, 79 17, 10, -17, -22, 80 17, -10, -17, 22, 81 17, -22, 17, -10 82} }; 83 84static matrix *new_matrix(size_t width, size_t height) 85{ 86 matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float)); 87 if (out == NULL) { 88 fprintf(stderr, "Memory allocation failure\n"); 89 exit(EXIT_FAILURE); 90 } 91 out->width = width; 92 out->height = height; 93 return out; 94} 95 96static matrix *multiply(const matrix *a, const matrix *b) 97{ 98 matrix *out; 99 if (a->width != b->height) { 100 fprintf(stderr, "Incompatible multiplication\n"); 101 exit(EXIT_FAILURE); 102 } 103 out = new_matrix(b->width, a->height); 104 for (int j = 0; j < out->height; ++j) 105 for (int i = 0; i < out->width; ++i) { 106 float sum = 0; 107 for (int k = 0; k < a->width; ++k) 108 sum += a->d[j * a->width + k] * b->d[k * b->width + i]; 109 out->d[j * out->width + i] = sum; 110 } 111 return out; 112} 113 114static void normalise(matrix *a) 115{ 116 for (int j = 0; j < a->height; ++j) 117 for (int i = 0; i < a->width; ++i) { 118 float *p = a->d + j * a->width + i; 119 *p *= 64; 120 if (a->height == 4) 121 *p /= (const unsigned[]) { 289, 292, 289, 292 } [j]; 122 else 123 *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j]; 124 if (a->width == 4) 125 *p /= (const unsigned[]) { 289, 292, 289, 292 } [i]; 126 else 127 *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i]; 128 } 129} 130 131static void divide_and_round_nearest(matrix *a, float by) 132{ 133 for (int j = 0; j < a->height; ++j) 134 for (int i = 0; i < a->width; ++i) { 135 float *p = a->d + j * a->width + i; 136 *p = rintf(*p / by); 137 } 138} 139 140static void tweak(matrix *a) 141{ 142 for (int j = 4; j < a->height; ++j) 143 for (int i = 0; i < a->width; ++i) { 144 float *p = a->d + j * a->width + i; 145 *p += 1; 146 } 147} 148 149/* The VC-1 spec places restrictions on the values permitted at three 150 * different stages: 151 * - D: the input coefficients in frequency domain 152 * - E: the intermediate coefficients, inverse-transformed only horizontally 153 * - R: the fully inverse-transformed coefficients 154 * 155 * To fully cater for the ranges specified requires various intermediate 156 * values to be held to 17-bit precision; yet these conditions do not appear 157 * to be utilised in real-world streams. At least some assembly 158 * implementations have chosen to restrict these values to 16-bit precision, 159 * to accelerate the decoding of real-world streams at the cost of strict 160 * adherence to the spec. To avoid our test marking these as failures, 161 * reduce our random inputs. 162 */ 163#define ATTENUATION 4 164 165static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height) 166{ 167 matrix *raw, *tmp, *D, *E, *R; 168 raw = new_matrix(width, height); 169 for (int i = 0; i < width * height; ++i) 170 raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION; 171 tmp = multiply(height == 8 ? &T8 : &T4, raw); 172 D = multiply(tmp, width == 8 ? &T8t : &T4t); 173 normalise(D); 174 divide_and_round_nearest(D, 1); 175 for (int i = 0; i < width * height; ++i) { 176 if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) { 177 /* Rare, so simply try again */ 178 av_free(raw); 179 av_free(tmp); 180 av_free(D); 181 return generate_inverse_quantized_transform_coefficients(width, height); 182 } 183 } 184 E = multiply(D, width == 8 ? &T8 : &T4); 185 divide_and_round_nearest(E, 8); 186 for (int i = 0; i < width * height; ++i) 187 if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) { 188 /* Rare, so simply try again */ 189 av_free(raw); 190 av_free(tmp); 191 av_free(D); 192 av_free(E); 193 return generate_inverse_quantized_transform_coefficients(width, height); 194 } 195 R = multiply(height == 8 ? &T8t : &T4t, E); 196 tweak(R); 197 divide_and_round_nearest(R, 128); 198 for (int i = 0; i < width * height; ++i) 199 if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) { 200 /* Rare, so simply try again */ 201 av_free(raw); 202 av_free(tmp); 203 av_free(D); 204 av_free(E); 205 av_free(R); 206 return generate_inverse_quantized_transform_coefficients(width, height); 207 } 208 av_free(raw); 209 av_free(tmp); 210 av_free(E); 211 av_free(R); 212 return D; 213} 214 215#define RANDOMIZE_BUFFER16(name, size) \ 216 do { \ 217 int i; \ 218 for (i = 0; i < size; ++i) { \ 219 uint16_t r = rnd(); \ 220 AV_WN16A(name##0 + i, r); \ 221 AV_WN16A(name##1 + i, r); \ 222 } \ 223 } while (0) 224 225#define RANDOMIZE_BUFFER8(name, size) \ 226 do { \ 227 int i; \ 228 for (i = 0; i < size; ++i) { \ 229 uint8_t r = rnd(); \ 230 name##0[i] = r; \ 231 name##1[i] = r; \ 232 } \ 233 } while (0) 234 235#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \ 236 do { \ 237 uint8_t *p##0 = name##0, *p##1 = name##1; \ 238 int i = (size); \ 239 while (i-- > 0) { \ 240 int x = 0x80 | (rnd() & 0x7F); \ 241 x >>= rnd() % 9; \ 242 if (rnd() & 1) \ 243 x = -x; \ 244 *p##1++ = *p##0++ = 0x80 + x; \ 245 } \ 246 } while (0) 247 248static void check_inv_trans_inplace(void) 249{ 250 /* Inverse transform input coefficients are stored in a 16-bit buffer 251 * with row stride of 8 coefficients irrespective of transform size. 252 * vc1_inv_trans_8x8 differs from the others in two ways: coefficients 253 * are stored in column-major order, and the outputs are written back 254 * to the input buffer, so we oversize it slightly to catch overruns. */ 255 LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]); 256 LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]); 257 258 VC1DSPContext h; 259 260 ff_vc1dsp_init(&h); 261 262 if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) { 263 matrix *coeffs; 264 declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *); 265 RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8); 266 coeffs = generate_inverse_quantized_transform_coefficients(8, 8); 267 for (int j = 0; j < 8; ++j) 268 for (int i = 0; i < 8; ++i) { 269 int idx = 8 + i * 8 + j; 270 inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i]; 271 } 272 call_ref(inv_trans_in0 + 8); 273 call_new(inv_trans_in1 + 8); 274 if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t))) 275 fail(); 276 bench_new(inv_trans_in1 + 8); 277 av_free(coeffs); 278 } 279} 280 281static void check_inv_trans_adding(void) 282{ 283 /* Inverse transform input coefficients are stored in a 16-bit buffer 284 * with row stride of 8 coefficients irrespective of transform size. */ 285 LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]); 286 LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]); 287 288 /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and 289 * added with saturation to an array of unsigned 8-bit values. Oversize 290 * this by 8 samples left and right and one row above and below. */ 291 LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]); 292 LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]); 293 294 VC1DSPContext h; 295 296 const test tests[] = { 297 VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4) 298 VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8) 299 VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4) 300 VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8) 301 VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4) 302 VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8) 303 VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4) 304 }; 305 306 ff_vc1dsp_init(&h); 307 308 for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { 309 void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); 310 if (check_func(func, "vc1dsp.%s", tests[t].name)) { 311 matrix *coeffs; 312 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *); 313 RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8); 314 RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24); 315 coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height); 316 for (int j = 0; j < tests[t].height; ++j) 317 for (int i = 0; i < tests[t].width; ++i) { 318 int idx = j * 8 + i; 319 inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i]; 320 } 321 call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); 322 call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); 323 if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) 324 fail(); 325 bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8); 326 av_free(coeffs); 327 } 328 } 329} 330 331static void check_loop_filter(void) 332{ 333 /* Deblocking filter buffers are big enough to hold a 16x16 block, 334 * plus 16 columns left and 4 rows above to hold filter inputs 335 * (depending on whether v or h neighbouring block edge, oversized 336 * horizontally to maintain 16-byte alignment) plus 16 columns and 337 * 4 rows below to catch write overflows */ 338 LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]); 339 LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]); 340 341 VC1DSPContext h; 342 343 const test tests[] = { 344 VC1DSP_TEST(vc1_v_loop_filter4) 345 VC1DSP_TEST(vc1_h_loop_filter4) 346 VC1DSP_TEST(vc1_v_loop_filter8) 347 VC1DSP_TEST(vc1_h_loop_filter8) 348 VC1DSP_TEST(vc1_v_loop_filter16) 349 VC1DSP_TEST(vc1_h_loop_filter16) 350 }; 351 352 ff_vc1dsp_init(&h); 353 354 for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { 355 void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); 356 declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int); 357 if (check_func(func, "vc1dsp.%s", tests[t].name)) { 358 for (int count = 1000; count > 0; --count) { 359 int pq = rnd() % 31 + 1; 360 RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48); 361 call_ref(filter_buf0 + 4 * 48 + 16, 48, pq); 362 call_new(filter_buf1 + 4 * 48 + 16, 48, pq); 363 if (memcmp(filter_buf0, filter_buf1, 24 * 48)) 364 fail(); 365 } 366 } 367 for (int j = 0; j < 24; ++j) 368 for (int i = 0; i < 48; ++i) 369 filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4); 370 if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name)) 371 bench_new(filter_buf1 + 4 * 48 + 16, 48, 1); 372 if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name)) 373 bench_new(filter_buf1 + 4 * 48 + 16, 48, 31); 374 } 375} 376 377#define TEST_UNESCAPE \ 378 do { \ 379 for (int count = 100; count > 0; --count) { \ 380 escaped_offset = rnd() & 7; \ 381 unescaped_offset = rnd() & 7; \ 382 escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \ 383 RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \ 384 len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \ 385 len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \ 386 if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \ 387 fail(); \ 388 } \ 389 } while (0) 390 391static void check_unescape(void) 392{ 393 /* This appears to be a typical length of buffer in use */ 394#define LOG2_UNESCAPE_BUF_SIZE 17 395#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE) 396 LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]); 397 LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]); 398 LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]); 399 LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]); 400 401 VC1DSPContext h; 402 403 ff_vc1dsp_init(&h); 404 405 if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) { 406 int len0, len1, escaped_offset, unescaped_offset, escaped_len; 407 declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *); 408 409 /* Test data which consists of escapes sequences packed as tightly as possible */ 410 for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x) 411 escaped1[x] = escaped0[x] = 3 * (x % 3 == 0); 412 TEST_UNESCAPE; 413 414 /* Test random data */ 415 RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE); 416 TEST_UNESCAPE; 417 418 /* Test data with escape sequences at random intervals */ 419 for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) { 420 int gap, gap_msb; 421 escaped1[x+0] = escaped0[x+0] = 0; 422 escaped1[x+1] = escaped0[x+1] = 0; 423 escaped1[x+2] = escaped0[x+2] = 3; 424 escaped1[x+3] = escaped0[x+3] = rnd() & 3; 425 gap_msb = 2u << (rnd() % 8); 426 gap = (rnd() &~ -gap_msb) | gap_msb; 427 x += gap; 428 } 429 TEST_UNESCAPE; 430 431 /* Test data which is known to contain no escape sequences */ 432 memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE); 433 memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE); 434 TEST_UNESCAPE; 435 436 /* Benchmark the no-escape-sequences case */ 437 bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1); 438 } 439} 440 441void checkasm_check_vc1dsp(void) 442{ 443 check_inv_trans_inplace(); 444 check_inv_trans_adding(); 445 report("inv_trans"); 446 447 check_loop_filter(); 448 report("loop_filter"); 449 450 check_unescape(); 451 report("unescape_buffer"); 452} 453