/* SwapBytes.c -- Byte Swap conversion filter
2023-04-07 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Compiler.h"
#include "CpuArch.h"
#include "RotateDefs.h"
#include "SwapBytes.h"

typedef UInt16 CSwapUInt16;
typedef UInt32 CSwapUInt32;

// #define k_SwapBytes_Mode_BASE 0

#ifdef MY_CPU_X86_OR_AMD64

#define k_SwapBytes_Mode_SSE2   1
#define k_SwapBytes_Mode_SSSE3  2
#define k_SwapBytes_Mode_AVX2   3

  // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
  #if defined(__clang__) && (__clang_major__ >= 4) \
      || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
    #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_AVX2
    #define SWAP_ATTRIB_SSE2  __attribute__((__target__("sse2")))
    #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3")))
    #define SWAP_ATTRIB_AVX2  __attribute__((__target__("avx2")))
  #elif defined(_MSC_VER)
    #if (_MSC_VER == 1900)
      #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
    #endif
    #if (_MSC_VER >= 1900)
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_AVX2
    #elif (_MSC_VER >= 1500)  // (VS2008)
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_SSSE3
    #elif (_MSC_VER >= 1310)  // (VS2003)
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_SSE2
    #endif
  #endif // _MSC_VER

/*
// for debug
#ifdef k_SwapBytes_Mode_MAX
#undef k_SwapBytes_Mode_MAX
#endif
*/

#ifndef k_SwapBytes_Mode_MAX
#define k_SwapBytes_Mode_MAX 0
#endif

#if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64)
  #define k_SwapBytes_Mode_MIN  k_SwapBytes_Mode_SSE2
#else
  #define k_SwapBytes_Mode_MIN  0
#endif

#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2)
  #define USE_SWAP_AVX2
#endif
#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3)
  #define USE_SWAP_SSSE3
#endif
#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2)
  #define USE_SWAP_128
#endif

#if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128)
#define FORCE_SWAP_MODE
#endif


#ifdef USE_SWAP_128
/*
 <mmintrin.h>  MMX
<xmmintrin.h>  SSE
<emmintrin.h>  SSE2
<pmmintrin.h>  SSE3
<tmmintrin.h>  SSSE3
<smmintrin.h>  SSE4.1
<nmmintrin.h>  SSE4.2
<ammintrin.h>  SSE4A
<wmmintrin.h>  AES
<immintrin.h>  AVX, AVX2, FMA
*/

#include <emmintrin.h> // sse2
// typedef __m128i v128;

#define SWAP2_128(i) { \
  const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \
  *(      __m128i *)(      void *)(items + (i) * 8) = \
      _mm_or_si128( \
          _mm_slli_epi16(v, 8), \
          _mm_srli_epi16(v, 8)); }
// _mm_or_si128() has more ports to execute than _mm_add_epi16().
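
/*
  Scalar reference for SWAP2_128 (for clarity only): each 128-bit load holds
  8 UInt16 values, and OR-ing the 16-bit lanes shifted left and right by 8
  swaps the two bytes of every value. One SWAP2_128(i) block is equivalent to:

    for (unsigned k = 0; k < 8; k++)
    {
      const UInt16 v = items[(i) * 8 + k];
      items[(i) * 8 + k] = (UInt16)((v << 8) | (v >> 8));  // 0x1234 -> 0x3412
    }
*/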

static
#ifdef SWAP_ATTRIB_SSE2
SWAP_ATTRIB_SSE2
#endif
void
Z7_FASTCALL
SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
    SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
  }
  while (items != lim);
}

/*
// sse2
#define SWAP4_128_pack(i) { \
  __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
  __m128i v0 = _mm_unpacklo_epi8(v, mask); \
  __m128i v1 = _mm_unpackhi_epi8(v, mask); \
  v0 = _mm_shufflelo_epi16(v0, 0x1b); \
  v1 = _mm_shufflelo_epi16(v1, 0x1b); \
  v0 = _mm_shufflehi_epi16(v0, 0x1b); \
  v1 = _mm_shufflehi_epi16(v1, 0x1b); \
  *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); }

static
#ifdef SWAP_ATTRIB_SSE2
SWAP_ATTRIB_SSE2
#endif
void
Z7_FASTCALL
SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim)
{
  const __m128i mask = _mm_setzero_si128();
  // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP4_128_pack(0); items += 1 * 4;
    // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4;
  }
  while (items != lim);
}

// sse2
#define SWAP4_128_shift(i) { \
  __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
  __m128i v2; \
  v2 = _mm_or_si128( \
      _mm_slli_si128(_mm_and_si128(v, mask), 1), \
      _mm_and_si128(_mm_srli_si128(v, 1), mask)); \
  v = _mm_or_si128( \
      _mm_slli_epi32(v, 24), \
      _mm_srli_epi32(v, 24)); \
  *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); }

static
#ifdef SWAP_ATTRIB_SSE2
SWAP_ATTRIB_SSE2
#endif
void
Z7_FASTCALL
SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim)
{
  #define M1 0xff00
  const __m128i mask = _mm_set_epi32(M1, M1, M1, M1);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    // SWAP4_128_shift(0)  SWAP4_128_shift(1)  items += 2 * 4;
    // SWAP4_128_shift(0)  SWAP4_128_shift(1)  items += 2 * 4;
    SWAP4_128_shift(0); items += 1 * 4;
  }
  while (items != lim);
}
*/


#if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2)

#define SWAP_SHUF_REV_SEQ_2_VALS(v)  (v)+1, (v)
#define SWAP_SHUF_REV_SEQ_4_VALS(v)  (v)+3, (v)+2, (v)+1, (v)

#define SWAP2_SHUF_MASK_16_BYTES \
    SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (7 * 2)

#define SWAP4_SHUF_MASK_16_BYTES \
    SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \
    SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \
    SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \
    SWAP_SHUF_REV_SEQ_4_VALS (3 * 4)

#if defined(USE_SWAP_AVX2)
/* if we use 256_BIT_INIT_MASK, each static mask array will be 16 bytes larger */
// #define SWAP_USE_256_BIT_INIT_MASK
#endif

#if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2)
#define SWAP_MASK_INIT_SIZE 32
#else
#define SWAP_MASK_INIT_SIZE 16
#endif

MY_ALIGN(SWAP_MASK_INIT_SIZE)
static const Byte k_ShufMask_Swap2[] =
{
  SWAP2_SHUF_MASK_16_BYTES
  #if SWAP_MASK_INIT_SIZE > 16
  , SWAP2_SHUF_MASK_16_BYTES
  #endif
};

MY_ALIGN(SWAP_MASK_INIT_SIZE)
static const Byte k_ShufMask_Swap4[] =
{
  SWAP4_SHUF_MASK_16_BYTES
  #if SWAP_MASK_INIT_SIZE > 16
  , SWAP4_SHUF_MASK_16_BYTES
  #endif
};
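
/*
  For reference (derived from the macros above), the first 16 bytes of the
  shuffle masks expand to:

    k_ShufMask_Swap2 : { 1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14 }
    k_ShufMask_Swap4 : { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }

  pshufb / vpshufb picks dst[j] = src[mask[j]] within each 16-byte block
  (the high bit of a mask byte is never set here, so no byte is zeroed),
  which reverses the bytes of every 2-byte or 4-byte item.
*/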


#ifdef USE_SWAP_SSSE3

#include <tmmintrin.h> // ssse3

#define SHUF_128(i)  *(items + (i)) = \
    _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3

// Z7_NO_INLINE
static
#ifdef SWAP_ATTRIB_SSSE3
SWAP_ATTRIB_SSSE3
#endif
Z7_ATTRIB_NO_VECTORIZE
void
Z7_FASTCALL
ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
{
  __m128i *items = (__m128i *)items8;
  const __m128i *lim = (const __m128i *)lim8;
  // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
  // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
  // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
  // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
  // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
  const __m128i mask = *(const __m128i *)mask128_ptr;
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SHUF_128(0)  SHUF_128(1)  items += 2;
    SHUF_128(0)  SHUF_128(1)  items += 2;
  }
  while (items != lim);
}

#endif // USE_SWAP_SSSE3



#ifdef USE_SWAP_AVX2

#include <immintrin.h> // avx, avx2
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif

#define SHUF_256(i)  *(items + (i)) = \
    _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2

// Z7_NO_INLINE
static
#ifdef SWAP_ATTRIB_AVX2
SWAP_ATTRIB_AVX2
#endif
Z7_ATTRIB_NO_VECTORIZE
void
Z7_FASTCALL
ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
{
  __m256i *items = (__m256i *)items8;
  const __m256i *lim = (const __m256i *)lim8;
  /*
  UNUSED_VAR(mask128_ptr)
  __m256i mask =
  for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
  for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
  */
  const __m256i mask =
    #if SWAP_MASK_INIT_SIZE > 16
      *(const __m256i *)(const void *)mask128_ptr;
    #else
      /* msvc: the broadcastsi128() version reserves stack space for no reason
         msvc 19.29- : the _mm256_insertf128_si256() / _mm256_set_m128i() versions use non-avx movdqu xmm0,XMMWORD PTR [r8]
         msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m, m) with vbroadcastf128(m), as we want
      */
      // _mm256_broadcastsi128_si256(*mask128_ptr);
      /*
      #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
      MY_mm256_set_m128i
      */
      _mm256_set_m128i(
          *(const __m128i *)mask128_ptr,
          *(const __m128i *)mask128_ptr);
    #endif

  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SHUF_256(0)  SHUF_256(1)  items += 2;
    SHUF_256(0)  SHUF_256(1)  items += 2;
  }
  while (items != lim);
}

#endif // USE_SWAP_AVX2
#endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
#endif // USE_SWAP_128



// compile message "NEON intrinsics not available with the soft-float ABI"
#elif defined(MY_CPU_ARM_OR_ARM64) || \
    (defined(__ARM_ARCH) && (__ARM_ARCH >= 7))
// #elif defined(MY_CPU_ARM64)

  #if defined(__clang__) && (__clang_major__ >= 8) \
      || defined(__GNUC__) && (__GNUC__ >= 8)
    #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \
        || defined(MY_CPU_ARM64)
      #define USE_SWAP_128
    #endif
    #ifdef MY_CPU_ARM64
      // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
    #else
      // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
    #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1910)
      #define USE_SWAP_128
    #endif
  #endif

  #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
    #include <arm64_neon.h>
  #else
    #include <arm_neon.h>
  #endif

#ifndef USE_SWAP_128
  #define FORCE_SWAP_MODE
#else

#ifdef MY_CPU_ARM64
  // for debug : comment it
  #define FORCE_SWAP_MODE
#else
  #define k_SwapBytes_Mode_NEON 1
#endif
// typedef uint8x16_t v128;
#define SWAP2_128(i)  *(uint8x16_t *)      (void *)(items + (i) * 8) = \
         vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8));
#define SWAP4_128(i)  *(uint8x16_t *)      (void *)(items + (i) * 4) = \
         vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4));
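
/*
  Note: vrev16q_u8() reverses the bytes inside each 16-bit lane of the 128-bit
  vector, and vrev32q_u8() does the same for each 32-bit lane, so a single
  intrinsic swaps 8 UInt16 or 4 UInt32 items at once (the NEON counterpart
  of the pshufb masks used in the x86 path).
*/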

// Z7_NO_INLINE
static
#ifdef SWAP_ATTRIB_NEON
SWAP_ATTRIB_NEON
#endif
Z7_ATTRIB_NO_VECTORIZE
void
Z7_FASTCALL
SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
    SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
  }
  while (items != lim);
}

// Z7_NO_INLINE
static
#ifdef SWAP_ATTRIB_NEON
SWAP_ATTRIB_NEON
#endif
Z7_ATTRIB_NO_VECTORIZE
void
Z7_FASTCALL
SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP4_128(0)  SWAP4_128(1)  items += 2 * 4;
    SWAP4_128(0)  SWAP4_128(1)  items += 2 * 4;
  }
  while (items != lim);
}

#endif // USE_SWAP_128

#else // MY_CPU_ARM_OR_ARM64
#define FORCE_SWAP_MODE
#endif // MY_CPU_ARM_OR_ARM64




#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86)
  /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah }
     So we use our own byteswap versions */
  #if (_MSC_VER < 1400)  // old MSVC-X86 without _rotr16() support
    #define SWAP2_16(i) { UInt32 v = items[i];  v += (v << 16);  v >>= 8;  items[i] = (CSwapUInt16)v; }
  #else // is new MSVC-X86 with fast _rotr16()
    #include <intrin.h>
    #define SWAP2_16(i) { items[i] = _rotr16(items[i], 8); }
  #endif
#else // is not MSVC-X86
  #define SWAP2_16(i) { CSwapUInt16 v = items[i];  items[i] = Z7_BSWAP16(v); }
#endif // MSVC-X86

#if defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
  #define SWAP4_32(i) { CSwapUInt32 v = items[i];  items[i] = Z7_BSWAP32(v); }
#else
  #define SWAP4_32(i) \
    { UInt32 v = items[i]; \
      v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \
      v = rotlFixed(v, 16); \
      items[i] = v; }
#endif
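
/*
  Byte-level examples for the scalar fallbacks above (illustration only):

    SWAP2_16 (old MSVC-X86 variant), items[i] = 0xAABB:
      v = 0x0000AABB;  v += v << 16;   // v = 0xAABBAABB
      v >>= 8;                         // v = 0x00AABBAA
      items[i] = (CSwapUInt16)v;       // 0xBBAA

    SWAP4_32 (non-BSWAP variant), items[i] = 0x11223344:
      ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff)   // 0x22114433
      rotlFixed(.., 16)                               // 0x44332211
*/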



#if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128)
  #define DEFAULT_Swap2 SwapBytes2_128
  #if !defined(MY_CPU_X86_OR_AMD64)
  #define DEFAULT_Swap4 SwapBytes4_128
  #endif
#endif

#if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)

#define SWAP_BASE_FUNCS_PREFIXES \
Z7_FORCE_INLINE \
static \
Z7_ATTRIB_NO_VECTOR \
void Z7_FASTCALL


#ifdef MY_CPU_64BIT

#if defined(MY_CPU_ARM64) \
    && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \
    && (  (defined(__GNUC__) && (__GNUC__ >= 4)) \
       || (defined(__clang__) && (__clang_major__ >= 4)))

  #define SWAP2_64_VAR(v)  asm ("rev16 %x0,%x0" : "+r" (v));
  #define SWAP4_64_VAR(v)  asm ("rev32 %x0,%x0" : "+r" (v));

#else // is not ARM64-GNU

#if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128)
  #define SWAP2_64_VAR(v) \
    v = ( 0x00ff00ff00ff00ff & (v >> 8)) \
      + ((0x00ff00ff00ff00ff & v) << 8);
    /* '+' gives faster code in MSVC */
#endif

#ifdef Z7_CPU_FAST_BSWAP_SUPPORTED
  #define SWAP4_64_VAR(v) \
    v = Z7_BSWAP64(v); \
    v = Z7_ROTL64(v, 32);
#else
  #define SWAP4_64_VAR(v) \
    v = ( 0x000000ff000000ff & (v >> 24)) \
      + ((0x000000ff000000ff & v) << 24 ) \
      + ( 0x0000ff000000ff00 & (v >>  8)) \
      + ((0x0000ff000000ff00 & v) <<  8 ) \
      ;
#endif

#endif // ARM64-GNU
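
/*
  Example for the 64-bit variants (illustration only), with
  v = 0x1122334455667788 loaded from four UInt16 / two UInt32 items:

    SWAP2_64_VAR : each 16-bit part is byte-swapped in place:
        0x1122334455667788 -> 0x2211443366558877

    SWAP4_64_VAR (BSWAP64 + ROTL64 32) : BSWAP64 reverses all 8 bytes, which
    also exchanges the two 32-bit halves; the rotate by 32 restores the halves,
    so each UInt32 item keeps its position and only its bytes are reversed:
        0x1122334455667788 -> 0x4433221188776655
*/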

#ifdef SWAP2_64_VAR

#define SWAP2_64(i) { \
  UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \
  SWAP2_64_VAR(v) \
  *(UInt64 *)(void *)(items + (i) * 4) = v; }

SWAP_BASE_FUNCS_PREFIXES
SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP2_64(0)  SWAP2_64(1)  items += 2 * 4;
    SWAP2_64(0)  SWAP2_64(1)  items += 2 * 4;
  }
  while (items != lim);
}

  #define DEFAULT_Swap2 SwapBytes2_64
  #if !defined(FORCE_SWAP_MODE)
    #define SWAP2_DEFAULT_MODE 0
  #endif
#else // !defined(SWAP2_64_VAR)
  #define DEFAULT_Swap2 SwapBytes2_128
  #if !defined(FORCE_SWAP_MODE)
    #define SWAP2_DEFAULT_MODE 1
  #endif
#endif // SWAP2_64_VAR


#define SWAP4_64(i) { \
  UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \
  SWAP4_64_VAR(v) \
  *(UInt64 *)(void *)(items + (i) * 2) = v; }

SWAP_BASE_FUNCS_PREFIXES
SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP4_64(0)  SWAP4_64(1)  items += 2 * 2;
    SWAP4_64(0)  SWAP4_64(1)  items += 2 * 2;
  }
  while (items != lim);
}

#define DEFAULT_Swap4 SwapBytes4_64

#else // is not 64BIT


#if defined(MY_CPU_ARM_OR_ARM64) \
    && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \
    && (  (defined(__GNUC__) && (__GNUC__ >= 4)) \
       || (defined(__clang__) && (__clang_major__ >= 4)))

#ifdef MY_CPU_64BIT
  #define SWAP2_32_VAR(v)  asm ("rev16 %w0,%w0" : "+r" (v));
#else
  #define SWAP2_32_VAR(v)  asm ("rev16 %0,%0" : "+r" (v));  // for clang/gcc
  // asm ("rev16 %r0,%r0" : "+r" (a));  // for gcc
#endif

#elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \
  || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \
  || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED)
  // old msvc doesn't support _byteswap_ulong()
  #define SWAP2_32_VAR(v) \
    v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff);

#else // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported
  #define SWAP2_32_VAR(v) \
    v = Z7_BSWAP32(v); \
    v = rotlFixed(v, 16);

#endif // GNU-ARM*

#define SWAP2_32(i) { \
  UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \
  SWAP2_32_VAR(v); \
  *(UInt32 *)(void *)(items + (i) * 2) = v; }


SWAP_BASE_FUNCS_PREFIXES
SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP2_32(0)  SWAP2_32(1)  items += 2 * 2;
    SWAP2_32(0)  SWAP2_32(1)  items += 2 * 2;
  }
  while (items != lim);
}


SWAP_BASE_FUNCS_PREFIXES
SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP4_32(0)  SWAP4_32(1)  items += 2;
    SWAP4_32(0)  SWAP4_32(1)  items += 2;
  }
  while (items != lim);
}

#define DEFAULT_Swap2 SwapBytes2_32
#define DEFAULT_Swap4 SwapBytes4_32
#if !defined(FORCE_SWAP_MODE)
  #define SWAP2_DEFAULT_MODE 0
#endif

#endif // MY_CPU_64BIT
#endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)



#if !defined(FORCE_SWAP_MODE)
static unsigned g_SwapBytes_Mode;
#endif

/* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */
#define SWAP_ITERATION_BLOCK_SIZE_MAX  (1 << 7)

// 32 bytes for AVX or 2 * 16 bytes for NEON.
#define SWAP_VECTOR_ALIGN_SIZE  (1 << 5)
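
/*
  z7_SwapBytes2() / z7_SwapBytes4() below process the data in three phases:
    1) a scalar head loop, until (items) is aligned to SWAP_VECTOR_ALIGN_SIZE (32 bytes);
    2) one call that covers the largest multiple of SWAP_ITERATION_BLOCK_SIZE_MAX
       (128 bytes) via the selected SIMD / 64-bit / 32-bit swap function;
    3) a scalar tail loop for the remaining items.
  For example, k_Align_Mask is 128 / sizeof(CSwapUInt16) - 1 = 63 items for Swap2,
  and 128 / sizeof(CSwapUInt32) - 1 = 31 items for Swap4.
*/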

Z7_NO_INLINE
void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
  {
    SWAP2_16(0)
    items++;
  }
  {
    const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1;
    size_t numItems2 = numItems;
    CSwapUInt16 *lim;
    numItems &= k_Align_Mask;
    numItems2 &= ~(size_t)k_Align_Mask;
    lim = items + numItems2;
    if (numItems2 != 0)
    {
      #if !defined(FORCE_SWAP_MODE)
        #ifdef MY_CPU_X86_OR_AMD64
          #ifdef USE_SWAP_AVX2
          if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
            ShufBytes_256((__m256i *)(void *)items,
                (const __m256i *)(const void *)lim,
                (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
          else
          #endif
          #ifdef USE_SWAP_SSSE3
          if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
            ShufBytes_128((__m128i *)(void *)items,
                (const __m128i *)(const void *)lim,
                (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
          else
          #endif
        #endif // MY_CPU_X86_OR_AMD64
        #if SWAP2_DEFAULT_MODE == 0
          if (g_SwapBytes_Mode != 0)
            SwapBytes2_128(items, lim);
          else
        #endif
      #endif // FORCE_SWAP_MODE
            DEFAULT_Swap2(items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0; numItems--)
  {
    SWAP2_16(0)
    items++;
  }
}


Z7_NO_INLINE
void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
  {
    SWAP4_32(0)
    items++;
  }
  {
    const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1;
    size_t numItems2 = numItems;
    CSwapUInt32 *lim;
    numItems &= k_Align_Mask;
    numItems2 &= ~(size_t)k_Align_Mask;
    lim = items + numItems2;
    if (numItems2 != 0)
    {
      #if !defined(FORCE_SWAP_MODE)
        #ifdef MY_CPU_X86_OR_AMD64
          #ifdef USE_SWAP_AVX2
          if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
            ShufBytes_256((__m256i *)(void *)items,
                (const __m256i *)(const void *)lim,
                (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
          else
          #endif
          #ifdef USE_SWAP_SSSE3
          if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
            ShufBytes_128((__m128i *)(void *)items,
                (const __m128i *)(const void *)lim,
                (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
          else
          #endif
        #else // MY_CPU_X86_OR_AMD64

          if (g_SwapBytes_Mode != 0)
            SwapBytes4_128(items, lim);
          else
        #endif // MY_CPU_X86_OR_AMD64
      #endif // FORCE_SWAP_MODE
            DEFAULT_Swap4(items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0; numItems--)
  {
    SWAP4_32(0)
    items++;
  }
}


// #define SHOW_HW_STATUS

#ifdef SHOW_HW_STATUS
#include <stdio.h>
#define PRF(x) x
#else
#define PRF(x)
#endif

void z7_SwapBytesPrepare(void)
{
#ifndef FORCE_SWAP_MODE
  unsigned mode = 0; // k_SwapBytes_Mode_BASE;

#ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== SwapBytes NEON")
      PRF(printf("\n=== SwapBytes NEON\n");)
      mode = k_SwapBytes_Mode_NEON;
    }
  }
#else // MY_CPU_ARM_OR_ARM64
  {
    #ifdef USE_SWAP_AVX2
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== SwapBytes AVX2")
      PRF(printf("\n=== SwapBytes AVX2\n");)
      mode = k_SwapBytes_Mode_AVX2;
    }
    else
    #endif
    #ifdef USE_SWAP_SSSE3
    if (CPU_IsSupported_SSSE3())
    {
      // #pragma message ("=== SwapBytes SSSE3")
      PRF(printf("\n=== SwapBytes SSSE3\n");)
      mode = k_SwapBytes_Mode_SSSE3;
    }
    else
    #endif
    #if !defined(MY_CPU_AMD64)
    if (CPU_IsSupported_SSE2())
    #endif
    {
      // #pragma message ("=== SwapBytes SSE2")
      PRF(printf("\n=== SwapBytes SSE2\n");)
      mode = k_SwapBytes_Mode_SSE2;
    }
  }
#endif // MY_CPU_ARM_OR_ARM64
  g_SwapBytes_Mode = mode;
  // g_SwapBytes_Mode = 0; // for debug
#endif // FORCE_SWAP_MODE
  PRF(printf("\n=== SwapBytesPrepare\n");)
}

#undef PRF
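
/*
  Typical usage (illustrative sketch only; buf16/num16/buf32/num32 are
  placeholder names; the declarations come from SwapBytes.h):

    z7_SwapBytesPrepare();          // usually called once at startup;
                                    // selects the fastest supported path
    z7_SwapBytes2(buf16, num16);    // byte-swap an array of UInt16 values in place
    z7_SwapBytes4(buf32, num32);    // byte-swap an array of UInt32 values in place

  If Prepare() is not called and runtime dispatch is compiled in, the mode
  stays 0 and the default (non-dispatched) swap path is used.
*/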