/* LzFind.c -- Match finder for LZ algorithms
2023-03-14 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include <string.h>
// #include <stdio.h>

#include "CpuArch.h"
#include "LzFind.h"
#include "LzHash.h"

#define kBlockMoveAlign       (1 << 7)   // alignment for memmove()
#define kBlockSizeAlign       (1 << 16)  // alignment for block allocation
#define kBlockSizeReserveMin  (1 << 24)  // it's 1/256 of a 4 GB dictionary

#define kEmptyHashValue 0

#define kMaxValForNormalize ((UInt32)0)
// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug

// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses

#define GET_AVAIL_BYTES(p) \
  Inline_MatchFinder_GetNumAvailableBytes(p)


// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
#define kFix5HashSize kFix4HashSize

/*
  HASH2_CALC:
    if (hv) values match, then cur[0] and cur[1] also match
*/
#define HASH2_CALC  hv = GetUi16(cur);

// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]

/*
  HASH3_CALC:
    if (cur[0]) and (h2) match, then cur[1]            also matches
    if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
*/
#define HASH3_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }

#define HASH4_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  temp ^= ((UInt32)cur[2] << 8); \
  h3 = temp & (kHash3Size - 1); \
  hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }

#define HASH5_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  temp ^= ((UInt32)cur[2] << 8); \
  h3 = temp & (kHash3Size - 1); \
  temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
  /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
  hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }

#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
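
/* Added note (not part of the original comments): the guarantees stated for
   the HASH*_CALC macros above rely on the one-to-one correspondence of
   (crc[0 ... 255] & 0xFF).  For example, in HASH3_CALC the low 8 bits of
   (h2) are (crc[cur[0]] ^ cur[1]) & 0xFF; with cur[0] fixed, equal (h2)
   values therefore force equal cur[1] bytes, assuming (kHash2Size - 1)
   covers at least those 8 bits (kHash2Size is defined in LzHash.h). */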


static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  // if (!p->directInput)
  {
    ISzAlloc_Free(alloc, p->bufBase);
    p->bufBase = NULL;
  }
}


static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
{
  if (blockSize == 0)
    return 0;
  if (!p->bufBase || p->blockSize != blockSize)
  {
    // size_t blockSizeT;
    LzInWindow_Free(p, alloc);
    p->blockSize = blockSize;
    // blockSizeT = blockSize;

    // printf("\nblockSize = 0x%x\n", blockSize);
    /*
    #if defined _WIN64
    // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
    // we use UInt32 type for (p->blockSize), because
    // we don't want to wrap over 4 GiB,
    // when we use (p->streamPos - p->pos) that is UInt32.
    if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
    {
      blockSizeT = ((size_t)1 << 32);
      printf("\nchanged to blockSizeT = 4GiB\n");
    }
    #endif
    */

    p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
    // printf("\nbufferBase = %p\n", p->bufBase);
    // return 0; // for debug
  }
  return (p->bufBase != NULL);
}

static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }

static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }


Z7_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  if (p->streamEndWasReached || p->result != SZ_OK)
    return;

  /* We use (p->streamPos - p->pos) value.
     (p->streamPos < p->pos) is allowed. */

  if (p->directInput)
  {
    UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
    if (curSize > p->directInputRem)
      curSize = (UInt32)p->directInputRem;
    p->streamPos += curSize;
    p->directInputRem -= curSize;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }

  for (;;)
  {
    const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
    size_t size = (size_t)(p->bufBase + p->blockSize - dest);
    if (size == 0)
    {
      /* we call ReadBlock() after NeedMove() and MoveBlock().
         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
         bytes to the end of (blockSize).
         So we don't execute this branch in normal code flow.
         We can get here only if ReadBlock() is called before NeedMove() / MoveBlock().
      */
      // p->result = SZ_ERROR_FAIL; // we can show error here
      return;
    }

    // #define kRead 3
    // if (size > kRead) size = kRead; // for debug

    /*
    // we need cast (Byte *)dest.
    #ifdef __clang__
      #pragma GCC diagnostic ignored "-Wcast-qual"
    #endif
    */
    p->result = ISeqInStream_Read(p->stream,
        p->bufBase + (dest - p->bufBase), &size);
    if (p->result != SZ_OK)
      return;
    if (size == 0)
    {
      p->streamEndWasReached = 1;
      return;
    }
    p->streamPos += (UInt32)size;
    if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
      return;
    /* here and in the other (p->keepSizeAfter) checks we keep 1 byte more than
       was requested by the Create() function:
         (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) is the minimal required size */
  }

  // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
}



Z7_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
  const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
  p->buffer = p->bufBase + keepBefore;
  memmove(p->bufBase,
      p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
      keepBefore + (size_t)GET_AVAIL_BYTES(p));
}

/* We call MoveBlock() before ReadBlock().
   So MoveBlock() can be a wasteful operation, if the whole input data
   could fit in the current block even without calling MoveBlock().
   In the important case where (dataSize <= historySize), the condition
   (p->blockSize > dataSize + p->keepSizeAfter) is met,
   so there is no MoveBlock() in that case.
*/
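
#if 0
/* Added illustrative sketch (not part of the original code): the alignment
   arithmetic of MatchFinder_MoveBlock() with sample numbers,
   kBlockMoveAlign == 128. */
static void MoveBlock_ArithmeticDemo(void)
{
  const size_t bufferPos = 5000;      /* p->buffer - p->bufBase */
  const size_t keepSizeBefore = 900;  /* dictionary bytes that must survive */
  const size_t offset = bufferPos - keepSizeBefore;  /* 4100 */
  /* the memmove() source is rounded down to (offset & ~127) == 4096, so the
     source pointer stays 128-byte aligned; the 4 extra bytes are kept: */
  const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + keepSizeBefore;
  /* keepBefore == 4 + 900 == 904 : the new (p->buffer - p->bufBase) */
  (void)keepBefore;
}
#endif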

int MatchFinder_NeedMove(CMatchFinder *p)
{
  if (p->directInput)
    return 0;
  if (p->streamEndWasReached || p->result != SZ_OK)
    return 0;
  return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
}

void MatchFinder_ReadIfRequired(CMatchFinder *p)
{
  if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
    MatchFinder_ReadBlock(p);
}



static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
{
  p->cutValue = 32;
  p->btMode = 1;
  p->numHashBytes = 4;
  p->numHashBytes_Min = 2;
  p->numHashOutBits = 0;
  p->bigHash = 0;
}

#define kCrcPoly 0xEDB88320

void MatchFinder_Construct(CMatchFinder *p)
{
  unsigned i;
  p->buffer = NULL;
  p->bufBase = NULL;
  p->directInput = 0;
  p->stream = NULL;
  p->hash = NULL;
  p->expectedDataSize = (UInt64)(Int64)-1;
  MatchFinder_SetDefaultSettings(p);

  for (i = 0; i < 256; i++)
  {
    UInt32 r = (UInt32)i;
    unsigned j;
    for (j = 0; j < 8; j++)
      r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
    p->crc[i] = r;
  }
}

#undef kCrcPoly

static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
{
  ISzAlloc_Free(alloc, p->hash);
  p->hash = NULL;
}

void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  MatchFinder_FreeThisClassMemory(p, alloc);
  LzInWindow_Free(p, alloc);
}

static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
{
  const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
  if (sizeInBytes / sizeof(CLzRef) != num)
    return NULL;
  return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
}

#if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
  #error Stop_Compiling_Bad_Reserve
#endif



static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
{
  UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
  /*
  if (historySize > kMaxHistorySize)
    return 0;
  */
  // printf("\nhistorySize == 0x%x\n", historySize);

  if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore)  // if 32-bit overflow
    return 0;

  {
    const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
    const UInt32 rem = kBlockSizeMax - blockSize;
    const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
        + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
    if (blockSize >= kBlockSizeMax
        || rem < kBlockSizeReserveMin)  // we reject settings that will be slow
      return 0;
    if (reserve >= rem)
      blockSize = kBlockSizeMax;
    else
    {
      blockSize += reserve;
      blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
    }
  }
  // printf("\n LzFind_blockSize = %x\n", blockSize);
  // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
  return blockSize;
}
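
/* Added note (not part of the original comments): the (hs |= hs >> k)
   cascade in the two functions below rounds the size up to a value of the
   form (2^n - 1), so the hash mask is always one less than a power of two.
   For example, hs == 0x500000 becomes 0x4FFFFF after (hs--), and the
   cascade smears the top bit downwards to give 0x7FFFFF. */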

// input is historySize
static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1;
  if (hs != 0)
    hs--;
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1;
    /* in (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= (1 << 24) - 1) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size to the recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}

// input is historySize
static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1;
  if (hs != 0)
    hs--;
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  hs >>= 1;
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1;
    else
      hs >>= 1;
    /* in (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= (1 << 24) - 1) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size to the recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
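
/* Added note (not part of the original comments): MatchFinder_GetHashMask()
   is the default sizing and shifts the rounded value right, so the table
   gets roughly (historySize / 2) slots (or /4 for large sizes), while
   MatchFinder_GetHashMask2() keeps the full rounded size and serves as the
   upper bound when the user sets (numHashOutBits).  Both enforce the same
   16-bit minimum and the bt5/hc5 minimum. */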

int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
    UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
    ISzAllocPtr alloc)
{
  /* we need one additional byte in (p->keepSizeBefore),
     since we use MoveBlock() after (p->pos++) and before using the dictionary */
  // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;

  keepAddBufferAfter += matchMaxLen;
  /* we need (p->keepSizeAfter >= p->numHashBytes) */
  if (keepAddBufferAfter < p->numHashBytes)
    keepAddBufferAfter = p->numHashBytes;
  // keepAddBufferAfter -= 2; // for debug
  p->keepSizeAfter = keepAddBufferAfter;

  if (p->directInput)
    p->blockSize = 0;
  if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
  {
    size_t hashSizeSum;
    {
      UInt32 hs;
      UInt32 hsCur;

      if (p->numHashOutBits != 0)
      {
        unsigned numBits = p->numHashOutBits;
        const unsigned nbMax =
            (p->numHashBytes == 2 ? 16 :
            (p->numHashBytes == 3 ? 24 : 32));
        if (numBits > nbMax)
          numBits = nbMax;
        if (numBits >= 32)
          hs = (UInt32)0 - 1;
        else
          hs = ((UInt32)1 << numBits) - 1;
        // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
        hs |= (1 << 16) - 1; /* don't change it! */
        if (p->numHashBytes >= 5)
          hs |= (256 << kLzHash_CrcShift_2) - 1;
        {
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
          if (hs > hs2)
            hs = hs2;
        }
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs2)
            hsCur = hs2;
        }
      }
      else
      {
        hs = MatchFinder_GetHashMask(p, historySize);
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs) // is it possible?
            hsCur = hs;
        }
      }

      p->hashMask = hsCur;

      hashSizeSum = hs;
      hashSizeSum++;
      if (hashSizeSum < hs)
        return 0;
      {
        UInt32 fixedHashSize = 0;
        if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
        if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
        // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
        hashSizeSum += fixedHashSize;
        p->fixedHashSize = fixedHashSize;
      }
    }

    p->matchMaxLen = matchMaxLen;

    {
      size_t newSize;
      size_t numSons;
      const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
      p->historySize = historySize;
      p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)

      numSons = newCyclicBufferSize;
      if (p->btMode)
        numSons <<= 1;
      newSize = hashSizeSum + numSons;

      if (numSons < newCyclicBufferSize || newSize < numSons)
        return 0;

      // an aligned size is not required here, but it can be better for some loops
      #define NUM_REFS_ALIGN_MASK 0xF
      newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;

      // 22.02: we don't reallocate the buffer, if the old size is enough
      if (p->hash && p->numRefs >= newSize)
        return 1;

      MatchFinder_FreeThisClassMemory(p, alloc);
      p->numRefs = newSize;
      p->hash = AllocRefs(newSize, alloc);

      if (p->hash)
      {
        p->son = p->hash + hashSizeSum;
        return 1;
      }
    }
  }

  MatchFinder_Free(p, alloc);
  return 0;
}
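
/* Added note (not part of the original comments): MatchFinder_SetLimits()
   computes how many positions can be processed before the next
   MatchFinder_CheckLimits() call, as the minimum of three distances:
   to the normalization point (kMaxValForNormalize), to the cyclic-buffer
   wrap, and to the point where the available bytes drop to keepSizeAfter
   (the buffer-refill state).  It also sets lenLimit, the maximum match
   length that can be verified with the bytes currently in the buffer. */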

static void MatchFinder_SetLimits(CMatchFinder *p)
{
  UInt32 k;
  UInt32 n = kMaxValForNormalize - p->pos;
  if (n == 0)
    n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)

  k = p->cyclicBufferSize - p->cyclicBufferPos;
  if (k < n)
    n = k;

  k = GET_AVAIL_BYTES(p);
  {
    const UInt32 ksa = p->keepSizeAfter;
    UInt32 mm = p->matchMaxLen;
    if (k > ksa)
      k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
    else if (k >= mm)
    {
      // the limitation for the (p->lenLimit) update
      k -= mm;   // optimization : to reduce the number of checks
      k++;
      // k = 1; // non-optimized version : for debug
    }
    else
    {
      mm = k;
      if (k != 0)
        k = 1;
    }
    p->lenLimit = mm;
  }
  if (k < n)
    n = k;

  p->posLimit = p->pos + n;
}


void MatchFinder_Init_LowHash(CMatchFinder *p)
{
  size_t i;
  CLzRef *items = p->hash;
  const size_t numItems = p->fixedHashSize;
  for (i = 0; i < numItems; i++)
    items[i] = kEmptyHashValue;
}


void MatchFinder_Init_HighHash(CMatchFinder *p)
{
  size_t i;
  CLzRef *items = p->hash + p->fixedHashSize;
  const size_t numItems = (size_t)p->hashMask + 1;
  for (i = 0; i < numItems; i++)
    items[i] = kEmptyHashValue;
}


void MatchFinder_Init_4(CMatchFinder *p)
{
  if (!p->directInput)
    p->buffer = p->bufBase;
  {
    /* kEmptyHashValue = 0 (Zero) is used in hash tables as the NO-VALUE marker.
       the code in CMatchFinderMt expects (pos = 1) */
    p->pos =
    p->streamPos =
        1; // it's the smallest optimal value; do not change it
        // 0; // for debug
  }
  p->result = SZ_OK;
  p->streamEndWasReached = 0;
}


// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
#define CYC_TO_POS_OFFSET 0
// #define CYC_TO_POS_OFFSET 1 // for debug

void MatchFinder_Init(CMatchFinder *p)
{
  MatchFinder_Init_HighHash(p);
  MatchFinder_Init_LowHash(p);
  MatchFinder_Init_4(p);
  // if (readData)
  MatchFinder_ReadBlock(p);

  /* if we init (cyclicBufferPos = pos), then we can use one variable
     instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
  p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
  // p->cyclicBufferPos = 0; // smallest value
  // p->son[0] = p->son[1] = 0; // unused: we can init the skipped record for speculated accesses.
  MatchFinder_SetLimits(p);
}



#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__clang__) && (__clang_major__ >= 4) \
      || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
      // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)

    #define USE_LZFIND_SATUR_SUB_128
    #define USE_LZFIND_SATUR_SUB_256
    #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
    #define LZFIND_ATTRIB_AVX2  __attribute__((__target__("avx2")))
  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1600)
      #define USE_LZFIND_SATUR_SUB_128
    #endif
    #if (_MSC_VER >= 1900)
      #define USE_LZFIND_SATUR_SUB_256
    #endif
  #endif

// #elif defined(MY_CPU_ARM_OR_ARM64)
#elif defined(MY_CPU_ARM64)

  #if defined(__clang__) && (__clang_major__ >= 8) \
      || defined(__GNUC__) && (__GNUC__ >= 8)
    #define USE_LZFIND_SATUR_SUB_128
    #ifdef MY_CPU_ARM64
      // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
    #else
      // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
    #endif

  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1910)
      #define USE_LZFIND_SATUR_SUB_128
    #endif
  #endif

  #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
    #include <arm64_neon.h>
  #else
    #include <arm_neon.h>
  #endif

#endif


#ifdef USE_LZFIND_SATUR_SUB_128

// #define Z7_SHOW_HW_STATUS

#ifdef Z7_SHOW_HW_STATUS
#include <stdio.h>
#define PRF(x) x
PRF(;)
#else
#define PRF(x)
#endif


#ifdef MY_CPU_ARM_OR_ARM64

#ifdef MY_CPU_ARM64
// #define FORCE_LZFIND_SATUR_SUB_128
#endif
typedef uint32x4_t LzFind_v128;
#define SASUB_128_V(v, s) \
  vsubq_u32(vmaxq_u32(v, s), s)

#else // MY_CPU_ARM_OR_ARM64

#include <smmintrin.h> // sse4.1

typedef __m128i LzFind_v128;
// SSE 4.1
#define SASUB_128_V(v, s) \
  _mm_sub_epi32(_mm_max_epu32(v, s), s)

#endif // MY_CPU_ARM_OR_ARM64


#define SASUB_128(i) \
  *(      LzFind_v128 *)(      void *)(items + (i) * 4) = SASUB_128_V( \
  *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
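
#if 0
/* Added illustrative sketch (not part of the original code): scalar model of
   the vector saturating subtract used below.  max(v, s) - s is
   (v >= s ? v - s : 0), so stale references collapse to kEmptyHashValue (0)
   instead of wrapping around; this mirrors the scalar SASUB_32 macro. */
static UInt32 LzFind_SaturSub_Model(UInt32 v, UInt32 subValue)
{
  return (v >= subValue) ? (v - subValue) : 0;
}
#endif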

Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_SSE41
LZFIND_ATTRIB_SSE41
#endif
void
Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  const LzFind_v128 sub2 =
    #ifdef MY_CPU_ARM_OR_ARM64
      vdupq_n_u32(subValue);
    #else
      _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
    #endif
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
    SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
  }
  while (items != lim);
}



#ifdef USE_LZFIND_SATUR_SUB_256

#include <immintrin.h> // avx
/*
clang: immintrin.h uses
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVX2__)
#include <avx2intrin.h>
#endif
so we need <avxintrin.h> for clang-cl */

#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif

// AVX2:
#define SASUB_256(i) \
  *(      __m256i *)(      void *)(items + (i) * 8) = \
  _mm256_sub_epi32(_mm256_max_epu32( \
  *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);

Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_AVX2
LZFIND_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  const __m256i sub2 = _mm256_set_epi32(
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
    SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
  }
  while (items != lim);
}
#endif // USE_LZFIND_SATUR_SUB_256

#ifndef FORCE_LZFIND_SATUR_SUB_128
typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
    UInt32 subValue, CLzRef *items, const CLzRef *lim);
static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
#endif // FORCE_LZFIND_SATUR_SUB_128

#endif // USE_LZFIND_SATUR_SUB_128


// kEmptyHashValue must be zero
// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; }
#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; }

#ifdef FORCE_LZFIND_SATUR_SUB_128

#define DEFAULT_SaturSub LzFind_SaturSub_128

#else

#define DEFAULT_SaturSub LzFind_SaturSub_32

Z7_NO_INLINE
static
void
Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
  }
  while (items != lim);
}

#endif


Z7_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
{
  #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
  {
    const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
    CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
    numItems &= k_Align_Mask;
    if (items != lim)
    {
      #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
        if (g_LzFind_SaturSub)
          g_LzFind_SaturSub(subValue, items, lim);
        else
      #endif
          DEFAULT_SaturSub(subValue, items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
}
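
/* Added note (not part of the original comments): normalization keeps the
   32-bit position values from wrapping.  When (p->pos) reaches
   kMaxValForNormalize, subValue = (pos - historySize - 1) is subtracted
   from the window positions (via MatchFinder_REDUCE_OFFSETS from LzFind.h)
   and, with saturation, from every hash/son entry, so the current position
   becomes (historySize + 1) and stale entries collapse to kEmptyHashValue. */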

// call MatchFinder_CheckLimits() only after the (p->pos++) update

Z7_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  if (// !p->streamEndWasReached && p->result == SZ_OK &&
      p->keepSizeAfter == GET_AVAIL_BYTES(p))
  {
    // we try to read only in the exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
    if (MatchFinder_NeedMove(p))
      MatchFinder_MoveBlock(p);
    MatchFinder_ReadBlock(p);
  }

  if (p->pos == kMaxValForNormalize)
  if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for the last bytes of data.
    /*
       if we disable normalization for the last bytes of data, and
       if (data_size == 4 GiB), we don't call a wasteful normalization,
       but (pos) will be wrapped over Zero (0) in that case,
       and we cannot later resume normal operation
    */
  {
    // MatchFinder_Normalize(p);
    /* after normalization we need (p->pos >= p->historySize + 1); */
    /* we can reduce subValue to an aligned value, if we want to keep the alignment
       of (p->pos) and (p->buffer) for speculated accesses. */
    const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
    // const UInt32 subValue = (1 << 15); // for debug
    // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
    MatchFinder_REDUCE_OFFSETS(p, subValue)
    MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
    {
      size_t numSonRefs = p->cyclicBufferSize;
      if (p->btMode)
        numSonRefs <<= 1;
      MatchFinder_Normalize3(subValue, p->son, numSonRefs);
    }
  }

  if (p->cyclicBufferPos == p->cyclicBufferSize)
    p->cyclicBufferPos = 0;

  MatchFinder_SetLimits(p);
}
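
/* Added note (not part of the original comments): Hc_GetMatchesSpec() below
   walks a hash chain: son[] holds one link per position, and each link
   points to the previous position with the same hash value.  The walk
   visits at most cutValue links, and each improvement is appended to (d)
   as a (length, distance - 1) pair with strictly increasing lengths. */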

/*
  (lenLimit > maxLen)
*/
Z7_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, unsigned maxLen)
{
  /*
  son[_cyclicBufferPos] = curMatch;
  for (;;)
  {
    UInt32 delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
      return d;
    {
      const Byte *pb = cur - delta;
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        UInt32 len = 0;
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        if (maxLen < len)
        {
          maxLen = len;
          *d++ = len;
          *d++ = delta - 1;
          if (len == lenLimit)
            return d;
        }
      }
    }
  }
  */

  const Byte *lim = cur + lenLimit;
  son[_cyclicBufferPos] = curMatch;

  do
  {
    UInt32 delta;

    if (curMatch == 0)
      break;
    // if (curMatch2 >= curMatch) return NULL;
    delta = pos - curMatch;
    if (delta >= _cyclicBufferSize)
      break;
    {
      ptrdiff_t diff;
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
      if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
      {
        const Byte *c = cur;
        while (*c == c[diff])
        {
          if (++c == lim)
          {
            d[0] = (UInt32)(lim - cur);
            d[1] = delta - 1;
            return d + 2;
          }
        }
        {
          const unsigned len = (unsigned)(c - cur);
          if (maxLen < len)
          {
            maxLen = len;
            d[0] = (UInt32)len;
            d[1] = delta - 1;
            d += 2;
          }
        }
      }
    }
  }
  while (--cutValue);

  return d;
}


Z7_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, UInt32 maxLen)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;

  UInt32 cmCheck;

  // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }

  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);
      const UInt32 pair0 = pair[0];
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          maxLen = (UInt32)len;
          *d++ = (UInt32)len;
          *d++ = delta - 1;
          if (len == lenLimit)
          {
            *ptr1 = pair0;
            *ptr0 = pair[1];
            return d;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        // const UInt32 curMatch2 = pair[1];
        // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
        // curMatch = curMatch2;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while (--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue;
  return d;
}
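
/* Added note (not part of the original comments): GetMatchesSpec1() above
   searches a binary tree: son[] holds two links per position (ptr0/ptr1),
   ordered by the byte strings they reference.  len0/len1 remember the
   common-prefix lengths already verified against the two subtree bounds,
   so each comparison can start at min(len0, len1) instead of zero.
   SkipMatchesSpec() below performs the same descent, updating the tree
   without collecting matches. */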

static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;

  UInt32 cmCheck;

  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (// curMatch >= pos ||  // failure
      cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while (--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue;
  return;
}


#define MOVE_POS \
  ++p->cyclicBufferPos; \
  p->buffer++; \
  { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }

#define MOVE_POS_RET MOVE_POS return distances;

Z7_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
{
  /* we go here at the end of stream data, when (avail < num_hash_bytes).
     We don't update sons[cyclicBufferPos << btMode],
     so the (sons) record will contain junk, and we cannot resume match
     searching in normal operation, even if more input data is provided later.
       p->sons[p->cyclicBufferPos << p->btMode] = 0;  // kEmptyHashValue
       if (p->btMode)
          p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0;  // kEmptyHashValue
  */
  MOVE_POS
}

#define GET_MATCHES_HEADER2(minLen, ret_op) \
  unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \
  lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
  cur = p->buffer;

#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
#define SKIP_HEADER(minLen)  do { GET_MATCHES_HEADER2(minLen, continue)

#define MF_PARAMS(p)  lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

#define SKIP_FOOTER  SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num);

#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
  distances = func(MF_PARAMS(p), \
      distances, (UInt32)_maxLen_); MOVE_POS_RET

#define GET_MATCHES_FOOTER_BT(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)

#define GET_MATCHES_FOOTER_HC(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)



#define UPDATE_maxLen { \
    const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
    const Byte *c = cur + maxLen; \
    const Byte *lim = cur + lenLimit; \
    for (; c != lim; c++) if (*(c + diff) != *c) break; \
    maxLen = (unsigned)(c - cur); }

static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(2)
  HASH2_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(1)
}

UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(2)
}


#define SET_mmm \
  mmm = p->cyclicBufferSize; \
  if (pos < mmm) \
    mmm = pos;
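
/* Added note (not part of the original comments): SET_mmm bounds the valid
   match distances: a stored position is usable only if its distance is less
   than both cyclicBufferSize (older son[] entries have been overwritten)
   and pos (there is no data before the start of the window). */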

static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, d2, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(3)

  HASH3_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash[h2];

  curMatch = (hash + kFix3HashSize)[hv];

  hash[h2] = pos;
  (hash + kFix3HashSize)[hv] = pos;

  SET_mmm

  maxLen = 2;

  if (d2 < mmm && *(cur - d2) == *cur)
  {
    UPDATE_maxLen
    distances[0] = (UInt32)maxLen;
    distances[1] = d2 - 1;
    distances += 2;
    if (maxLen == lenLimit)
    {
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}


static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}


static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
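
/* Added note (not part of the original comments): the Bt4/Bt5 functions
   above and the Hc4/Hc5 functions below share one pattern: before the main
   tree/chain search they probe the small fixed 2-byte and 3-byte hash
   tables (d2, d3) for short matches at the smallest distances, emit them
   as (2, d2 - 1) / (3, d3 - 1) pairs, and extend the best one with
   UPDATE_maxLen so the main search can start from a longer maxLen. */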

static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}


static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = maxLen;
    if (maxLen == lenLimit)
    {
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}


UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_HC(2)
}
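
/* Added note (not part of the original comments): the *_Skip() functions
   below insert the current positions into the hash tables and tree/chain
   exactly like the *_GetMatches() functions do, but without collecting
   matches.  An encoder typically calls Skip for the positions covered by a
   match it has already decided to emit. */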

static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(2)
  {
    HASH2_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}

void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    HASH_ZIP_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}

static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    UInt32 h2;
    UInt32 *hash;
    HASH3_CALC
    hash = p->hash;
    curMatch = (hash + kFix3HashSize)[hv];
    hash[h2] =
    (hash + kFix3HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}

static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(4)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH4_CALC
    hash = p->hash;
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}

static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(5)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH5_CALC
    hash = p->hash;
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}


#define HC_SKIP_HEADER(minLen) \
    do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
    const Byte *cur; \
    UInt32 *hash; \
    UInt32 *son; \
    UInt32 pos = p->pos; \
    UInt32 num2 = num; \
    /* (p->pos == p->posLimit) is not allowed here !!! */ \
    { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
    num -= num2; \
    { const UInt32 cycPos = p->cyclicBufferPos; \
      son = p->son + cycPos; \
      p->cyclicBufferPos = cycPos + num2; } \
    cur = p->buffer; \
    hash = p->hash; \
    do { \
      UInt32 curMatch; \
      UInt32 hv;


#define HC_SKIP_FOOTER \
      cur++; pos++; *son++ = curMatch; \
    } while (--num2); \
    p->buffer = cur; \
    p->pos = pos; \
    if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
    }} while(num); \


static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(4)

    UInt32 h2, h3;
    HASH4_CALC
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}


static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(5)

    UInt32 h2, h3;
    HASH5_CALC
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}


void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(3)

    HASH_ZIP_CALC
    curMatch = hash[hv];
    hash[hv] = pos;

  HC_SKIP_FOOTER
}
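
/* Added note (not part of the original comments): MatchFinder_CreateVTable()
   below maps the settings to the implementations: (btMode == 0) selects the
   hash-chain finders (hc4/hc5 by numHashBytes), and (btMode != 0) selects
   the binary-tree finders bt2/bt3/bt4/bt5, matching 7-Zip's match-finder
   option names. */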

void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
{
  vTable->Init = (Mf_Init_Func)MatchFinder_Init;
  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;
  if (!p->btMode)
  {
    if (p->numHashBytes <= 4)
    {
      vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
    }
    else
    {
      vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip;
    }
  }
  else if (p->numHashBytes == 2)
  {
    vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
    vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
  }
  else if (p->numHashBytes == 3)
  {
    vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
    vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
  }
  else if (p->numHashBytes == 4)
  {
    vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
    vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
  }
  else
  {
    vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches;
    vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip;
  }
}



void LzFindPrepare(void)
{
  #ifndef FORCE_LZFIND_SATUR_SUB_128
  #ifdef USE_LZFIND_SATUR_SUB_128
  LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
  #ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== LzFind NEON")
      PRF(printf("\n=== LzFind NEON\n"));
      f = LzFind_SaturSub_128;
    }
    // f = 0; // for debug
  }
  #else // MY_CPU_ARM_OR_ARM64
  if (CPU_IsSupported_SSE41())
  {
    // #pragma message ("=== LzFind SSE41")
    PRF(printf("\n=== LzFind SSE41\n"));
    f = LzFind_SaturSub_128;

    #ifdef USE_LZFIND_SATUR_SUB_256
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== LzFind AVX2")
      PRF(printf("\n=== LzFind AVX2\n"));
      f = LzFind_SaturSub_256;
    }
    #endif
  }
  #endif // MY_CPU_ARM_OR_ARM64
  g_LzFind_SaturSub = f;
  #endif // USE_LZFIND_SATUR_SUB_128
  #endif // FORCE_LZFIND_SATUR_SUB_128
}


#undef MOVE_POS
#undef MOVE_POS_RET
#undef PRF
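
#if 0
/* Added usage sketch (not part of the original code): a minimal outline of
   how an encoder can drive the match finder through IMatchFinder2 after
   MatchFinder_Create() and MatchFinder_CreateVTable().  The buffer size and
   the skip policy here are illustrative assumptions, not the real LZMA
   encoder logic. */
static void LzFind_UsageSketch(CMatchFinder *mf, IMatchFinder2 *vt)
{
  UInt32 matches[256];  /* receives (length, distance - 1) pairs */
  vt->Init(mf);
  while (vt->GetNumAvailableBytes(mf) != 0)
  {
    /* GetMatches() advances the position by one and returns a pointer past
       the last written pair; the pair lengths are strictly increasing. */
    const UInt32 *end = vt->GetMatches(mf, matches);
    const unsigned numPairs = (unsigned)(end - matches) / 2;
    if (numPairs != 0)
    {
      const UInt32 bestLen = matches[(numPairs - 1) * 2];
      /* if this match is emitted, the covered positions are inserted
         into the dictionary structures without searching: */
      if (bestLen > 1)
        vt->Skip(mf, bestLen - 1);
    }
  }
}
#endif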