xref: /third_party/lzma/C/LzFind.c (revision 370b324c)
1/* LzFind.c -- Match finder for LZ algorithms
22023-03-14 : Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5
6#include <string.h>
7// #include <stdio.h>
8
9#include "CpuArch.h"
10#include "LzFind.h"
11#include "LzHash.h"
12
13#define kBlockMoveAlign       (1 << 7)    // alignment for memmove()
14#define kBlockSizeAlign       (1 << 16)   // alignment for block allocation
#define kBlockSizeReserveMin  (1 << 24)   // it's 1/256 from 4 GB dictionary
16
17#define kEmptyHashValue 0
18
19#define kMaxValForNormalize ((UInt32)0)
20// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
21
22// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
23
24#define GET_AVAIL_BYTES(p) \
25  Inline_MatchFinder_GetNumAvailableBytes(p)
26
27
28// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
29#define kFix5HashSize kFix4HashSize
30
31/*
32 HASH2_CALC:
33   if (hv) match, then cur[0] and cur[1] also match
34*/
35#define HASH2_CALC hv = GetUi16(cur);
36
37// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
38
39/*
40 HASH3_CALC:
41   if (cur[0]) and (h2) match, then cur[1]            also match
42   if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
43*/
44#define HASH3_CALC { \
45  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
46  h2 = temp & (kHash2Size - 1); \
47  hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
48
49#define HASH4_CALC { \
50  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
51  h2 = temp & (kHash2Size - 1); \
52  temp ^= ((UInt32)cur[2] << 8); \
53  h3 = temp & (kHash3Size - 1); \
54  hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
55
56#define HASH5_CALC { \
57  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
58  h2 = temp & (kHash2Size - 1); \
59  temp ^= ((UInt32)cur[2] << 8); \
60  h3 = temp & (kHash3Size - 1); \
61  temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
62  /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
63  hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
64
65#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
66
67
68static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
69{
70  // if (!p->directInput)
71  {
72    ISzAlloc_Free(alloc, p->bufBase);
73    p->bufBase = NULL;
74  }
75}
76
77
/* Allocate (or reuse) the input window buffer of (blockSize) bytes.
   Returns nonzero on success, 0 on failure (blockSize == 0 or allocation failed).
   An existing buffer is reused only when its size matches exactly;
   otherwise it is freed and reallocated. */
static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
{
  if (blockSize == 0)
    return 0;
  if (!p->bufBase || p->blockSize != blockSize)
  {
    // size_t blockSizeT;
    LzInWindow_Free(p, alloc);
    p->blockSize = blockSize;
    // blockSizeT = blockSize;

    // printf("\nblockSize = 0x%x\n", blockSize);
    /*
    #if defined _WIN64
    // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
    // we use UInt32 type for (p->blockSize), because
    // we don't want to wrap over 4 GiB,
    // when we use (p->streamPos - p->pos) that is UInt32.
    if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
    {
      blockSizeT = ((size_t)1 << 32);
      printf("\nchanged to blockSizeT = 4GiB\n");
    }
    #endif
    */

    p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
    // printf("\nbufferBase = %p\n", p->bufBase);
    // return 0; // for debug
  }
  return (p->bufBase != NULL);
}
110
// Returns the pointer to the current (unprocessed) position in the window buffer.
static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }

// Returns the number of bytes available after the current position (streamPos - pos).
static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }
114
115
/* Fill the free tail of the window buffer from the input stream until more than
   (keepSizeAfter) bytes are available after the current position, the stream
   ends, or a read error occurs.
   In directInput mode no copying happens: only the position counters advance. */
Z7_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  if (p->streamEndWasReached || p->result != SZ_OK)
    return;

  /* We use (p->streamPos - p->pos) value.
     (p->streamPos < p->pos) is allowed. */

  if (p->directInput)
  {
    // advance counters without copying; clamp so GET_AVAIL_BYTES() fits in UInt32
    UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
    if (curSize > p->directInputRem)
      curSize = (UInt32)p->directInputRem;
    p->streamPos += curSize;
    p->directInputRem -= curSize;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }

  for (;;)
  {
    const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
    size_t size = (size_t)(p->bufBase + p->blockSize - dest);
    if (size == 0)
    {
      /* we call ReadBlock() after NeedMove() and MoveBlock().
         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
         to the end of (blockSize).
         So we don't execute this branch in normal code flow.
         We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock().
      */
      // p->result = SZ_ERROR_FAIL; // we can show error here
      return;
    }

    // #define kRead 3
    // if (size > kRead) size = kRead; // for debug

    /*
    // we need cast (Byte *)dest.
    #ifdef __clang__
      #pragma GCC diagnostic ignored "-Wcast-qual"
    #endif
    */
    p->result = ISeqInStream_Read(p->stream,
        p->bufBase + (dest - p->bufBase), &size);
    if (p->result != SZ_OK)
      return;
    if (size == 0)
    {
      p->streamEndWasReached = 1;
      return;
    }
    p->streamPos += (UInt32)size;
    if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
      return;
    /* here and in other (p->keepSizeAfter) checks we keep one byte more than was requested by Create() function
         (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
  }

  // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
}
180
181
182
/* Slide the window: move the dictionary (keepSizeBefore bytes before p->buffer)
   plus all unread data to the front of the allocated block, freeing space at the
   end for further reads.
   The move source is rounded down to kBlockMoveAlign for memmove() efficiency;
   the remainder of that rounding stays in front of the dictionary. */
Z7_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
  const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
  p->buffer = p->bufBase + keepBefore;
  memmove(p->bufBase,
      p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
      keepBefore + (size_t)GET_AVAIL_BYTES(p));
}
193
/* We call MoveBlock() before ReadBlock().
   So MoveBlock() can be a wasteful operation, if the whole input data
   can fit in the current block even without calling MoveBlock().
   In the important case where (dataSize <= historySize),
     the condition (p->blockSize > dataSize + p->keepSizeAfter) is met,
     so there is no MoveBlock() in that case.
*/
201
202int MatchFinder_NeedMove(CMatchFinder *p)
203{
204  if (p->directInput)
205    return 0;
206  if (p->streamEndWasReached || p->result != SZ_OK)
207    return 0;
208  return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
209}
210
211void MatchFinder_ReadIfRequired(CMatchFinder *p)
212{
213  if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
214    MatchFinder_ReadBlock(p);
215}
216
217
218
219static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
220{
221  p->cutValue = 32;
222  p->btMode = 1;
223  p->numHashBytes = 4;
224  p->numHashBytes_Min = 2;
225  p->numHashOutBits = 0;
226  p->bigHash = 0;
227}
228
229#define kCrcPoly 0xEDB88320
230
231void MatchFinder_Construct(CMatchFinder *p)
232{
233  unsigned i;
234  p->buffer = NULL;
235  p->bufBase = NULL;
236  p->directInput = 0;
237  p->stream = NULL;
238  p->hash = NULL;
239  p->expectedDataSize = (UInt64)(Int64)-1;
240  MatchFinder_SetDefaultSettings(p);
241
242  for (i = 0; i < 256; i++)
243  {
244    UInt32 r = (UInt32)i;
245    unsigned j;
246    for (j = 0; j < 8; j++)
247      r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
248    p->crc[i] = r;
249  }
250}
251
252#undef kCrcPoly
253
254static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
255{
256  ISzAlloc_Free(alloc, p->hash);
257  p->hash = NULL;
258}
259
/* Free all memory owned by the match finder: the refs array and the window buffer. */
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  MatchFinder_FreeThisClassMemory(p, alloc);
  LzInWindow_Free(p, alloc);
}
265
266static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
267{
268  const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
269  if (sizeInBytes / sizeof(CLzRef) != num)
270    return NULL;
271  return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
272}
273
274#if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
275  #error Stop_Compiling_Bad_Reserve
276#endif
277
278
279
/* Compute the size of the input window block:
   (keepSizeBefore + keepSizeAfter) plus a speed reserve, rounded to
   kBlockSizeAlign. All arithmetic is UInt32 and guarded against wrap-around.
   Returns 0 when the requested sizes overflow or leave too little reserve. */
static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
{
  UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
  /*
  if (historySize > kMaxHistorySize)
    return 0;
  */
  // printf("\nhistorySize == 0x%x\n", historySize);

  if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore)  // if 32-bit overflow
    return 0;

  {
    const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
    const UInt32 rem = kBlockSizeMax - blockSize;
    // reserve is ~1/2 (or ~1/4 for big blocks) of blockSize, plus fixed headroom
    const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
        + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
    if (blockSize >= kBlockSizeMax
        || rem < kBlockSizeReserveMin) // we reject settings that will be slow
      return 0;
    if (reserve >= rem)
      blockSize = kBlockSizeMax;
    else
    {
      blockSize += reserve;
      blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
    }
  }
  // printf("\n LzFind_blockSize = %x\n", blockSize);
  // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
  return blockSize;
}
312
313
// input is historySize
/* Compute the main hash mask (hash table size minus 1) for the given history
   size, without the extra halving done by MatchFinder_GetHashMask(). */
static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1;
  if (hs != 0)
    hs--;
  // round up to (2^k - 1) by propagating the top set bit downwards
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1;
    /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size with recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
339
// input is historySize
/* Compute the main hash mask (hash table size minus 1) for the given history
   size. Unlike MatchFinder_GetHashMask2(), the intermediate size is halved
   (and halved again for large tables) to reduce memory use. */
static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1;
  if (hs != 0)
    hs--;
  // round up to (2^k - 1) by propagating the top set bit downwards
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  hs >>= 1;
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1;
    else
      hs >>= 1;
    /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size with recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
368
369
/* Create (or re-create) the match finder:
   - compute keepSizeBefore / keepSizeAfter from the caller's parameters,
   - allocate the input window (unless directInput),
   - compute the hash mask and hash table layout,
   - allocate the combined hash + son (tree/chain) reference array.
   Returns 1 on success, 0 on failure (everything is freed on failure). */
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
    UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
    ISzAllocPtr alloc)
{
  /* we need one additional byte in (p->keepSizeBefore),
     since we use MoveBlock() after (p->pos++) and before dictionary using */
  // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;

  keepAddBufferAfter += matchMaxLen;
  /* we need (p->keepSizeAfter >= p->numHashBytes) */
  if (keepAddBufferAfter < p->numHashBytes)
    keepAddBufferAfter = p->numHashBytes;
  // keepAddBufferAfter -= 2; // for debug
  p->keepSizeAfter = keepAddBufferAfter;

  if (p->directInput)
    p->blockSize = 0;
  if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
  {
    size_t hashSizeSum;
    {
      UInt32 hs;      // mask for the allocated (maximal) hash table
      UInt32 hsCur;   // mask actually used now (may be smaller for small expectedDataSize)

      if (p->numHashOutBits != 0)
      {
        // explicit number of hash bits was requested: clamp it to what
        // numHashBytes can produce, then limit by the history-derived mask
        unsigned numBits = p->numHashOutBits;
        const unsigned nbMax =
            (p->numHashBytes == 2 ? 16 :
            (p->numHashBytes == 3 ? 24 : 32));
        if (numBits > nbMax)
          numBits = nbMax;
        if (numBits >= 32)
          hs = (UInt32)0 - 1;
        else
          hs = ((UInt32)1 << numBits) - 1;
        // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
        hs |= (1 << 16) - 1; /* don't change it! */
        if (p->numHashBytes >= 5)
          hs |= (256 << kLzHash_CrcShift_2) - 1;
        {
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
          if (hs > hs2)
            hs = hs2;
        }
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs2)
            hsCur = hs2;
        }
      }
      else
      {
        hs = MatchFinder_GetHashMask(p, historySize);
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs) // is it possible?
            hsCur = hs;
        }
      }

      p->hashMask = hsCur;

      hashSizeSum = hs;
      hashSizeSum++;
      if (hashSizeSum < hs)  // size_t overflow check
        return 0;
      {
        // small fixed tables for 2- and 3-byte hashes precede the main table
        UInt32 fixedHashSize = 0;
        if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
        if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
        // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
        hashSizeSum += fixedHashSize;
        p->fixedHashSize = fixedHashSize;
      }
    }

    p->matchMaxLen = matchMaxLen;

    {
      size_t newSize;
      size_t numSons;
      const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
      p->historySize = historySize;
      p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)

      numSons = newCyclicBufferSize;
      if (p->btMode)
        numSons <<= 1;  // binary tree keeps two links per position
      newSize = hashSizeSum + numSons;

      if (numSons < newCyclicBufferSize || newSize < numSons)  // overflow checks
        return 0;

      // aligned size is not required here, but it can be better for some loops
      #define NUM_REFS_ALIGN_MASK 0xF
      newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;

      // 22.02: we don't reallocate buffer, if old size is enough
      if (p->hash && p->numRefs >= newSize)
        return 1;

      MatchFinder_FreeThisClassMemory(p, alloc);
      p->numRefs = newSize;
      p->hash = AllocRefs(newSize, alloc);

      if (p->hash)
      {
        p->son = p->hash + hashSizeSum;  // son array follows the hash tables
        return 1;
      }
    }
  }

  MatchFinder_Free(p, alloc);
  return 0;
}
492
493
/* Recompute (p->lenLimit) and (p->posLimit): the number of positions that can
   be processed before the next MatchFinder_CheckLimits() call.
   The limit is the minimum of:
   - distance to the normalization point (kMaxValForNormalize),
   - distance to the cyclic-buffer wrap,
   - available input bytes (adjusted for keepSizeAfter / matchMaxLen). */
static void MatchFinder_SetLimits(CMatchFinder *p)
{
  UInt32 k;
  UInt32 n = kMaxValForNormalize - p->pos;
  if (n == 0)
    n = (UInt32)(Int32)-1;  // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)

  k = p->cyclicBufferSize - p->cyclicBufferPos;
  if (k < n)
    n = k;

  k = GET_AVAIL_BYTES(p);
  {
    const UInt32 ksa = p->keepSizeAfter;
    UInt32 mm = p->matchMaxLen;
    if (k > ksa)
      k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
    else if (k >= mm)
    {
      // the limitation for (p->lenLimit) update
      k -= mm;   // optimization : to reduce the number of checks
      k++;
      // k = 1; // non-optimized version : for debug
    }
    else
    {
      // near stream end: matches are limited by the remaining bytes
      mm = k;
      if (k != 0)
        k = 1;
    }
    p->lenLimit = mm;
  }
  if (k < n)
    n = k;

  p->posLimit = p->pos + n;
}
531
532
533void MatchFinder_Init_LowHash(CMatchFinder *p)
534{
535  size_t i;
536  CLzRef *items = p->hash;
537  const size_t numItems = p->fixedHashSize;
538  for (i = 0; i < numItems; i++)
539    items[i] = kEmptyHashValue;
540}
541
542
543void MatchFinder_Init_HighHash(CMatchFinder *p)
544{
545  size_t i;
546  CLzRef *items = p->hash + p->fixedHashSize;
547  const size_t numItems = (size_t)p->hashMask + 1;
548  for (i = 0; i < numItems; i++)
549    items[i] = kEmptyHashValue;
550}
551
552
/* Reset positions and stream state for a (re)start.
   (pos = streamPos = 1), not 0, because 0 is kEmptyHashValue. */
void MatchFinder_Init_4(CMatchFinder *p)
{
  if (!p->directInput)
    p->buffer = p->bufBase;
  {
    /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
       the code in CMatchFinderMt expects (pos = 1) */
    p->pos =
    p->streamPos =
        1; // it's smallest optimal value. do not change it
        // 0; // for debug
  }
  p->result = SZ_OK;
  p->streamEndWasReached = 0;
}
568
569
570// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
571#define CYC_TO_POS_OFFSET 0
572// #define CYC_TO_POS_OFFSET 1 // for debug
573
/* Full initialization: clear both hash table regions, reset positions,
   perform the first read, and compute the initial processing limits. */
void MatchFinder_Init(CMatchFinder *p)
{
  MatchFinder_Init_HighHash(p);
  MatchFinder_Init_LowHash(p);
  MatchFinder_Init_4(p);
  // if (readData)
  MatchFinder_ReadBlock(p);

  /* if we init (cyclicBufferPos = pos), then we can use one variable
     instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
  p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
  // p->cyclicBufferPos = 0; // smallest value
  // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
  MatchFinder_SetLimits(p);
}
589
590
591
592#ifdef MY_CPU_X86_OR_AMD64
593  #if defined(__clang__) && (__clang_major__ >= 4) \
594    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
595    // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
596
597      #define USE_LZFIND_SATUR_SUB_128
598      #define USE_LZFIND_SATUR_SUB_256
599      #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
600      #define LZFIND_ATTRIB_AVX2  __attribute__((__target__("avx2")))
601  #elif defined(_MSC_VER)
602    #if (_MSC_VER >= 1600)
603      #define USE_LZFIND_SATUR_SUB_128
604    #endif
605    #if (_MSC_VER >= 1900)
606      #define USE_LZFIND_SATUR_SUB_256
607    #endif
608  #endif
609
610// #elif defined(MY_CPU_ARM_OR_ARM64)
611#elif defined(MY_CPU_ARM64)
612
613  #if defined(__clang__) && (__clang_major__ >= 8) \
614    || defined(__GNUC__) && (__GNUC__ >= 8)
615      #define USE_LZFIND_SATUR_SUB_128
616    #ifdef MY_CPU_ARM64
617      // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
618    #else
619      // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
620    #endif
621
622  #elif defined(_MSC_VER)
623    #if (_MSC_VER >= 1910)
624      #define USE_LZFIND_SATUR_SUB_128
625    #endif
626  #endif
627
628  #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
629    #include <arm64_neon.h>
630  #else
631    #include <arm_neon.h>
632  #endif
633
634#endif
635
636
637#ifdef USE_LZFIND_SATUR_SUB_128
638
639// #define Z7_SHOW_HW_STATUS
640
641#ifdef Z7_SHOW_HW_STATUS
642#include <stdio.h>
643#define PRF(x) x
644PRF(;)
645#else
646#define PRF(x)
647#endif
648
649
650#ifdef MY_CPU_ARM_OR_ARM64
651
652#ifdef MY_CPU_ARM64
653// #define FORCE_LZFIND_SATUR_SUB_128
654#endif
655typedef uint32x4_t LzFind_v128;
656#define SASUB_128_V(v, s) \
657  vsubq_u32(vmaxq_u32(v, s), s)
658
659#else // MY_CPU_ARM_OR_ARM64
660
661#include <smmintrin.h> // sse4.1
662
663typedef __m128i LzFind_v128;
664// SSE 4.1
665#define SASUB_128_V(v, s)   \
666  _mm_sub_epi32(_mm_max_epu32(v, s), s)
667
668#endif // MY_CPU_ARM_OR_ARM64
669
670
671#define SASUB_128(i) \
672  *(      LzFind_v128 *)(      void *)(items + (i) * 4) = SASUB_128_V( \
673  *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
674
675
/* Saturating subtraction over [items, lim) using 128-bit vectors:
   items[i] = max(items[i], subValue) - subValue (values below subValue become 0,
   which is kEmptyHashValue).
   Caller (MatchFinder_Normalize3) supplies an aligned range whose length is a
   multiple of the unrolled step (16 UInt32 items per loop iteration). */
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_SSE41
LZFIND_ATTRIB_SSE41
#endif
void
Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  const LzFind_v128 sub2 =
    #ifdef MY_CPU_ARM_OR_ARM64
      vdupq_n_u32(subValue);
    #else
      _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
    #endif
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
    SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
  }
  while (items != lim);
}
699
700
701
702#ifdef USE_LZFIND_SATUR_SUB_256
703
704#include <immintrin.h> // avx
705/*
706clang :immintrin.h uses
707#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
708    defined(__AVX2__)
709#include <avx2intrin.h>
710#endif
711so we need <avxintrin.h> for clang-cl */
712
713#if defined(__clang__)
714#include <avxintrin.h>
715#include <avx2intrin.h>
716#endif
717
718// AVX2:
719#define SASUB_256(i) \
720    *(      __m256i *)(      void *)(items + (i) * 8) = \
721   _mm256_sub_epi32(_mm256_max_epu32( \
722    *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
723
/* AVX2 variant of the saturating subtraction:
   items[i] = max(items[i], subValue) - subValue over [items, lim),
   32 UInt32 items per loop iteration. Same range requirements as the
   128-bit variant. */
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_AVX2
LZFIND_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  const __m256i sub2 = _mm256_set_epi32(
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
    SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
  }
  while (items != lim);
}
744#endif // USE_LZFIND_SATUR_SUB_256
745
746#ifndef FORCE_LZFIND_SATUR_SUB_128
747typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
748    UInt32 subValue, CLzRef *items, const CLzRef *lim);
749static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
750#endif // FORCE_LZFIND_SATUR_SUB_128
751
752#endif // USE_LZFIND_SATUR_SUB_128
753
754
755// kEmptyHashValue must be zero
756// #define SASUB_32(i)  { UInt32 v = items[i];  UInt32 m = v - subValue;  if (v < subValue) m = kEmptyHashValue;  items[i] = m; }
757#define SASUB_32(i)  { UInt32 v = items[i];  if (v < subValue) v = subValue; items[i] = v - subValue; }
758
759#ifdef FORCE_LZFIND_SATUR_SUB_128
760
761#define DEFAULT_SaturSub LzFind_SaturSub_128
762
763#else
764
765#define DEFAULT_SaturSub LzFind_SaturSub_32
766
/* Scalar fallback of the saturating subtraction, unrolled 8x:
   items[i] = max(items[i], subValue) - subValue over [items, lim).
   (lim - items) must be a multiple of the unrolled step (8 items). */
Z7_NO_INLINE
static
void
Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
  }
  while (items != lim);
}
783
784#endif
785
786
/* Normalize (numItems) references: each item becomes
   max(item, subValue) - subValue, so stale references collapse to
   kEmptyHashValue (0).
   The range is processed in three parts: a scalar head up to a 128-byte
   alignment boundary, an aligned bulk handled by the fastest available
   SaturSub routine (runtime-dispatched via g_LzFind_SaturSub), and a
   scalar tail. */
Z7_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
{
  #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
  {
    const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
    CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
    numItems &= k_Align_Mask;
    if (items != lim)
    {
      #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
        if (g_LzFind_SaturSub)
          g_LzFind_SaturSub(subValue, items, lim);
        else
      #endif
          DEFAULT_SaturSub(subValue, items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
}
819
820
821
822// call MatchFinder_CheckLimits() only after (p->pos++) update
823
/* Called when (p->pos) reaches (p->posLimit):
   - refill the input buffer (moving the window first, if needed),
   - normalize references if (pos) reached kMaxValForNormalize,
   - wrap the cyclic buffer position,
   - and recompute the limits. */
Z7_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  if (// !p->streamEndWasReached && p->result == SZ_OK &&
      p->keepSizeAfter == GET_AVAIL_BYTES(p))
  {
    // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
    if (MatchFinder_NeedMove(p))
      MatchFinder_MoveBlock(p);
    MatchFinder_ReadBlock(p);
  }

  if (p->pos == kMaxValForNormalize)
  if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
    /*
       if we disable normalization for last bytes of data, and
       if (data_size == 4 GiB), we don't call wasteful normalization,
       but (pos) will be wrapped over Zero (0) in that case.
       And we cannot resume later to normal operation
    */
  {
    // MatchFinder_Normalize(p);
    /* after normalization we need (p->pos >= p->historySize + 1); */
    /* we can reduce subValue to aligned value, if want to keep alignment
       of (p->pos) and (p->buffer) for speculated accesses. */
    const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
    // const UInt32 subValue = (1 << 15); // for debug
    // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
    MatchFinder_REDUCE_OFFSETS(p, subValue)
    MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
    {
      size_t numSonRefs = p->cyclicBufferSize;
      if (p->btMode)
        numSonRefs <<= 1;
      MatchFinder_Normalize3(subValue, p->son, numSonRefs);
    }
  }

  if (p->cyclicBufferPos == p->cyclicBufferSize)
    p->cyclicBufferPos = 0;

  MatchFinder_SetLimits(p);
}
867
868
/*
  Hash-chain match search. Walks the chain starting at (curMatch), writing
  (len, dist - 1) pairs of strictly increasing length to (d).
  Precondition: (lenLimit > maxLen).
  Returns the updated output pointer (d).
*/
Z7_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, unsigned maxLen)
{
  /* Reference (non-optimized) version of the loop below:
  son[_cyclicBufferPos] = curMatch;
  for (;;)
  {
    UInt32 delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
      return d;
    {
      const Byte *pb = cur - delta;
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        UInt32 len = 0;
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        if (maxLen < len)
        {
          maxLen = len;
          *d++ = len;
          *d++ = delta - 1;
          if (len == lenLimit)
            return d;
        }
      }
    }
  }
  */

  const Byte *lim = cur + lenLimit;
  son[_cyclicBufferPos] = curMatch;  // link current position into the chain

  do
  {
    UInt32 delta;

    if (curMatch == 0)  // kEmptyHashValue: end of chain
      break;
    // if (curMatch2 >= curMatch) return NULL;
    delta = pos - curMatch;
    if (delta >= _cyclicBufferSize)  // candidate fell out of the dictionary
      break;
    {
      ptrdiff_t diff;
      // follow the chain to the next (older) candidate
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
      // quick reject: the byte at (maxLen) must match before a full compare
      if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
      {
        const Byte *c = cur;
        while (*c == c[diff])
        {
          if (++c == lim)
          {
            // full lenLimit match: record it and stop searching
            d[0] = (UInt32)(lim - cur);
            d[1] = delta - 1;
            return d + 2;
          }
        }
        {
          const unsigned len = (unsigned)(c - cur);
          if (maxLen < len)
          {
            maxLen = len;
            d[0] = (UInt32)len;
            d[1] = delta - 1;
            d += 2;
          }
        }
      }
    }
  }
  while (--cutValue);

  return d;
}
952
953
/* Binary-tree match search. Finds matches for the bytes at (cur) while
   rebalancing the tree rooted at the current cyclic position:
   (ptr0) collects the "greater" subtree links, (ptr1) the "smaller" ones.
   Writes (len, dist - 1) pairs of strictly increasing length to (d) and
   returns the updated output pointer. (cutValue) bounds the walk depth. */
Z7_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, UInt32 maxLen)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;  // verified common-prefix lengths on each branch

  UInt32 cmCheck;

  // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }

  // smallest position still inside the dictionary window
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);  // bytes already known equal
      const UInt32 pair0 = pair[0];
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          maxLen = (UInt32)len;
          *d++ = (UInt32)len;
          *d++ = delta - 1;
          if (len == lenLimit)
          {
            // best possible match: adopt the candidate's subtrees and stop
            *ptr1 = pair0;
            *ptr0 = pair[1];
            return d;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        // candidate string is smaller: it goes into the "smaller" subtree
        *ptr1 = curMatch;
        // const UInt32 curMatch2 = pair[1];
        // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue;  return NULL; }
        // curMatch = curMatch2;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        // candidate string is greater (or equal): "greater" subtree
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue;
  return d;
}
1023
1024
/* Same tree walk as GetMatchesSpec1(), but only rebalances the binary tree
   for the current position without reporting any matches.
   Used when positions are skipped. */
static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;  // verified common-prefix lengths on each branch

  UInt32 cmCheck;

  // smallest position still inside the dictionary window
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (// curMatch >= pos ||  // failure
      cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            // full-length match: adopt the candidate's subtrees and stop
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue;
  return;
}
1082
1083
/* MOVE_POS: advance the match finder by one byte: bump the cyclic-buffer
   slot, the data pointer, and the absolute position; when the position
   reaches posLimit, run MatchFinder_CheckLimits (normalization / refill). */
#define MOVE_POS \
  ++p->cyclicBufferPos; \
  p->buffer++; \
  { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }

/* MOVE_POS_RET: same, then return the (possibly advanced) distances pointer. */
#define MOVE_POS_RET MOVE_POS return distances;
1090
/* Advance one position without updating the son[] tree/chain.
   Called only when there are fewer available bytes than the hash needs
   (end of stream) -- see the original note below about the resulting
   junk in sons[]. */
Z7_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
{
  /* we go here at the end of stream data, when (avail < num_hash_bytes)
     We don't update sons[cyclicBufferPos << btMode].
     So (sons) record will contain junk. And we cannot resume match searching
     to normal operation, even if we will provide more input data in buffer.
     p->sons[p->cyclicBufferPos << p->btMode] = 0;  // kEmptyHashValue
     if (p->btMode)
        p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0;  // kEmptyHashValue
  */
  MOVE_POS
}
1104
/* Common prologue for all GetMatches/Skip functions: declares lenLimit,
   hv, cur, curMatch; if fewer than minLen bytes remain, advances one
   position (no tree update) and executes ret_op (return / continue). */
#define GET_MATCHES_HEADER2(minLen, ret_op) \
  unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \
  lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
  cur = p->buffer;

#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
/* SKIP_HEADER opens a do { ... } loop that SKIP_FOOTER closes. */
#define SKIP_HEADER(minLen)   do { GET_MATCHES_HEADER2(minLen, continue)

/* Argument pack matching the parameter lists of GetMatchesSpec1 /
   Hc_GetMatchesSpec / SkipMatchesSpec. */
#define MF_PARAMS(p)  lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

/* Closes the loop opened by SKIP_HEADER: update the tree, advance, repeat. */
#define SKIP_FOOTER  SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num);

/* Common epilogue: run the tree (BT) or chain (HC) search starting from
   _maxLen_, then advance one position and return the distances cursor. */
#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
  distances = func(MF_PARAMS(p), \
  distances, (UInt32)_maxLen_); MOVE_POS_RET

#define GET_MATCHES_FOOTER_BT(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)

#define GET_MATCHES_FOOTER_HC(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)
1126
1127
1128
/* Extend the match at back-distance d2: compare bytes from cur+maxLen
   up to cur+lenLimit and set maxLen to the total matched length. */
#define UPDATE_maxLen { \
    const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
    const Byte *c = cur + maxLen; \
    const Byte *lim = cur + lenLimit; \
    for (; c != lim; c++) if (*(c + diff) != *c) break; \
    maxLen = (unsigned)(c - cur); }
1135
/* BT match finder, 2-byte hash: hv is the two raw bytes (HASH2_CALC),
   so a hash hit already guarantees a length-2 match; the tree search
   starts from maxLen = 1. */
static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(2)
  HASH2_CALC
  // Old head of this hash slot is the search start; slot now points here.
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(1)
}
1144
/* BT match finder, zip-compatible 3-byte hash (HASH_ZIP_CALC from LzHash.h);
   minimum match length 3, tree search starts from maxLen = 2. */
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(2)
}
1153
1154
/* mmm = maximum valid back-distance: min(cyclicBufferSize, pos).
   A short-hash distance d must satisfy (d < mmm) to be usable. */
#define SET_mmm  \
  mmm = p->cyclicBufferSize; \
  if (pos < mmm) \
    mmm = pos;
1159
1160
/* BT match finder, 3-byte main hash plus a 2-byte sub-hash: the 2-byte
   table can yield a short match cheaply before the tree search runs. */
static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, d2, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(3)

  HASH3_CALC

  hash = p->hash;
  pos = p->pos;

  // d2: back-distance of the last position with the same 2-byte hash.
  d2 = pos - hash[h2];

  curMatch = (hash + kFix3HashSize)[hv];

  hash[h2] = pos;
  (hash + kFix3HashSize)[hv] = pos;

  SET_mmm

  maxLen = 2;

  // 2-byte hash hit: verify the first byte (hv match implies byte 1 -- see
  // HASH2_CALC note in header), extend, and emit a (len, dist-1) pair.
  if (d2 < mmm && *(cur - d2) == *cur)
  {
    UPDATE_maxLen
    distances[0] = (UInt32)maxLen;
    distances[1] = d2 - 1;
    distances += 2;
    if (maxLen == lenLimit)
    {
      // Already at the length cap: just maintain the tree and advance.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1200
1201
/* BT match finder, 4-byte main hash with 2- and 3-byte sub-hashes.
   The sub-hash tables provide cheap short matches (len 2 / 3) before
   the binary-tree search; results are appended to distances[] as
   (length, distance-1) pairs. */
static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the last positions with matching 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  // All three tables now point at the current position.
  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  // One-shot loop ("break" exits): try the 2-byte candidate, then the
  // 3-byte candidate, then extend whichever was taken via UPDATE_maxLen.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // d2 candidate also matches byte 2: keep it for extension.
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // Length cap reached: maintain the tree only, skip the full search.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1268
1269
/* BT match finder, 5-byte main hash with 2- and 3-byte sub-hashes
   (the 4-byte sub-table is disabled -- see the commented h4 lines).
   Same structure as Bt4, but short matches are only trusted up to
   length 3 and byte 3 is verified before extending. */
static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the last positions with matching 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  // One-shot loop ("break" exits): pick the best short candidate, then
  // extend it only if byte 3 also matches.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // Length cap reached: maintain the tree only, skip the full search.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1340
1341
/* Hash-chain (HC) match finder, 4-byte main hash with 2-/3-byte
   sub-hashes. Same short-match logic as Bt4, but on the lenLimit
   early-out it only stores curMatch into the single-link chain
   (son[cyclicBufferPos]) instead of updating a tree. */
static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the last positions with matching 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  // One-shot loop ("break" exits): try 2-byte then 3-byte candidates.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // Length cap reached: record the chain link and advance.
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1408
1409
1410static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1411{
1412  UInt32 mmm;
1413  UInt32 h2, h3, d2, d3, maxLen, pos;
1414  UInt32 *hash;
1415  GET_MATCHES_HEADER(5)
1416
1417  HASH5_CALC
1418
1419  hash = p->hash;
1420  pos = p->pos;
1421
1422  d2 = pos - hash                  [h2];
1423  d3 = pos - (hash + kFix3HashSize)[h3];
1424  // d4 = pos - (hash + kFix4HashSize)[h4];
1425
1426  curMatch = (hash + kFix5HashSize)[hv];
1427
1428  hash                  [h2] = pos;
1429  (hash + kFix3HashSize)[h3] = pos;
1430  // (hash + kFix4HashSize)[h4] = pos;
1431  (hash + kFix5HashSize)[hv] = pos;
1432
1433  SET_mmm
1434
1435  maxLen = 4;
1436
1437  for (;;)
1438  {
1439    if (d2 < mmm && *(cur - d2) == *cur)
1440    {
1441      distances[0] = 2;
1442      distances[1] = d2 - 1;
1443      distances += 2;
1444      if (*(cur - d2 + 2) == cur[2])
1445      {
1446      }
1447      else if (d3 < mmm && *(cur - d3) == *cur)
1448      {
1449        distances[1] = d3 - 1;
1450        distances += 2;
1451        d2 = d3;
1452      }
1453      else
1454        break;
1455    }
1456    else if (d3 < mmm && *(cur - d3) == *cur)
1457    {
1458      distances[1] = d3 - 1;
1459      distances += 2;
1460      d2 = d3;
1461    }
1462    else
1463      break;
1464
1465    distances[-2] = 3;
1466    if (*(cur - d2 + 3) != cur[3])
1467      break;
1468    UPDATE_maxLen
1469    distances[-2] = maxLen;
1470    if (maxLen == lenLimit)
1471    {
1472      p->son[p->cyclicBufferPos] = curMatch;
1473      MOVE_POS_RET
1474    }
1475    break;
1476  }
1477
1478  GET_MATCHES_FOOTER_HC(maxLen)
1479}
1480
1481
/* HC match finder, zip-compatible 3-byte hash; chain search starts
   from maxLen = 2. */
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_HC(2)
}
1490
1491
/* Skip num positions in BT2 mode: update the hash slot and the tree
   for each position without collecting matches. */
static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(2)
  {
    HASH2_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1502
/* Skip num positions in zip-compatible BT3 mode (tree maintained,
   no matches collected). */
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    HASH_ZIP_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1513
/* Skip num positions in BT3 mode: refresh the 2-byte and 3-byte hash
   tables and maintain the tree for each position. */
static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    UInt32 h2;
    UInt32 *hash;
    HASH3_CALC
    hash = p->hash;
    curMatch = (hash + kFix3HashSize)[hv];
    hash[h2] =
    (hash + kFix3HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1528
/* Skip num positions in BT4 mode: refresh the 2-/3-/4-byte hash tables
   and maintain the tree for each position. */
static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(4)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH4_CALC
    hash = p->hash;
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1544
/* Skip num positions in BT5 mode: refresh the 2-/3-/5-byte hash tables
   (4-byte table disabled) and maintain the tree for each position. */
static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(5)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH5_CALC
    hash = p->hash;
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1561
1562
/* HC_SKIP_HEADER / HC_SKIP_FOOTER: batched skip loop for hash-chain mode.
   The outer do-while consumes `num` positions; each pass handles up to
   num2 = min(num, posLimit - pos) positions with hot state (cur, pos,
   son, hash) held in locals, then writes the state back and runs
   MatchFinder_CheckLimits when posLimit is reached. Positions with fewer
   than minLen available bytes fall back to MatchFinder_MovePos. */
#define HC_SKIP_HEADER(minLen) \
    do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
    const Byte *cur; \
    UInt32 *hash; \
    UInt32 *son; \
    UInt32 pos = p->pos; \
    UInt32 num2 = num; \
    /* (p->pos == p->posLimit) is not allowed here !!! */ \
    { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
    num -= num2; \
    { const UInt32 cycPos = p->cyclicBufferPos; \
      son = p->son + cycPos; \
      p->cyclicBufferPos = cycPos + num2; } \
    cur = p->buffer; \
    hash = p->hash; \
    do { \
    UInt32 curMatch; \
    UInt32 hv;


/* Per-position epilogue: store the chain link, advance, and close both
   loops opened by HC_SKIP_HEADER. */
#define HC_SKIP_FOOTER \
    cur++;  pos++;  *son++ = curMatch; \
    } while (--num2); \
    p->buffer = cur; \
    p->pos = pos; \
    if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
    }} while(num); \
1590
1591
/* Skip num positions in HC4 mode: refresh the 2-/3-/4-byte hash tables
   and append the chain link for each position (batched loop). */
static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(4)

    UInt32 h2, h3;
    HASH4_CALC
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1605
1606
/* Skip num positions in HC5 mode: refresh the 2-/3-/5-byte hash tables
   (4-byte table disabled) and append the chain link for each position. */
static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(5)

    UInt32 h2, h3;
    HASH5_CALC
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1621
1622
/* Skip num positions in zip-compatible HC3 mode. */
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(3)

    HASH_ZIP_CALC
    curMatch = hash[hv];
    hash[hv] = pos;

  HC_SKIP_FOOTER
}
1633
1634
1635void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
1636{
1637  vTable->Init = (Mf_Init_Func)MatchFinder_Init;
1638  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
1639  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;
1640  if (!p->btMode)
1641  {
1642    if (p->numHashBytes <= 4)
1643    {
1644      vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
1645      vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
1646    }
1647    else
1648    {
1649      vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches;
1650      vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip;
1651    }
1652  }
1653  else if (p->numHashBytes == 2)
1654  {
1655    vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
1656    vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
1657  }
1658  else if (p->numHashBytes == 3)
1659  {
1660    vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
1661    vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
1662  }
1663  else if (p->numHashBytes == 4)
1664  {
1665    vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
1666    vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
1667  }
1668  else
1669  {
1670    vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches;
1671    vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip;
1672  }
1673}
1674
1675
1676
/* One-time runtime initialization: pick the fastest available SIMD
   implementation of the saturating-subtract routine (used by position
   normalization) and store it in g_LzFind_SaturSub.
   Selection: NEON on ARM/ARM64; SSE4.1, upgraded to AVX2 when present,
   on other targets. A no-op when the build forces or disables the
   SIMD paths via FORCE_/USE_LZFIND_SATUR_SUB_128. */
void LzFindPrepare(void)
{
  #ifndef FORCE_LZFIND_SATUR_SUB_128
  #ifdef USE_LZFIND_SATUR_SUB_128
  LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
  #ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== LzFind NEON")
      PRF(printf("\n=== LzFind NEON\n"));
      f = LzFind_SaturSub_128;
    }
    // f = 0; // for debug
  }
  #else // MY_CPU_ARM_OR_ARM64
  if (CPU_IsSupported_SSE41())
  {
    // #pragma message ("=== LzFind SSE41")
    PRF(printf("\n=== LzFind SSE41\n"));
    f = LzFind_SaturSub_128;

    #ifdef USE_LZFIND_SATUR_SUB_256
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== LzFind AVX2")
      PRF(printf("\n=== LzFind AVX2\n"));
      f = LzFind_SaturSub_256;
    }
    #endif
  }
  #endif // MY_CPU_ARM_OR_ARM64
  g_LzFind_SaturSub = f;
  #endif // USE_LZFIND_SATUR_SUB_128
  #endif // FORCE_LZFIND_SATUR_SUB_128
}
1713
1714
// Undefine file-local helper macros so they cannot leak into other
// translation units in amalgamated / single-translation-unit builds.
#undef MOVE_POS
#undef MOVE_POS_RET
#undef PRF
1718