11cb0ef41Sopenharmony_ci/* slide_hash_simd.h
21cb0ef41Sopenharmony_ci *
31cb0ef41Sopenharmony_ci * Copyright 2022 The Chromium Authors
41cb0ef41Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
51cb0ef41Sopenharmony_ci * found in the Chromium source repository LICENSE file.
61cb0ef41Sopenharmony_ci */
71cb0ef41Sopenharmony_ci
81cb0ef41Sopenharmony_ci#ifndef SLIDE_HASH_SIMD_H
91cb0ef41Sopenharmony_ci#define SLIDE_HASH_SIMD_H
101cb0ef41Sopenharmony_ci
111cb0ef41Sopenharmony_ci#include "deflate.h"
121cb0ef41Sopenharmony_ci
/*
 * INLINE: the inline-function keyword used by this header. A definition
 * supplied before this point is kept as is. Otherwise, builds where _MSC_VER
 * is defined and __clang__ is not use `__inline`; every other build uses the
 * standard `inline`.
 */
#if !defined(INLINE)
#  if defined(_MSC_VER) && !defined(__clang__)
#    define INLINE __inline
#  else
#    define INLINE inline
#  endif
#endif
201cb0ef41Sopenharmony_ci
211cb0ef41Sopenharmony_ci#if defined(CPU_NO_SIMD)
221cb0ef41Sopenharmony_ci
231cb0ef41Sopenharmony_ci#error SIMD has been disabled for your build target
241cb0ef41Sopenharmony_ci
251cb0ef41Sopenharmony_ci#elif defined(DEFLATE_SLIDE_HASH_SSE2)
261cb0ef41Sopenharmony_ci
271cb0ef41Sopenharmony_ci#include <emmintrin.h>  /* SSE2 */
281cb0ef41Sopenharmony_ci
291cb0ef41Sopenharmony_ci#define Z_SLIDE_INIT_SIMD(wsize) _mm_set1_epi16((ush)(wsize))
301cb0ef41Sopenharmony_ci
311cb0ef41Sopenharmony_ci#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
321cb0ef41Sopenharmony_ci    for (const Posf* const end = table + size; table != end;) { \
331cb0ef41Sopenharmony_ci        __m128i vO = _mm_loadu_si128((__m128i *)(table + 0)); \
341cb0ef41Sopenharmony_ci        vO = _mm_subs_epu16(vO, vector_wsize); \
351cb0ef41Sopenharmony_ci        _mm_storeu_si128((__m128i *)(table + 0), vO); \
361cb0ef41Sopenharmony_ci        table += 8; \
371cb0ef41Sopenharmony_ci    }
381cb0ef41Sopenharmony_ci
391cb0ef41Sopenharmony_citypedef __m128i z_vec128i_u16x8_t;
401cb0ef41Sopenharmony_ci
411cb0ef41Sopenharmony_ci#elif defined(DEFLATE_SLIDE_HASH_NEON)
421cb0ef41Sopenharmony_ci
431cb0ef41Sopenharmony_ci#include <arm_neon.h>  /* NEON */
441cb0ef41Sopenharmony_ci
451cb0ef41Sopenharmony_ci#define Z_SLIDE_INIT_SIMD(wsize) vdupq_n_u16((ush)(wsize))
461cb0ef41Sopenharmony_ci
471cb0ef41Sopenharmony_ci#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
481cb0ef41Sopenharmony_ci    for (const Posf* const end = table + size; table != end;) { \
491cb0ef41Sopenharmony_ci        uint16x8_t vO = vld1q_u16(table + 0); \
501cb0ef41Sopenharmony_ci        uint16x8_t v8 = vld1q_u16(table + 8); \
511cb0ef41Sopenharmony_ci        vO = vqsubq_u16(vO, vector_wsize); \
521cb0ef41Sopenharmony_ci        v8 = vqsubq_u16(v8, vector_wsize); \
531cb0ef41Sopenharmony_ci        vst1q_u16(table + 0, vO); \
541cb0ef41Sopenharmony_ci        vst1q_u16(table + 8, v8); \
551cb0ef41Sopenharmony_ci        table += 8 + 8; \
561cb0ef41Sopenharmony_ci    }
571cb0ef41Sopenharmony_ci
581cb0ef41Sopenharmony_citypedef uint16x8_t z_vec128i_u16x8_t;
591cb0ef41Sopenharmony_ci
601cb0ef41Sopenharmony_ci#else
611cb0ef41Sopenharmony_ci
621cb0ef41Sopenharmony_ci#error slide_hash_simd is not defined for your build target
631cb0ef41Sopenharmony_ci
641cb0ef41Sopenharmony_ci#endif
651cb0ef41Sopenharmony_ci
661cb0ef41Sopenharmony_ci/* ===========================================================================
671cb0ef41Sopenharmony_ci * Slide the hash table when sliding the window down (could be avoided with 32
681cb0ef41Sopenharmony_ci * bit values at the expense of memory usage). We slide even when level == 0 to
691cb0ef41Sopenharmony_ci * keep the hash table consistent if we switch back to level > 0 later.
701cb0ef41Sopenharmony_ci */
711cb0ef41Sopenharmony_cilocal INLINE void slide_hash_simd(
721cb0ef41Sopenharmony_ci    Posf *head, Posf *prev, const uInt w_size, const uInt hash_size) {
731cb0ef41Sopenharmony_ci    /*
741cb0ef41Sopenharmony_ci     * The SIMD implementation of the hash table slider assumes:
751cb0ef41Sopenharmony_ci     *
761cb0ef41Sopenharmony_ci     * 1. hash chain offset is 2 bytes. Should be true as Pos is "ush" type.
771cb0ef41Sopenharmony_ci     */
781cb0ef41Sopenharmony_ci    Assert(sizeof(Pos) == 2, "Pos type size error: should be 2 bytes");
791cb0ef41Sopenharmony_ci    Assert(sizeof(ush) == 2, "ush type size error: should be 2 bytes");
801cb0ef41Sopenharmony_ci
811cb0ef41Sopenharmony_ci    Assert(hash_size <= (1 << 16), "Hash table maximum size error");
821cb0ef41Sopenharmony_ci    Assert(hash_size >= (1 << 8), "Hash table minimum size error");
831cb0ef41Sopenharmony_ci    Assert(w_size == (ush)w_size, "Prev table size error");
841cb0ef41Sopenharmony_ci
851cb0ef41Sopenharmony_ci    /*
861cb0ef41Sopenharmony_ci     * 2. The hash & prev table sizes are a multiple of 32 bytes (256 bits),
871cb0ef41Sopenharmony_ci     * since the NEON table slider moves two 128-bit items per loop (loop is
881cb0ef41Sopenharmony_ci     * unrolled on NEON for performance, see http://crbug.com/863257).
891cb0ef41Sopenharmony_ci     */
901cb0ef41Sopenharmony_ci    Assert(!((hash_size * sizeof(head[0])) & (32 - 1)),
911cb0ef41Sopenharmony_ci        "Hash table size error: should be a multiple of 32 bytes");
921cb0ef41Sopenharmony_ci    Assert(!((w_size * sizeof(prev[0])) & (32 - 1)),
931cb0ef41Sopenharmony_ci        "Prev table size error: should be a multiple of 32 bytes");
941cb0ef41Sopenharmony_ci
951cb0ef41Sopenharmony_ci    /*
961cb0ef41Sopenharmony_ci     * Duplicate (ush)w_size in each uint16_t component of a 128-bit vector.
971cb0ef41Sopenharmony_ci     */
981cb0ef41Sopenharmony_ci    const z_vec128i_u16x8_t vec_wsize = Z_SLIDE_INIT_SIMD(w_size);
991cb0ef41Sopenharmony_ci
1001cb0ef41Sopenharmony_ci    /*
1011cb0ef41Sopenharmony_ci     * Slide {head,prev} hash chain values: subtracts (ush)w_size from every
1021cb0ef41Sopenharmony_ci     * value with a saturating SIMD subtract, to clamp the result to 0(NIL),
1031cb0ef41Sopenharmony_ci     * to implement slide_hash() `(m >= wsize ? m - wsize : NIL);` code.
1041cb0ef41Sopenharmony_ci     */
1051cb0ef41Sopenharmony_ci    Z_SLIDE_HASH_SIMD(head, hash_size, vec_wsize);
1061cb0ef41Sopenharmony_ci#ifndef FASTEST
1071cb0ef41Sopenharmony_ci    Z_SLIDE_HASH_SIMD(prev, w_size, vec_wsize);
1081cb0ef41Sopenharmony_ci#endif
1091cb0ef41Sopenharmony_ci
1101cb0ef41Sopenharmony_ci}
1111cb0ef41Sopenharmony_ci
/*
 * The slider helper macros are private to this header: remove them so they
 * do not leak into files that include it. z_vec128i_u16x8_t is a typedef, not
 * a macro, so #undef has no effect on it; the type name stays visible to
 * includers.
 */
#undef Z_SLIDE_HASH_SIMD
#undef Z_SLIDE_INIT_SIMD
1151cb0ef41Sopenharmony_ci
1161cb0ef41Sopenharmony_ci#endif  /* SLIDE_HASH_SIMD_H */
117