/* Bra86.c -- Branch converter for X86 code (BCJ)
2023-04-02 : Igor Pavlov : Public domain */
3370b324cSopenharmony_ci
4370b324cSopenharmony_ci#include "Precomp.h"
5370b324cSopenharmony_ci
6370b324cSopenharmony_ci#include "Bra.h"
7370b324cSopenharmony_ci#include "CpuArch.h"
8370b324cSopenharmony_ci
9370b324cSopenharmony_ci
/* The pointer-based program-counter form is selected only when
   MY_CPU_SIZEOF_POINTER is defined and equals 4 or 8. */
#if defined(MY_CPU_SIZEOF_POINTER) \
    && ( MY_CPU_SIZEOF_POINTER == 4 \
      || MY_CPU_SIZEOF_POINTER == 8)
  #define BR_CONV_USE_OPT_PC_PTR
#endif

/* BR_PC_INIT rebases (pc) once, before scanning starts.
   BR_PC_GET then gives the program-counter value that matches the current (p).
   Both macros expand inside the converter function and use its locals:
     - pointer form : (pc) is biased by the low 32 bits of the address in (p),
                      so BR_PC_GET is just (pc + p); the converter adds 4 to
                      (pc) itself before BR_PC_INIT in this form.
     - fallback form: (pc) is advanced by (size) and BR_PC_GET subtracts the
                      distance from (p) to (lim); (lim) is (data + size - 4),
                      which already supplies the +4. */
#ifdef BR_CONV_USE_OPT_PC_PTR
#define BR_PC_INIT  pc -= (UInt32)(SizeT)p; // (MY_uintptr_t)
#define BR_PC_GET   (pc + (UInt32)(SizeT)p)
#else
#define BR_PC_INIT  pc += (UInt32)size;
#define BR_PC_GET   (pc - (UInt32)(SizeT)(lim - p))
// #define BR_PC_INIT
// #define BR_PC_GET   (pc + (UInt32)(SizeT)(p - data))
#endif
25370b324cSopenharmony_ci
/* Adds (c) to (v) when (encoding) is nonzero, subtracts it otherwise.
   Expands to a complete if/else statement (with its own ';') that reads the
   (encoding) variable of the enclosing function, so call sites are written
   without a trailing semicolon. */
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;

/* Forms the identifier z7_BranchConvSt_<name>. */
#define Z7_BRANCH_CONV_ST(name) z7_BranchConvSt_ ## name

/* True when the low 8 bits of (b) are 0x00 or 0xFF. */
#define BR86_NEED_CONV_FOR_MS_BYTE(b) ((((b) + 1) & 0xfe) == 0)
32370b324cSopenharmony_ci
/* Scan helpers for one 4-byte group.
   BR86_PREPARE_BCJ_SCAN is placed before (p) is advanced by 4;
   BR86_IS_BCJ_BYTE(n) is evaluated after the advance and is true when
   byte n (0..3) of that group is 0xE8 or 0xE9.
     - MY_CPU_LE_UNALIGN: the group is loaded once with GetUi32 and XOR-ed
       with 0xe8e8e8e8, so a matching byte has its upper 7 bits clear.
     - otherwise: each byte is read back individually through p[(n) - 4]. */
#ifdef MY_CPU_LE_UNALIGN
  #define BR86_PREPARE_BCJ_SCAN  const UInt32 v = GetUi32(p) ^ 0xe8e8e8e8;
  #define BR86_IS_BCJ_BYTE(n)    ((v & ((UInt32)0xfe << (n) * 8)) == 0)
#else
  #define BR86_PREPARE_BCJ_SCAN
  // bad for MSVC X86 (partial write to byte reg):
  #define BR86_IS_BCJ_BYTE(n)    ((p[(n) - 4] & 0xfe) == 0xe8)
  // bad for old MSVC (partial write to byte reg):
  // #define BR86_IS_BCJ_BYTE(n)    (((*p ^ 0xe8) & 0xfe) == 0)
#endif
43370b324cSopenharmony_ci
44370b324cSopenharmony_cistatic
45370b324cSopenharmony_ciZ7_FORCE_INLINE
46370b324cSopenharmony_ciZ7_ATTRIB_NO_VECTOR
47370b324cSopenharmony_ciByte *Z7_BRANCH_CONV_ST(X86)(Byte *p, SizeT size, UInt32 pc, UInt32 *state, int encoding)
48370b324cSopenharmony_ci{
49370b324cSopenharmony_ci  if (size < 5)
50370b324cSopenharmony_ci    return p;
51370b324cSopenharmony_ci {
52370b324cSopenharmony_ci  // Byte *p = data;
53370b324cSopenharmony_ci  const Byte *lim = p + size - 4;
54370b324cSopenharmony_ci  unsigned mask = (unsigned)*state;  // & 7;
55370b324cSopenharmony_ci#ifdef BR_CONV_USE_OPT_PC_PTR
56370b324cSopenharmony_ci  /* if BR_CONV_USE_OPT_PC_PTR is defined: we need to adjust (pc) for (+4),
57370b324cSopenharmony_ci        because call/jump offset is relative to the next instruction.
58370b324cSopenharmony_ci     if BR_CONV_USE_OPT_PC_PTR is not defined : we don't need to adjust (pc) for (+4),
59370b324cSopenharmony_ci         because  BR_PC_GET uses (pc - (lim - p)), and lim was adjusted for (-4) before.
60370b324cSopenharmony_ci  */
61370b324cSopenharmony_ci  pc += 4;
62370b324cSopenharmony_ci#endif
63370b324cSopenharmony_ci  BR_PC_INIT
64370b324cSopenharmony_ci  goto start;
65370b324cSopenharmony_ci
66370b324cSopenharmony_ci  for (;; mask |= 4)
67370b324cSopenharmony_ci  {
68370b324cSopenharmony_ci    // cont: mask |= 4;
69370b324cSopenharmony_ci  start:
70370b324cSopenharmony_ci    if (p >= lim)
71370b324cSopenharmony_ci      goto fin;
72370b324cSopenharmony_ci    {
73370b324cSopenharmony_ci      BR86_PREPARE_BCJ_SCAN
74370b324cSopenharmony_ci      p += 4;
75370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(0))  { goto m0; }  mask >>= 1;
76370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(1))  { goto m1; }  mask >>= 1;
77370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(2))  { goto m2; }  mask = 0;
78370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(3))  { goto a3; }
79370b324cSopenharmony_ci    }
80370b324cSopenharmony_ci    goto main_loop;
81370b324cSopenharmony_ci
82370b324cSopenharmony_ci  m0: p--;
83370b324cSopenharmony_ci  m1: p--;
84370b324cSopenharmony_ci  m2: p--;
85370b324cSopenharmony_ci    if (mask == 0)
86370b324cSopenharmony_ci      goto a3;
87370b324cSopenharmony_ci    if (p > lim)
88370b324cSopenharmony_ci      goto fin_p;
89370b324cSopenharmony_ci
90370b324cSopenharmony_ci    // if (((0x17u >> mask) & 1) == 0)
91370b324cSopenharmony_ci    if (mask > 4 || mask == 3)
92370b324cSopenharmony_ci    {
93370b324cSopenharmony_ci      mask >>= 1;
94370b324cSopenharmony_ci      continue; // goto cont;
95370b324cSopenharmony_ci    }
96370b324cSopenharmony_ci    mask >>= 1;
97370b324cSopenharmony_ci    if (BR86_NEED_CONV_FOR_MS_BYTE(p[mask]))
98370b324cSopenharmony_ci      continue; // goto cont;
99370b324cSopenharmony_ci    // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont;
100370b324cSopenharmony_ci    {
101370b324cSopenharmony_ci      UInt32 v = GetUi32(p);
102370b324cSopenharmony_ci      UInt32 c;
103370b324cSopenharmony_ci      v += (1 << 24);  if (v & 0xfe000000) continue; // goto cont;
104370b324cSopenharmony_ci      c = BR_PC_GET;
105370b324cSopenharmony_ci      BR_CONVERT_VAL(v, c)
106370b324cSopenharmony_ci      {
107370b324cSopenharmony_ci        mask <<= 3;
108370b324cSopenharmony_ci        if (BR86_NEED_CONV_FOR_MS_BYTE(v >> mask))
109370b324cSopenharmony_ci        {
110370b324cSopenharmony_ci          v ^= (((UInt32)0x100 << mask) - 1);
111370b324cSopenharmony_ci          #ifdef MY_CPU_X86
112370b324cSopenharmony_ci          // for X86 : we can recalculate (c) to reduce register pressure
113370b324cSopenharmony_ci            c = BR_PC_GET;
114370b324cSopenharmony_ci          #endif
115370b324cSopenharmony_ci          BR_CONVERT_VAL(v, c)
116370b324cSopenharmony_ci        }
117370b324cSopenharmony_ci        mask = 0;
118370b324cSopenharmony_ci      }
119370b324cSopenharmony_ci      // v = (v & ((1 << 24) - 1)) - (v & (1 << 24));
120370b324cSopenharmony_ci      v &= (1 << 25) - 1;  v -= (1 << 24);
121370b324cSopenharmony_ci      SetUi32(p, v)
122370b324cSopenharmony_ci      p += 4;
123370b324cSopenharmony_ci      goto main_loop;
124370b324cSopenharmony_ci    }
125370b324cSopenharmony_ci
126370b324cSopenharmony_ci  main_loop:
127370b324cSopenharmony_ci    if (p >= lim)
128370b324cSopenharmony_ci      goto fin;
129370b324cSopenharmony_ci    for (;;)
130370b324cSopenharmony_ci    {
131370b324cSopenharmony_ci      BR86_PREPARE_BCJ_SCAN
132370b324cSopenharmony_ci      p += 4;
133370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(0))  { goto a0; }
134370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(1))  { goto a1; }
135370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(2))  { goto a2; }
136370b324cSopenharmony_ci      if (BR86_IS_BCJ_BYTE(3))  { goto a3; }
137370b324cSopenharmony_ci      if (p >= lim)
138370b324cSopenharmony_ci        goto fin;
139370b324cSopenharmony_ci    }
140370b324cSopenharmony_ci
141370b324cSopenharmony_ci  a0: p--;
142370b324cSopenharmony_ci  a1: p--;
143370b324cSopenharmony_ci  a2: p--;
144370b324cSopenharmony_ci  a3:
145370b324cSopenharmony_ci    if (p > lim)
146370b324cSopenharmony_ci      goto fin_p;
147370b324cSopenharmony_ci    // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont;
148370b324cSopenharmony_ci    {
149370b324cSopenharmony_ci      UInt32 v = GetUi32(p);
150370b324cSopenharmony_ci      UInt32 c;
151370b324cSopenharmony_ci      v += (1 << 24);  if (v & 0xfe000000) continue; // goto cont;
152370b324cSopenharmony_ci      c = BR_PC_GET;
153370b324cSopenharmony_ci      BR_CONVERT_VAL(v, c)
154370b324cSopenharmony_ci      // v = (v & ((1 << 24) - 1)) - (v & (1 << 24));
155370b324cSopenharmony_ci      v &= (1 << 25) - 1;  v -= (1 << 24);
156370b324cSopenharmony_ci      SetUi32(p, v)
157370b324cSopenharmony_ci      p += 4;
158370b324cSopenharmony_ci      goto main_loop;
159370b324cSopenharmony_ci    }
160370b324cSopenharmony_ci  }
161370b324cSopenharmony_ci
162370b324cSopenharmony_cifin_p:
163370b324cSopenharmony_ci  p--;
164370b324cSopenharmony_cifin:
165370b324cSopenharmony_ci  // the following processing for tail is optional and can be commented
166370b324cSopenharmony_ci  /*
167370b324cSopenharmony_ci  lim += 4;
168370b324cSopenharmony_ci  for (; p < lim; p++, mask >>= 1)
169370b324cSopenharmony_ci    if ((*p & 0xfe) == 0xe8)
170370b324cSopenharmony_ci      break;
171370b324cSopenharmony_ci  */
172370b324cSopenharmony_ci  *state = (UInt32)mask;
173370b324cSopenharmony_ci  return p;
174370b324cSopenharmony_ci }
175370b324cSopenharmony_ci}
176370b324cSopenharmony_ci
177370b324cSopenharmony_ci
178370b324cSopenharmony_ci#define Z7_BRANCH_CONV_ST_FUNC_IMP(name, m, encoding) \
179370b324cSopenharmony_ciZ7_NO_INLINE \
180370b324cSopenharmony_ciZ7_ATTRIB_NO_VECTOR \
181370b324cSopenharmony_ciByte *m(name)(Byte *data, SizeT size, UInt32 pc, UInt32 *state) \
182370b324cSopenharmony_ci  { return Z7_BRANCH_CONV_ST(name)(data, size, pc, state, encoding); }
183370b324cSopenharmony_ci
184370b324cSopenharmony_ciZ7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_DEC, 0)
185370b324cSopenharmony_ci#ifndef Z7_EXTRACT_ONLY
186370b324cSopenharmony_ciZ7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_ENC, 1)
187370b324cSopenharmony_ci#endif
188