/*
;uInt longest_match_x64(
;    deflate_state *s,
;    IPos cur_match);                             // current match

; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
;  (AMD64 on Athlon 64, Opteron, Phenom
;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
;  and by taking inspiration on asm686 with masm, optimised assembly code
;        from Brian Raiter, written 1998
;
;  This software is provided 'as-is', without any express or implied
;  warranty.  In no event will the authors be held liable for any damages
;  arising from the use of this software.
;
;  Permission is granted to anyone to use this software for any purpose,
;  including commercial applications, and to alter it and redistribute it
;  freely, subject to the following restrictions:
;
;  1. The origin of this software must not be misrepresented; you must not
;     claim that you wrote the original software. If you use this software
;     in a product, an acknowledgment in the product documentation would be
;     appreciated but is not required.
;  2. Altered source versions must be plainly marked as such, and must not be
;     misrepresented as being the original software
;  3. This notice may not be removed or altered from any source distribution.
;
;         http://www.zlib.net
;         http://www.winimage.com/zLibDll
;         http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; to compile this file for zLib, I use option:
;   gcc -c -arch x86_64 gvmat64.S


;uInt longest_match(s, cur_match)
;    deflate_state *s;
;    IPos cur_match;                             // current match /
;
; with XCode for Mac, I had strange error with some jump on intel syntax
; this is why BEFORE_JMP and AFTER_JMP are used
 */

/*
 * ALTERED SOURCE VERSION (see restriction 2 above).  Changes relative to the
 * original gvmat64.S:
 *   - added the register-role / stack-usage documentation block below;
 *   - added a .note.GNU-stack marker at the end of the file for Linux/ELF
 *     builds so the object does not request an executable stack.
 * The instruction sequence of longest_match is unchanged.
 */


/* Jump instructions are assembled in AT&T syntax and the file switches back
 * to Intel syntax right afterwards (work-around described in the header). */
#define BEFORE_JMP .att_syntax
#define AFTER_JMP .intel_syntax noprefix

/* Unless NO_UNDERLINE is defined, the exported symbols get a leading
 * underscore.  NOTE(review): ELF targets normally want NO_UNDERLINE defined
 * so that the C names resolve - confirm against the build flags. */
#ifndef NO_UNDERLINE
#	define	match_init	_match_init
#	define	longest_match	_longest_match
#endif

.intel_syntax noprefix

.globl	match_init, longest_match
.text

/*-----------------------------------------------------------------------
 * uInt longest_match(deflate_state *s, IPos cur_match)
 *
 * Syntax : GAS, Intel syntax (noprefix), preprocessed by cpp.
 * ABI    : System V AMD64 (arguments arrive in rdi / esi).
 * In     : rdi = s, esi = cur_match
 * Out    : eax = min(best_len, s->lookahead); s->match_start is updated
 *          whenever a longer match is found.
 * Stack  : leaf function; rsp is never adjusted.  The two locals and the
 *          callee-saved register copies live at rsp-88 .. rsp-9
 *          (rsp + 8..80 - LocalVarsSize), i.e. inside the 128-byte red zone.
 *
 * Register roles once set-up is complete:
 *   rcx  = s (base for every deflate_state field access)
 *   r8d  = cur_match
 *   r9   = scan            (window + strstart)
 *   r10  = window
 *   r11d = best_len
 *   r12w = scan_start      (first two bytes at scan)
 *   bx   = scan_end        (two bytes at scan + best_len - 1)
 *   r13  = (-scan) & 3     (alignment skew applied to the compare pointers)
 *   ebp  = limit
 *   lookup loop : edx = (chain_length << 16) | wmask,
 *                 rsi = window + best_len, rdi = prev
 *   compare loop: rsi = match pointer, rdi = scan pointer (both biased by
 *                 +0x108), rdx = negative running offset
 *
 * Many of the older comments below were inherited from the 32-bit version
 * and still name 32-bit registers (esi, edi, edx ...); read them as the
 * 64-bit registers listed above.
 *-----------------------------------------------------------------------*/
longest_match:



#define LocalVarsSize 96
/*
; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
; free register :  r14,r15
; register can be saved : rsp
*/

#define chainlenwmask     (rsp + 8 - LocalVarsSize)
#define nicematch         (rsp + 16 - LocalVarsSize)

#define save_rdi        (rsp + 24 - LocalVarsSize)
#define save_rsi        (rsp + 32 - LocalVarsSize)
#define save_rbx        (rsp + 40 - LocalVarsSize)
#define save_rbp        (rsp + 48 - LocalVarsSize)
#define save_r12        (rsp + 56 - LocalVarsSize)
#define save_r13        (rsp + 64 - LocalVarsSize)
#define save_r14        (rsp + 72 - LocalVarsSize)
#define save_r15        (rsp + 80 - LocalVarsSize)


/*
;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
;  in the deflate_state structure since the asm code was first written
;  (if you compile with zlib 1.0.4 or older, remove the +4).
;  Note : these value are good with a 8 bytes boundary pack structure
*/

#define    MAX_MATCH              258
#define    MIN_MATCH              3
#define    MIN_LOOKAHEAD          (MAX_MATCH+MIN_MATCH+1)

/*
;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)

;  all the +zlib1222add offsets are due to the addition of fields
;  in zlib in the deflate_state structure since the asm code was first written
;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
*/



/* you can check the structure offset by running

#include <stdlib.h>
#include <stdio.h>
#include "deflate.h"

void print_depl()
{
deflate_state ds;
deflate_state *s=&ds;
printf("size pointer=%u\n",(int)sizeof(void*));

printf("#define dsWSize         %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
printf("#define dsWMask         %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
printf("#define dsWindow        %u\n",(int)(((char*)&(s->window))-((char*)s)));
printf("#define dsPrev          %u\n",(int)(((char*)&(s->prev))-((char*)s)));
printf("#define dsMatchLen      %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
printf("#define dsPrevMatch     %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
printf("#define dsStrStart      %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
printf("#define dsMatchStart    %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
printf("#define dsLookahead     %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
printf("#define dsPrevLen       %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
printf("#define dsMaxChainLen   %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
printf("#define dsGoodMatch     %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
printf("#define dsNiceMatch     %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
}
*/

/* Hard-coded byte offsets into deflate_state.  They must match the layout of
 * the deflate.h this file is linked against; regenerate them with the helper
 * shown in the comment above if the structure changes. */
#define dsWSize          68
#define dsWMask          76
#define dsWindow         80
#define dsPrev           96
#define dsMatchLen       144
#define dsPrevMatch      148
#define dsStrStart       156
#define dsMatchStart     160
#define dsLookahead      164
#define dsPrevLen        168
#define dsMaxChainLen    172
#define dsGoodMatch      188
#define dsNiceMatch      192

#define window_size      [ rcx + dsWSize]
#define WMask            [ rcx + dsWMask]
#define window_ad        [ rcx + dsWindow]
#define prev_ad          [ rcx + dsPrev]
#define strstart         [ rcx + dsStrStart]
#define match_start      [ rcx + dsMatchStart]
#define Lookahead        [ rcx + dsLookahead] //; 0ffffffffh on infozip
#define prev_length      [ rcx + dsPrevLen]
#define max_chain_length [ rcx + dsMaxChainLen]
#define good_match       [ rcx + dsGoodMatch]
#define nice_match       [ rcx + dsNiceMatch]

/*
; windows:
; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)

; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.

;
; gcc on macosx-linux:
; see http://www.x86-64.org/documentation/abi-0.99.pdf
; param 1 in rdi, param 2 in rsi
; rbx, rsp, rbp, r12 to r15 must be preserved

;;; Save registers that the compiler may be using, and adjust esp to
;;; make room for our stack frame.


;;; Retrieve the function arguments. r8d will hold cur_match
;;; throughout the entire function. edx will hold the pointer to the
;;; deflate_state structure during the function's setup (before
;;; entering the main loop.

; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
; mac: param 1 in rdi, param 2 rsi
; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
*/
        /* Spill callee-saved registers into the red zone (rsp is untouched). */
        mov [save_rbx],rbx
        mov [save_rbp],rbp


        /* rcx = s: every field macro above addresses memory relative to rcx. */
        mov rcx,rdi

        /* r8d = cur_match; the 32-bit move zero-extends into r8. */
        mov r8d,esi


        mov [save_r12],r12
        mov [save_r13],r13
        mov [save_r14],r14
        mov [save_r15],r15


//;;; uInt wmask = s->w_mask;
//;;; unsigned chain_length = s->max_chain_length;
//;;; if (s->prev_length >= s->good_match) {
//;;;     chain_length >>= 2;
//;;; }


        mov edi, prev_length
        mov esi, good_match
        mov eax, WMask
        mov ebx, max_chain_length
        cmp edi, esi
        jl  LastMatchGood
        shr ebx, 2
LastMatchGood:

//;;; chainlen is decremented once beforehand so that the function can
//;;; use the sign flag instead of the zero flag for the exit test.
//;;; It is then shifted into the high word, to make room for the wmask
//;;; value, which it will always accompany.

        dec ebx
        shl ebx, 16
        or  ebx, eax

//;;; on zlib only
//;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;



        mov eax, nice_match
        mov [chainlenwmask], ebx
        mov r10d, Lookahead
        cmp r10d, eax
        cmovnl r10d, eax                /* r10d = min(lookahead, nice_match) */
        mov [nicematch],r10d



//;;; register Bytef *scan = s->window + s->strstart;
        mov r10, window_ad
        mov ebp, strstart
        lea r13, [r10 + rbp]

//;;; Determine how many bytes the scan ptr is off from being
//;;; dword-aligned.

        mov r9,r13                      /* r9 = scan */
        neg r13
        and r13,3                       /* r13 = (-scan) & 3 */

//;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
//;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;


        mov eax, window_size
        sub eax, MIN_LOOKAHEAD


        xor edi,edi
        sub ebp, eax

        /* mov does not modify flags, so the cmovng below still tests the
         * result of the sub above. */
        mov r11d, prev_length

        cmovng ebp,edi                  /* limit = 0 when strstart <= MAX_DIST */

//;;; int best_len = s->prev_length;


//;;; Store the sum of s->window + best_len in esi locally, and in esi.

       lea  rsi,[r10+r11]

//;;; register ush scan_start = *(ushf*)scan;
//;;; register ush scan_end = *(ushf*)(scan+best_len-1);
//;;; Posf *prev = s->prev;

        movzx r12d,word ptr [r9]
        movzx ebx, word ptr [r9 + r11 - 1]

        mov rdi, prev_ad

//;;; Jump into the main loop.

        mov edx, [chainlenwmask]

        cmp bx,word ptr [rsi + r8 - 1]
        jz  LookupLoopIsZero



LookupLoop1:
        and r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
        jbe LeaveNow



        sub edx, 0x00010000
		BEFORE_JMP
        js  LeaveNow
		AFTER_JMP

LoopEntry1:
        cmp bx,word ptr [rsi + r8 - 1]
		BEFORE_JMP
        jz  LookupLoopIsZero
		AFTER_JMP

LookupLoop2:
        and r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
		BEFORE_JMP
        jbe LeaveNow
		AFTER_JMP
        sub edx, 0x00010000
		BEFORE_JMP
        js  LeaveNow
		AFTER_JMP

LoopEntry2:
        cmp bx,word ptr [rsi + r8 - 1]
		BEFORE_JMP
        jz  LookupLoopIsZero
		AFTER_JMP

LookupLoop4:
        and r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
		BEFORE_JMP
        jbe LeaveNow
		AFTER_JMP
        sub edx, 0x00010000
		BEFORE_JMP
        js  LeaveNow
		AFTER_JMP

LoopEntry4:

        cmp bx,word ptr [rsi + r8 - 1]
		BEFORE_JMP
        jnz LookupLoop1
        jmp LookupLoopIsZero
		AFTER_JMP
/*
;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; esi = windowbestlen - i.e., (window + bestlen)
;;; edi = prev
;;; ebp = limit
*/
.balign 16
LookupLoop:
        and r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
		BEFORE_JMP
        jbe LeaveNow
		AFTER_JMP
        sub edx, 0x00010000
		BEFORE_JMP
        js  LeaveNow
		AFTER_JMP

LoopEntry:

        cmp bx,word ptr [rsi + r8 - 1]
		BEFORE_JMP
        jnz LookupLoop1
		AFTER_JMP
LookupLoopIsZero:
        cmp     r12w, word ptr [r10 + r8]
		BEFORE_JMP
        jnz LookupLoop1
		AFTER_JMP


//;;; Store the current value of chainlen.
        mov [chainlenwmask], edx
/*
;;; Point edi to the string under scrutiny, and esi to the string we
;;; are hoping to match it up with. In actuality, esi and edi are
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
;;; initialized to -(MAX_MATCH_8 - scanalign).
*/
        lea rsi,[r8+r10]
        mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
        lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
        lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]

        prefetcht1 [rsi+rdx]
        prefetcht1 [rdi+rdx]

/*
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the edi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.
*/

LoopCmps:
        mov rax, [rsi + rdx]
        xor rax, [rdi + rdx]
        jnz LeaveLoopCmps

        mov rax, [rsi + rdx + 8]
        xor rax, [rdi + rdx + 8]
        jnz LeaveLoopCmps8


        mov rax, [rsi + rdx + 8+8]
        xor rax, [rdi + rdx + 8+8]
        jnz LeaveLoopCmps16

        /* rdx starts at -0x108 (a multiple of 24) and climbs towards zero;
         * reaching zero means no mismatch was found in the compared span. */
        add rdx,8+8+8

		BEFORE_JMP
        jnz  LoopCmps
        jmp  LenMaximum
		AFTER_JMP

LeaveLoopCmps16: add rdx,8
LeaveLoopCmps8: add rdx,8
LeaveLoopCmps:

        /* rax holds the XOR of the 8 bytes that differ; the lowest non-zero
         * byte marks the first mismatch.  Narrow it down 4 -> 2 -> 1 bytes
         * while advancing rdx accordingly. */
        test    eax, 0x0000FFFF
        jnz LenLower

        test eax,0xffffffff

        jnz LenLower32

        add rdx,4
        shr rax,32
        or ax,ax
		BEFORE_JMP
        jnz LenLower
		AFTER_JMP

LenLower32:
        shr eax,16
        add rdx,2

LenLower:
        /* sub al,1 sets the carry flag only when al was 0, i.e. when the low
         * byte of the pair matched; adc then steps rdx past that byte. */
        sub al, 1
        adc rdx, 0
//;;; Calculate the length of the match. If it is longer than MAX_MATCH,
//;;; then automatically accept it as the best possible match and leave.

        lea rax, [rdi + rdx]
        sub rax, r9
        cmp eax, MAX_MATCH
		BEFORE_JMP
        jge LenMaximum
		AFTER_JMP
/*
;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
;///////////////////////////////////
*/
        cmp eax, r11d
        jg  LongerMatch

        /* Not an improvement: reload the lookup-loop registers that the
         * compare loop overwrote (rsi, rdi, edx) and continue the chain. */
        lea rsi,[r10+r11]

        mov rdi, prev_ad
        mov edx, [chainlenwmask]
		BEFORE_JMP
        jmp LookupLoop
		AFTER_JMP
/*
;;;         s->match_start = cur_match;
;;;         best_len = len;
;;;         if (len >= nice_match) break;
;;;         scan_end = *(ushf*)(scan+best_len-1);
*/
LongerMatch:
        mov r11d, eax
        mov match_start, r8d
        cmp eax, [nicematch]
		BEFORE_JMP
        jge LeaveNow
		AFTER_JMP

        lea rsi,[r10+rax]

        movzx   ebx, word ptr [r9 + rax - 1]
        mov rdi, prev_ad
        mov edx, [chainlenwmask]
		BEFORE_JMP
        jmp LookupLoop
		AFTER_JMP

//;;; Accept the current string, with the maximum possible length.

LenMaximum:
        mov r11d,MAX_MATCH
        mov match_start, r8d

//;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
//;;; return s->lookahead;

LeaveNow:
        mov eax, Lookahead
        cmp r11d, eax
        cmovng eax, r11d                /* eax = min(best_len, lookahead) */



//;;; Restore the stack and return from whence we came.


//        mov rsi,[save_rsi]
//        mov rdi,[save_rdi]
        mov rbx,[save_rbx]
        mov rbp,[save_rbp]
        mov r12,[save_r12]
        mov r13,[save_r13]
        mov r14,[save_r14]
        mov r15,[save_r15]


        ret 0
//; please don't remove this string !
//; Your can freely use gvmat64 in any free or commercial app
//; but it is far better don't remove the string in the binary!
 //   db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0


/* void match_init(void): nothing to initialise for this implementation. */
match_init:
  ret 0


#if defined(__linux__) && defined(__ELF__)
/* Tell the GNU linker that this object does not need an executable stack.
 * Without this note, linking a hand-written assembly object makes the whole
 * output default to an executable stack. */
.section .note.GNU-stack,"",@progbits
#endif