Qt/QML edition

This commit is contained in:
Floris Bos 2020-03-04 16:55:40 +01:00
commit d7b361ba44
2168 changed files with 721948 additions and 0 deletions

View file

@ -0,0 +1,2 @@
ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
ml64.exe /Flgvmat64 /c /Zi gvmat64.asm

View file

@ -0,0 +1,553 @@
;uInt longest_match_x64(
; deflate_state *s,
; IPos cur_match); /* current match */
; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
; (AMD64 on Athlon 64, Opteron, Phenom
; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
;
; and by taking inspiration on asm686 with masm, optimised assembly code
; from Brian Raiter, written 1998
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software
; 3. This notice may not be removed or altered from any source distribution.
;
;
;
; http://www.zlib.net
; http://www.winimage.com/zLibDll
; http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; to compile this file for infozip Zip, I use option:
; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
;
; to compile this file for zLib, I use option:
; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
; Be carrefull to adapt zlib1222add below to your version of zLib
; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
; value of zlib1222add later)
;
; This file compile with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
; (you can get Windows WDK with ml64 for AMD64 from
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;
;uInt longest_match(s, cur_match)
; deflate_state *s;
; IPos cur_match; /* current match */
.code
longest_match PROC
;LocalVarsSize equ 88
LocalVarsSize equ 72
; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
; free register : r14,r15
; register can be saved : rsp
chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
; low word: s->wmask
;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
IFDEF INFOZIP
ELSE
nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size
ENDIF
save_rdi equ rsp + 24 - LocalVarsSize
save_rsi equ rsp + 32 - LocalVarsSize
save_rbx equ rsp + 40 - LocalVarsSize
save_rbp equ rsp + 48 - LocalVarsSize
save_r12 equ rsp + 56 - LocalVarsSize
save_r13 equ rsp + 64 - LocalVarsSize
;save_r14 equ rsp + 72 - LocalVarsSize
;save_r15 equ rsp + 80 - LocalVarsSize
; summary of register usage
; scanend ebx
; scanendw bx
; chainlenwmask edx
; curmatch rsi
; curmatchd esi
; windowbestlen r8
; scanalign r9
; scanalignd r9d
; window r10
; bestlen r11
; bestlend r11d
; scanstart r12d
; scanstartw r12w
; scan r13
; nicematch r14d
; limit r15
; limitd r15d
; prev rcx
; all the +4 offsets are due to the addition of pending_buf_size (in zlib
; in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, remove the +4).
; Note : these value are good with a 8 bytes boundary pack structure
MAX_MATCH equ 258
MIN_MATCH equ 3
MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
; all the +zlib1222add offsets are due to the addition of fields
; in zlib in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
IFDEF INFOZIP
_DATA SEGMENT
COMM window_size:DWORD
; WMask ; 7fff
COMM window:BYTE:010040H
COMM prev:WORD:08000H
; MatchLen : unused
; PrevMatch : unused
COMM strstart:DWORD
COMM match_start:DWORD
; Lookahead : ignore
COMM prev_length:DWORD ; PrevLen
COMM max_chain_length:DWORD
COMM good_match:DWORD
COMM nice_match:DWORD
prev_ad equ OFFSET prev
window_ad equ OFFSET window
nicematch equ nice_match
_DATA ENDS
WMask equ 07fffh
ELSE
IFNDEF zlib1222add
zlib1222add equ 8
ENDIF
dsWSize equ 56+zlib1222add+(zlib1222add/2)
dsWMask equ 64+zlib1222add+(zlib1222add/2)
dsWindow equ 72+zlib1222add
dsPrev equ 88+zlib1222add
dsMatchLen equ 128+zlib1222add
dsPrevMatch equ 132+zlib1222add
dsStrStart equ 140+zlib1222add
dsMatchStart equ 144+zlib1222add
dsLookahead equ 148+zlib1222add
dsPrevLen equ 152+zlib1222add
dsMaxChainLen equ 156+zlib1222add
dsGoodMatch equ 172+zlib1222add
dsNiceMatch equ 176+zlib1222add
window_size equ [ rcx + dsWSize]
WMask equ [ rcx + dsWMask]
window_ad equ [ rcx + dsWindow]
prev_ad equ [ rcx + dsPrev]
strstart equ [ rcx + dsStrStart]
match_start equ [ rcx + dsMatchStart]
Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
prev_length equ [ rcx + dsPrevLen]
max_chain_length equ [ rcx + dsMaxChainLen]
good_match equ [ rcx + dsGoodMatch]
nice_match equ [ rcx + dsNiceMatch]
ENDIF
; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
;;; Save registers that the compiler may be using, and adjust esp to
;;; make room for our stack frame.
;;; Retrieve the function arguments. r8d will hold cur_match
;;; throughout the entire function. edx will hold the pointer to the
;;; deflate_state structure during the function's setup (before
;;; entering the main loop.
; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
mov [save_rdi],rdi
mov [save_rsi],rsi
mov [save_rbx],rbx
mov [save_rbp],rbp
IFDEF INFOZIP
mov r8d,ecx
ELSE
mov r8d,edx
ENDIF
mov [save_r12],r12
mov [save_r13],r13
; mov [save_r14],r14
; mov [save_r15],r15
;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;; chain_length >>= 2;
;;; }
mov edi, prev_length
mov esi, good_match
mov eax, WMask
mov ebx, max_chain_length
cmp edi, esi
jl LastMatchGood
shr ebx, 2
LastMatchGood:
;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.
dec ebx
shl ebx, 16
or ebx, eax
;;; on zlib only
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
IFDEF INFOZIP
mov [chainlenwmask], ebx
; on infozip nice_match = [nice_match]
ELSE
mov eax, nice_match
mov [chainlenwmask], ebx
mov r10d, Lookahead
cmp r10d, eax
cmovnl r10d, eax
mov [nicematch],r10d
ENDIF
;;; register Bytef *scan = s->window + s->strstart;
mov r10, window_ad
mov ebp, strstart
lea r13, [r10 + rbp]
;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned.
mov r9,r13
neg r13
and r13,3
;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
IFDEF INFOZIP
mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
ELSE
mov eax, window_size
sub eax, MIN_LOOKAHEAD
ENDIF
xor edi,edi
sub ebp, eax
mov r11d, prev_length
cmovng ebp,edi
;;; int best_len = s->prev_length;
;;; Store the sum of s->window + best_len in esi locally, and in esi.
lea rsi,[r10+r11]
;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;
movzx r12d,word ptr [r9]
movzx ebx, word ptr [r9 + r11 - 1]
mov rdi, prev_ad
;;; Jump into the main loop.
mov edx, [chainlenwmask]
cmp bx,word ptr [rsi + r8 - 1]
jz LookupLoopIsZero
LookupLoop1:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry1:
cmp bx,word ptr [rsi + r8 - 1]
jz LookupLoopIsZero
LookupLoop2:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry2:
cmp bx,word ptr [rsi + r8 - 1]
jz LookupLoopIsZero
LookupLoop4:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry4:
cmp bx,word ptr [rsi + r8 - 1]
jnz LookupLoop1
jmp LookupLoopIsZero
;;; do {
;;; match = s->window + cur_match;
;;; if (*(ushf*)(match+best_len-1) != scan_end ||
;;; *(ushf*)match != scan_start) continue;
;;; [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;; && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; esi = windowbestlen - i.e., (window + bestlen)
;;; edi = prev
;;; ebp = limit
LookupLoop:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry:
cmp bx,word ptr [rsi + r8 - 1]
jnz LookupLoop1
LookupLoopIsZero:
cmp r12w, word ptr [r10 + r8]
jnz LookupLoop1
;;; Store the current value of chainlen.
mov [chainlenwmask], edx
;;; Point edi to the string under scrutiny, and esi to the string we
;;; are hoping to match it up with. In actuality, esi and edi are
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
;;; initialized to -(MAX_MATCH_8 - scanalign).
lea rsi,[r8+r10]
mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
prefetcht1 [rsi+rdx]
prefetcht1 [rdi+rdx]
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the edi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.
LoopCmps:
mov rax, [rsi + rdx]
xor rax, [rdi + rdx]
jnz LeaveLoopCmps
mov rax, [rsi + rdx + 8]
xor rax, [rdi + rdx + 8]
jnz LeaveLoopCmps8
mov rax, [rsi + rdx + 8+8]
xor rax, [rdi + rdx + 8+8]
jnz LeaveLoopCmps16
add rdx,8+8+8
jnz short LoopCmps
jmp short LenMaximum
LeaveLoopCmps16: add rdx,8
LeaveLoopCmps8: add rdx,8
LeaveLoopCmps:
test eax, 0000FFFFh
jnz LenLower
test eax,0ffffffffh
jnz LenLower32
add rdx,4
shr rax,32
or ax,ax
jnz LenLower
LenLower32:
shr eax,16
add rdx,2
LenLower: sub al, 1
adc rdx, 0
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.
lea rax, [rdi + rdx]
sub rax, r9
cmp eax, MAX_MATCH
jge LenMaximum
;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
;///////////////////////////////////
cmp eax, r11d
jg LongerMatch
lea rsi,[r10+r11]
mov rdi, prev_ad
mov edx, [chainlenwmask]
jmp LookupLoop
;;; s->match_start = cur_match;
;;; best_len = len;
;;; if (len >= nice_match) break;
;;; scan_end = *(ushf*)(scan+best_len-1);
LongerMatch:
mov r11d, eax
mov match_start, r8d
cmp eax, [nicematch]
jge LeaveNow
lea rsi,[r10+rax]
movzx ebx, word ptr [r9 + rax - 1]
mov rdi, prev_ad
mov edx, [chainlenwmask]
jmp LookupLoop
;;; Accept the current string, with the maximum possible length.
LenMaximum:
mov r11d,MAX_MATCH
mov match_start, r8d
;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;
LeaveNow:
IFDEF INFOZIP
mov eax,r11d
ELSE
mov eax, Lookahead
cmp r11d, eax
cmovng eax, r11d
ENDIF
;;; Restore the stack and return from whence we came.
mov rsi,[save_rsi]
mov rdi,[save_rdi]
mov rbx,[save_rbx]
mov rbp,[save_rbp]
mov r12,[save_r12]
mov r13,[save_r13]
; mov r14,[save_r14]
; mov r15,[save_r15]
ret 0
; please don't remove this string !
; Your can freely use gvmat64 in any free or commercial app
; but it is far better don't remove the string in the binary!
db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
longest_match ENDP
match_init PROC
ret 0
match_init ENDP
END

View file

@ -0,0 +1,186 @@
/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
* version for AMD64 on Windows using Microsoft C compiler
*
* Copyright (C) 1995-2003 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
* Please use the copyright conditions above.
*
* 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
*
* inffas8664.c call function inffas8664fnc in inffasx64.asm
* inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
*
* Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
* slightly quicker on x86 systems because, instead of using rep movsb to copy
* data, it uses rep movsw, which moves data in 2-byte chunks instead of single
* bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
* from http://fedora.linux.duke.edu/fc1_x86_64
* which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
* 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
* when decompressing mozilla-source-1.3.tar.gz.
*
* Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
* the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
* the moment. I have successfully compiled and tested this code with gcc2.96,
* gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
* compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
* enabled. I will attempt to merge the MMX code into this version. Newer
* versions of this and inffast.S can be found at
* http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
*
*/
#include <stdio.h>
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
/* Mark Adler's comments from inffast.c: */
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= 6
strm->avail_out >= 258
start >= strm->avail_out
state->bits < 8
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
*/
typedef struct inffast_ar {
/* 64 32 x86 x86_64 */
/* ar offset register */
/* 0 0 */ void *esp; /* esp save */
/* 8 4 */ void *ebp; /* ebp save */
/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */
/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
/* 92 48 */ unsigned wsize; /* window size */
/* 96 52 */ unsigned write; /* window write index */
/*100 56 */ unsigned lmask; /* r12 mask for lcode */
/*104 60 */ unsigned dmask; /* r13 mask for dcode */
/*108 64 */ unsigned len; /* r14 match length */
/*112 68 */ unsigned dist; /* r15 match distance */
/*116 72 */ unsigned status; /* set when state chng*/
} type_ar;
#ifdef ASMINF
void inflate_fast(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
type_ar ar;
void inffas8664fnc(struct inffast_ar * par);
#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
#define PAD_AVAIL_IN 6
#define PAD_AVAIL_OUT 258
#else
#define PAD_AVAIL_IN 5
#define PAD_AVAIL_OUT 257
#endif
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
ar.in = strm->next_in;
ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
ar.out = strm->next_out;
ar.beg = ar.out - (start - strm->avail_out);
ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
ar.wsize = state->wsize;
ar.write = state->wnext;
ar.window = state->window;
ar.hold = state->hold;
ar.bits = state->bits;
ar.lcode = state->lencode;
ar.dcode = state->distcode;
ar.lmask = (1U << state->lenbits) - 1;
ar.dmask = (1U << state->distbits) - 1;
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
/* align in on 1/2 hold size boundary */
while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
ar.hold += (unsigned long)*ar.in++ << ar.bits;
ar.bits += 8;
}
inffas8664fnc(&ar);
if (ar.status > 1) {
if (ar.status == 2)
strm->msg = "invalid literal/length code";
else if (ar.status == 3)
strm->msg = "invalid distance code";
else
strm->msg = "invalid distance too far back";
state->mode = BAD;
}
else if ( ar.status == 1 ) {
state->mode = TYPE;
}
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
ar.len = ar.bits >> 3;
ar.in -= ar.len;
ar.bits -= ar.len << 3;
ar.hold &= (1U << ar.bits) - 1;
/* update state and return */
strm->next_in = ar.in;
strm->next_out = ar.out;
strm->avail_in = (unsigned)(ar.in < ar.last ?
PAD_AVAIL_IN + (ar.last - ar.in) :
PAD_AVAIL_IN - (ar.in - ar.last));
strm->avail_out = (unsigned)(ar.out < ar.end ?
PAD_AVAIL_OUT + (ar.end - ar.out) :
PAD_AVAIL_OUT - (ar.out - ar.end));
state->hold = (unsigned long)ar.hold;
state->bits = ar.bits;
return;
}
#endif

View file

@ -0,0 +1,396 @@
; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using Microsoft C compiler
;
; inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contain more info.
; to compile this file, I use option
; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
; with Microsoft Macro Assembler (x64) for AMD64
;
; This file compile with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
; (you can get Windows WDK with ml64 for AMD64 from
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;
.code
inffas8664fnc PROC
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch.
mov [rsp-8],rsi
mov [rsp-16],rdi
mov [rsp-24],r12
mov [rsp-32],r13
mov [rsp-40],r14
mov [rsp-48],r15
mov [rsp-56],rbx
mov rax,rcx
mov [rax+8], rbp ; /* save regs rbp and rsp */
mov [rax], rsp
mov rsp, rax ; /* make rsp point to &ar */
mov rsi, [rsp+16] ; /* rsi = in */
mov rdi, [rsp+32] ; /* rdi = out */
mov r9, [rsp+24] ; /* r9 = last */
mov r10, [rsp+48] ; /* r10 = end */
mov rbp, [rsp+64] ; /* rbp = lcode */
mov r11, [rsp+72] ; /* r11 = dcode */
mov rdx, [rsp+80] ; /* rdx = hold */
mov ebx, [rsp+88] ; /* ebx = bits */
mov r12d, [rsp+100] ; /* r12d = lmask */
mov r13d, [rsp+104] ; /* r13d = dmask */
; /* r14d = len */
; /* r15d = dist */
cld
cmp r10, rdi
je L_one_time ; /* if only one decode left */
cmp r9, rsi
jne L_do_loop
L_one_time:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code_one_time
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
jmp L_get_length_code_one_time
ALIGN 4
L_while_test:
cmp r10, rdi
jbe L_break_loop
cmp r9, rsi
jbe L_break_loop
L_do_loop:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_length_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
mov r8, r12 ; /* r8 = lmask */
shr eax, 16 ; /* output this.val char */
stosb
L_get_length_code_one_time:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
L_dolen:
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
shr eax, 16 ; /* output this.val char */
stosb
jmp L_while_test
ALIGN 4
L_test_for_length_base:
mov r14d, eax ; /* len = this */
shr r14d, 16 ; /* len = this.val */
mov cl, al
test al, 16
jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
and cl, 15 ; /* op &= 15 */
jz L_decode_distance ; /* if (!op) */
L_add_bits_to_len:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r14d, eax ; /* len += hold & mask[op] */
L_decode_distance:
mov r8, r13 ; /* r8 = dmask */
cmp bl, 32
ja L_get_distance_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_distance_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
L_dodist:
mov r15d, eax ; /* dist = this */
shr r15d, 16 ; /* dist = this.val */
mov cl, ah
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
mov cl, al ; /* cl = this.op */
test al, 16 ; /* if ((op & 16) == 0) */
jz L_test_for_second_level_dist
and cl, 15 ; /* op &= 15 */
jz L_check_dist_one
L_add_bits_to_dist:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax ; /* (1 << op) - 1 */
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
L_check_window:
mov r8, rsi ; /* save in so from can use it's reg */
mov rax, rdi
sub rax, [rsp+40] ; /* nbytes = out - beg */
cmp eax, r15d
jb L_clip_window ; /* if (dist > nbytes) 4.2% */
mov ecx, r14d ; /* ecx = len */
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
sar ecx, 1
jnc L_copy_two ; /* if len % 2 == 0 */
rep movsw
mov al, [rsi]
mov [rdi], al
inc rdi
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
L_copy_two:
rep movsw
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
ALIGN 4
L_check_dist_one:
cmp r15d, 1 ; /* if dist 1, is a memset */
jne L_check_window
cmp [rsp+40], rdi ; /* if out == beg, outside window */
je L_check_window
mov ecx, r14d ; /* ecx = len */
mov al, [rdi-1]
mov ah, al
sar ecx, 1
jnc L_set_two
mov [rdi], al
inc rdi
L_set_two:
rep stosw
jmp L_while_test
ALIGN 4
L_test_for_second_level_length:
test al, 64
jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r14d ; /* eax += len */
mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
jmp L_dolen
ALIGN 4
L_test_for_second_level_dist:
test al, 64
jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r15d ; /* eax += dist */
mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
jmp L_dodist
ALIGN 4
L_clip_window:
mov ecx, eax ; /* ecx = nbytes */
mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
neg ecx ; /* nbytes = -nbytes */
cmp eax, r15d
jb L_invalid_distance_too_far ; /* if (dist > wsize) */
add ecx, r15d ; /* nbytes = dist - nbytes */
cmp dword ptr [rsp+96], 0
jne L_wrap_around_window ; /* if (write != 0) */
mov rsi, [rsp+56] ; /* from = window */
sub eax, ecx ; /* eax -= nbytes */
add rsi, rax ; /* from += wsize - nbytes */
mov eax, r14d ; /* eax = len */
cmp r14d, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* eax -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = &out[ -dist ] */
jmp L_do_copy
ALIGN 4
L_wrap_around_window:
mov eax, [rsp+96] ; /* eax = write */
cmp ecx, eax
jbe L_contiguous_in_window ; /* if (write >= nbytes) */
mov esi, [rsp+92] ; /* from = wsize */
add rsi, [rsp+56] ; /* from += window */
add rsi, rax ; /* from += write */
sub rsi, rcx ; /* from -= nbytes */
sub ecx, eax ; /* nbytes -= write */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, [rsp+56] ; /* from = window */
mov ecx, [rsp+96] ; /* nbytes = write */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy
ALIGN 4
L_contiguous_in_window:
mov rsi, [rsp+56] ; /* rsi = window */
add rsi, rax
sub rsi, rcx ; /* from += write - nbytes */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy ; /* if (nbytes >= len) */
ALIGN 4
L_do_copy:
mov ecx, eax ; /* ecx = len */
rep movsb
mov rsi, r8 ; /* move in back to %esi, toss from */
jmp L_while_test
L_test_for_end_of_block:
test al, 32
jz L_invalid_literal_length_code
mov dword ptr [rsp+116], 1
jmp L_break_loop_with_status
L_invalid_literal_length_code:
mov dword ptr [rsp+116], 2
jmp L_break_loop_with_status
L_invalid_distance_code:
mov dword ptr [rsp+116], 3
jmp L_break_loop_with_status
L_invalid_distance_too_far:
mov dword ptr [rsp+116], 4
jmp L_break_loop_with_status
L_break_loop:
mov dword ptr [rsp+116], 0
L_break_loop_with_status:
; /* put in, out, bits, and hold back into ar and pop esp */
mov [rsp+16], rsi ; /* in */
mov [rsp+32], rdi ; /* out */
mov [rsp+88], ebx ; /* bits */
mov [rsp+80], rdx ; /* hold */
mov rax, [rsp] ; /* restore rbp and rsp */
mov rbp, [rsp+8]
mov rsp, rax
mov rsi,[rsp-8]
mov rdi,[rsp-16]
mov r12,[rsp-24]
mov r13,[rsp-32]
mov r14,[rsp-40]
mov r15,[rsp-48]
mov rbx,[rsp-56]
ret 0
; :
; : "m" (ar)
; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
; );
inffas8664fnc ENDP
;_TEXT ENDS
END

View file

@ -0,0 +1,31 @@
Summary
-------
This directory contains ASM implementations of the functions
longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t),
for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits.
gvmat64.asm is written by Gilles Vollant (2005), by using Brian Raiter 686/32 bits
assembly optimized version from Jean-loup Gailly original longest_match function
inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing
original function from Mark Adler
Use instructions
----------------
Assemble the .asm files using MASM and put the object files into the zlib source
directory. You can also get object files here:
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
and inffasx64.obj and gvmat64.obj as object to link.
Build instructions
------------------
run bld_64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
You can get Windows 2003 server DDK with ml64 and cl for AMD64 from
http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)