a6c2d692f576 — Laurens Holst 5 years ago
Branch: Inline the bit reading and keep the state in registers.

The reading cost of seven out of eight bits goes down from 79 to 26 cycles.

With this change, my test extracting a DSK file goes down from 191 seconds to
149 seconds on the Z80, and from 37 seconds to 29 seconds on the R800. In other
words, a 22% speed improvement.
M src/DynamicAlphabets.asm +40 -31
@@ 218,56 218,65 @@ DynamicAlphabets_ReadLiteralLengthDistan
 	add hl,de
 	ld a,(ix + DynamicAlphabets.hlit)
 	add a,(ix + DynamicAlphabets.hdist)
-	ld b,a
-	ld c,2  ; +1 for nested 8-bit loop
 	push ix
 	call DynamicAlphabets_GetHeaderCodeAlphabet
 	call Alphabet_GetRoot
+	ld e,a
+	ld d,2  ; +1 for nested 8-bit loop
 	push ix
 	ex (sp),iy
 	pop ix
+	call Reader_PrepareReadBitInline
 	call DynamicAlphabets_DecodeLiteralLengthDistanceCodeLengths
+	call Reader_FinishReadBitInline
 	pop ix
 	ret
 
-; bc = loop counter for nested 8-bit loop
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root
 DynamicAlphabets_DecodeLiteralLengthDistanceCodeLengths:
 	jp iy
 
-; a = value
-; bc = loop counter for nested 8-bit loop
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root
 DynamicAlphabets_WriteAndNext:
 	inc hl
-	djnz DynamicAlphabets_DecodeLiteralLengthDistanceCodeLengths
-	dec c
+	dec e
+	jp nz,DynamicAlphabets_DecodeLiteralLengthDistanceCodeLengths
+	dec d
 	jr nz,DynamicAlphabets_DecodeLiteralLengthDistanceCodeLengths
 	ret
 
 ; a = fill value
-; e = repeat count
-; bc = loop counter for nested 8-bit loop
+; a' = repeat count
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root
 DynamicAlphabets_FillAndNext_Loop:
-	dec e
+	ex af,af'
+	dec a
 	jr z,DynamicAlphabets_DecodeLiteralLengthDistanceCodeLengths
+	ex af,af'
 DynamicAlphabets_FillAndNext:
 	ld (hl),a
 	inc hl
-	djnz DynamicAlphabets_FillAndNext_Loop
-	dec c
+	dec e
+	jp nz,DynamicAlphabets_FillAndNext_Loop
+	dec d
 	jr nz,DynamicAlphabets_FillAndNext_Loop
 	ret
 
 ; Header code alphabet symbols 0-15
-; bc = loop counter for nested 8-bit loop
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root

          
@@ 277,49 286,49 @@ DynamicAlphabets_WriteLength: REPT 16, ?
 	ENDM
 
 ; Header code alphabet symbols 16
-; bc = loop counter for nested 8-bit loop
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root
 DynamicAlphabets_Copy:
-	push bc
-	ld b,2
-	call Reader_ReadBits
-	pop bc
+	ex af,af'
+	ld a,2
+	call Reader_ReadBitsInline
 	add a,3
-	ld e,a
+	ex af,af'
 	dec hl
 	ld a,(hl)
 	inc hl
 	jp DynamicAlphabets_FillAndNext
 
 ; Header code alphabet symbols 17
-; bc = loop counter for nested 8-bit loop
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root
 DynamicAlphabets_FillZero_3:
-	push bc
-	ld b,3
-	call Reader_ReadBits
-	pop bc
+	ex af,af'
+	ld a,3
+	call Reader_ReadBitsInline
 	add a,3
-	ld e,a
+	ex af,af'
 	xor a
 	jp DynamicAlphabets_FillAndNext
 
 ; Header code alphabet symbols 18
-; bc = loop counter for nested 8-bit loop
+; bc = inline bit reader state
+; de = loop counter for nested 8-bit loop
 ; hl = literal/length/distance code lengths position
 ; ix = reader
 ; iy = header code alphabet root
 DynamicAlphabets_FillZero_11:
-	push bc
-	ld b,7
-	call Reader_ReadBits
-	pop bc
+	ex af,af'
+	ld a,7
+	call Reader_ReadBitsInline
 	add a,11
-	ld e,a
+	ex af,af'
 	xor a
 	jp DynamicAlphabets_FillAndNext
 

          
M src/Inflate.asm +71 -28
@@ 224,10 224,13 @@ Inflate_InflateCompressed:
 	ld b,(ix + Inflate.reader + 1)
 	ld ixl,c
 	ld ixh,b
+	call Reader_PrepareReadBitInline
 	call Inflate_DecodeLiteralLength
+	call Reader_FinishReadBitInline
 	pop ix
 	ret
 
+; bc = inline bit reader state
 ; hl = literal/length alphabet root
 ; de = distance alphabet root
 ; ix = reader

          
@@ 236,6 239,7 @@ Inflate_DecodeLiteralLength:
 	jp hl
 
 ; Literal/length alphabet symbols 0-255
+; bc = inline bit reader state
 ; hl = literal/length alphabet root
 ; de = distance alphabet root
 ; ix = reader

          
@@ 246,6 250,7 @@ Inflate_WriteLiteral: REPT 256, ?value
 	ENDM
 
 ; a = value
+; bc = inline bit reader state
 ; hl = literal/length alphabet root
 ; de = distance alphabet root
 ; ix = reader

          
@@ 257,6 262,7 @@ Inflate_WriteAndNext:
 	jp Inflate_DecodeLiteralLength
 
 ; Literal/length alphabet symbol 256
+; bc = inline bit reader state
 ; hl = literal/length alphabet root
 ; de = distance alphabet root
 ; ix = reader

          
@@ 265,6 271,7 @@ Inflate_EndBlock:
 	ret
 
 ; Literal/length alphabet symbols 257-285
+; bc = inline bit reader state
 ; hl = literal/length alphabet root
 ; de = distance alphabet root
 ; ix = reader

          
@@ 408,16 415,16 @@ Inflate_CopyLength.28:
 
 ; a = bits
 ; bc = length offset
+; bc' = inline bit reader state
 ; hl' = literal/length alphabet root
 ; de' = distance alphabet root
 ; ix = reader
 ; iy = writer
 ; bc <- value
 Inflate_ReadExtraLengthBits:
-	push bc
-	ld b,a
-	call Reader_ReadBits
-	pop bc
+	exx
+	call Reader_ReadBitsInline
+	exx
 	add a,c
 	ld c,a
 	jr nc,Inflate_DecodeDistance

          
@@ 425,6 432,7 @@ Inflate_ReadExtraLengthBits:
 	jp Inflate_DecodeDistance
 
 ; bc = length
+; bc' = inline bit reader state
 ; hl' = literal/length alphabet root
 ; de' = distance alphabet root
 ; ix = reader

          
@@ 435,6 443,7 @@ Inflate_DecodeDistance:
 	jp hl
 
 ; Distance alphabet symbols 0-29
+; bc = inline bit reader state
 ; de = literal/length alphabet root
 ; hl = distance alphabet root
 ; ix = reader

          
@@ 537,72 546,106 @@ Inflate_CopyDistance.19:
 	jp Inflate_ReadExtraDistanceBits
 Inflate_CopyDistance.20:
 	exx
-	ld a,9
+	ld a,9 - 8
 	ld de,1025
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.21:
 	exx
-	ld a,9
+	ld a,9 - 8
 	ld de,1537
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.22:
 	exx
-	ld a,10
+	ld a,10 - 8
 	ld de,2049
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.23:
 	exx
-	ld a,10
+	ld a,10 - 8
 	ld de,3073
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.24:
 	exx
-	ld a,11
+	ld a,11 - 8
 	ld de,4097
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.25:
 	exx
-	ld a,11
+	ld a,11 - 8
 	ld de,6145
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.26:
 	exx
-	ld a,12
+	ld a,12 - 8
 	ld de,8193
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.27:
 	exx
-	ld a,12
+	ld a,12 - 8
 	ld de,12289
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.28:
 	exx
-	ld a,13
+	ld a,13 - 8
 	ld de,16385
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 Inflate_CopyDistance.29:
 	exx
-	ld a,13
+	ld a,13 - 8
 	ld de,24577
-	jp Inflate_ReadExtraDistanceBits
+	jp Inflate_ReadExtraDistanceBitsPlus8
 
 ; a = bits
 ; bc = length (preserved)
 ; de = distance offset
+; bc' = inline bit reader state
 ; de' = literal/length alphabet root
 ; hl' = distance alphabet root
 ; ix = reader
 ; iy = writer
 ; de <- value
 Inflate_ReadExtraDistanceBits:
-	push bc
-	ld b,a
-	call Reader_ReadAndAddBits
-	pop bc
+	exx
+	call Reader_ReadBitsInline
+	exx
+	add a,e
+	ld e,a
+	jr nc,Inflate_CopyAndNext
+	inc d
 	jp Inflate_CopyAndNext
 
+; a = bits - 8
+; bc = length (preserved)
+; de = distance offset
+; bc' = inline bit reader state
+; de' = literal/length alphabet root
+; hl' = distance alphabet root
+; ix = reader
+; iy = writer
+; de <- value
+Inflate_ReadExtraDistanceBitsPlus8: PROC
+	ex af,af'
+	exx
+	ld a,8
+	call Reader_ReadBitsInline
+	exx
+	add a,e
+	ld e,a
+	jr nc,RemainingBits
+	inc d
+RemainingBits:
+	ex af,af'
+	exx
+	call Reader_ReadBitsInline
+	exx
+	add a,d
+	ld d,a
+	jp Inflate_CopyAndNext
+	ENDP
+
 ; bc = length
 ; de = distance
+; bc' = inline bit reader state
 ; de' = literal/length alphabet root
 ; hl' = distance alphabet root
 ; ix = reader

          
M src/Reader.asm +60 -19
@@ 113,6 113,66 @@ Reader_ReadBit:
 	pop bc
 	ret
 
+; b <- current byte (shifts)
+; c <- bits left + 1
+Reader_PrepareReadBitInline:
+	ld b,(ix + Reader.byte)
+	ld c,(ix + Reader.bitCounter)
+	ret
+
+; b <- current byte (shifts)
+; c <- bits left + 1
+Reader_FinishReadBitInline:
+	ld (ix + Reader.byte),b
+	ld (ix + Reader.bitCounter),c
+	ret
+
+; b = current byte (shifts)
+; c = bits left + 1
+; b <- current byte (shifts)
+; c <- bits left + 1
+; f <- c: bit
+Reader_ReadBitInline: MACRO
+	dec c
+	call z,Reader_ReadBitInline_NextByte
+	rr b
+	ENDM
+
+; b <- current byte (shifts)
+; c <- bits left
+; f <- c: bit
+Reader_ReadBitInline_NextByte:
+	push hl
+	ld c,a
+	call Reader_Read
+	ld b,a
+	ld a,c
+	ld c,8
+	pop hl
+	ret
+
+; a = nr of bits to read (1-8)
+; bc = inline bit reader state
+; a <- value
+; bc <- inline bit reader state
+; Modifies: af
+Reader_ReadBitsInline: PROC
+	push de
+	ld d,a
+	ld e,1
+	xor a
+Loop:
+	Reader_ReadBitInline
+	jr nc,Zero
+	or e
+Zero:
+	rlc e
+	dec d
+	jp nz,Loop
+	pop de
+	ret
+	ENDP
+
 ; b = nr of bits to read (1-8)
 ; ix = this
 ; a <- value

          
@@ 141,25 201,6 @@ Reader_ReadBits_IY:
 	pop ix
 	ret
 
-; b = nr of bits to read (1-16)
-; de = base value
-; ix = this
-; de <- value
-; Modifies: af, bc, de, hl
-Reader_ReadAndAddBits: PROC
-	ld hl,1
-Loop:
-	call Reader_ReadBit
-	jr nc,Zero
-	ex de,hl
-	add hl,de
-	ex de,hl
-Zero:
-	add hl,hl
-	djnz Loop
-	ret
-	ENDP
-
 ; ix = this
 Reader_Align:
 	ld (ix + Reader.bitCounter),1

          
M src/alphabet/AlphabetTest.asm +5 -2
@@ 59,7 59,8 @@ AlphabetTest_TestTreeBuilding: PROC
 	call Reader_Construct
 	pop iy
 
-	ld b,7
+	call Reader_PrepareReadBitInline
+	ld d,7
 	ld hl,AlphabetTest_expectedSymbols
 Loop:
 	push hl

          
@@ 68,7 69,9 @@ Loop:
 	cp (hl)
 	call nz,System_ThrowException
 	inc hl
-	djnz Loop
+	dec d
+	jr nz,Loop
+	call Reader_FinishReadBitInline
 
 	push iy
 	call Reader_Destruct

          
M src/alphabet/Branch.asm +2 -2
@@ 1,12 1,12 @@ 
 ;
 ; Huffman tree node
 ;
-Branch_ID: equ 0CDH  ; First byte of a branch object, for ID purposes
+Branch_ID: equ 0DH  ; First byte of a branch object, for ID purposes
 
 Branch: MACRO
 	; ix = reader
 	Process:
-		call Reader_ReadBit
+		Reader_ReadBitInline
 		jp nc,0
 	zero: equ $ - 2
 		jp 0