e6ff4c0a76f9 — Laurens Holst 5 years ago
Writer: Improve Write_IY performance with self-modifying code.

Before, a write (iy) took 88 cycles; now it takes 61.
As a nice bonus, hl is no longer modified by Write_IY.

Improves performance by 1.5%.
2 files changed, 21 insertions(+), 20 deletions(-)

M src/Inflate.asm
M src/Writer.asm
M src/Inflate.asm +0 -2
@@ 256,9 256,7 @@ Inflate_WriteLiteral: REPT 256, ?value
 ; ix = reader
 ; iy = writer
 Inflate_WriteAndNext:
-	exx
 	call Writer_Write_IY  ; a = value to write; hl must survive this call for the jp below
-	exx
 	jp hl  ; jp Inflate_DecodeLiteralLength
 
 ; Literal/length alphabet symbol 256

          
M src/Writer.asm +21 -18
@@ 2,14 2,20 @@ 
 ; Memory buffer writer
 ;
 Writer: MACRO
+	; iy = this, a = value to write (entered via "jp iy" in Writer_Write_IY)
+	Write_IY:
+		ld (0),a  ; self-modifying: the 16-bit address operand is the current buffer position
+	bufferPosition: equ $ - 2  ; label for the address operand (last 2 bytes) of the ld above
+		inc (iy + Writer.bufferPosition)  ; advance the low byte of the position in place
+		ret nz  ; low byte did not wrap to 0: done (fast path)
+		jp Writer_Write_IY.Continue  ; low byte wrapped: high byte update and end check happen there
+
 	bufferStart:
 		dw 0
 	bufferSize:
 		dw 0
 	bufferEnd:
 		dw 0
-	bufferPosition:
-		dw 0
 	flusher:
 		dw System_ThrowException
 	count:

          
@@ 77,30 83,27 @@ NextBlock:
 
 ; a = value
 ; iy = this
-; Modifies: hl
+; Modifies: none (flags are left as set by the position increment in Write_IY)
 Writer_Write_IY: PROC
-	ld l,(iy + Writer.bufferPosition)
-	ld h,(iy + Writer.bufferPosition + 1)
-	ld (hl),a
-	inc l
-	ld (iy + Writer.bufferPosition),l
-	ret nz
-	inc h
-	ld l,a
-	ld a,(iy + Writer.bufferEnd + 1)
-	cp h
-	ld a,l
+	jp iy  ; run the writer's own Write_IY stub; assumes iy points at the start of a Writer instance
+Continue:  ; reached from Write_IY only when the low position byte wrapped to 0
+	push af  ; keep the written value; a is used as scratch below
+	ld a,(iy + Writer.bufferPosition + 1)
+	inc a  ; carry the low-byte wrap into the high byte
+	cp (iy + Writer.bufferEnd + 1)  ; only high bytes are compared; presumably bufferEnd is 256-byte aligned - TODO confirm
 	call z,NextBlock
-	ld (iy + Writer.bufferPosition + 1),h
+	ld (iy + Writer.bufferPosition + 1),a  ; store the new high byte (after NextBlock, a is the value re-read from the writer)
+	pop af
 	ret
 NextBlock:
-	ld (iy + Writer.bufferPosition + 1),h
-	push af
+	ld (iy + Writer.bufferPosition + 1),a  ; commit the high byte before calling Writer_FinishBlock
+	push hl  ; NOTE(review): only hl and ix are saved here; other registers depend on Writer_FinishBlock - confirm
 	push iy
 	ex (sp),ix  ; ix = this (copied from iy); caller's ix is now on the stack
 	call Writer_FinishBlock
 	pop ix  ; restore caller's ix
-	pop af
+	pop hl
+	ld a,(iy + Writer.bufferPosition + 1)  ; re-read the high byte so Continue's store keeps whatever Writer_FinishBlock left
 	ret
 	ENDP