Fast multiplication on the Game Boy!

You say: what are you talking about, Nick? The Game Boy (or even the Game Boy Color) wasn't designed to do anything fast?

To which I reply: when Dave Leitch & I were programming our "3D Pocket Pool" game ("3D Pool Allstars" in the US, for obvious reasons), we needed fast maths routines - so I had to come up with some. It didn't matter whether it was possible or not. :-) So here's what I devised...

Firstly, I don't think I'm breaking my Nintendo NDA by saying that the GB/GBC is powered by a low-power Z80 clone with the complex instructions taken out, but with other useful ops put in their place (any emulator site would tell you this). However, my solution didn't end up using any of the new instructions, so I don't feel too bad about posting my code openly. That's only relevant so I don't get emails saying "Hey Nick, why didn't you use LD HL,DE?"... it's because the GB doesn't have it. :-)

Anyway... overall, my approach to the problem was to figure out what the bare minimum set of instructions for any given multiply would be, and then try to achieve that. Also: because speed was so critical our title, my design aim was to fit it into the lowest (permanently banked in) ROM bank, so 16K was the upper size constraint... any solution I could construct in that space was fair game. :-)

In the end, though, I have to say that I created a monster of a routine. If you're expecting a petite 50-line masterpiece, look away now.

The code multiplies an 8-bit unsigned char (A) with a 16-bit unsigned short (DE), and here's how it begins:

Stage 1: build (from A) an offset (in HL) into a 256-element table, where each element holds up to 8 bytes of code. To avoid unnecessary shifts, the H and L values are simply masked from the A-value - so this isn't a simple "multiply by 8", it's also changing the order of the 256 elements, in a way which the table itself will have to be inversely ordered around. But it saves several run-time cycles for no actual cost, so I like this trick a lot. :-)

ld h,a
and $f8
ld l,a
xor h

Stage 2: add the start of the table into H, and set B = A = 0 (which many of the routines will rely on):

add a,>multiplyjumptable
ld h,a
xor a
ld b,a

Stage 3: jump directly to the appropriate code fragment - get on with it!

jp (hl)

All each of the 256 code fragments then needs to do is to multiply DE by a single fixed value: though I optimised most of these routines quite closely, there were (IIRC) a handful that could have been further tweaked - but I'd had quite enough of this accursed routine by that stage. :-)

Finally: it was absolutely essential that each element got padded up to 8 bytes, hence all the NOPx macro calls - for those routines that were longer than 8 bytes, the code jumps into various lumps of shared code, which I constructed by hand.

If you're writing a GB/GBC game and would like to use this gargantuan miracle of perseverance, please feel free to email me for a copy of the (verified) source code. Copyright remains mine, but I'm sure we can sort something out... :-)

%macro NOP1
{
	nop
}
%macro NOP2
{
	nop
	nop
}	
%macro NOP3
{
	nop
	nop
	nop
}
%macro NOP4
{
	nop
	nop
	nop
	nop
}
%macro NOP5
{
	nop
	nop
	nop
	nop
	nop
}
%macro NOP6
{
	nop
	nop
	nop
	nop
	nop
	nop
}

 

%macro JUMP0
{
	ld h,a
	;ld l,a 
;L is already zero!
;[this is extremely tricksy!]
	ret
	NOP6
}
%macro JUMP0_1
{
	ld h,d
	ld l,e ;HL = DE
	ret
	NOP5
}
%macro JUMP0_2
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	ret
	NOP3 
}
%macro JUMP0_3
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,de
	adc b
	ret 
	nop
}
%macro JUMP0_4
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,hl
	rla
	ret 
	nop
}
%macro JUMP0_5
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla ;AHL <<= 1
	add hl,hl 
	jp .exit05
}
%macro JUMP0_6
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla ;AHL <<= 1
	add hl,de
	jp .exit06
}
%macro JUMP0_7
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla	  ;AHL <<= 1
	add hl,de
	jp .exit07
}
%macro JUMP0_8
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,hl
	jp .exit08

}
%macro JUMP0_9
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,hl
	jp .exit09
}
%macro JUMP0_10
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,hl
	jp .exit010
}
%macro JUMP0_11
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,hl
	jp .exit011
}
%macro JUMP0_12
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,de
	jp .exit012
}

%macro JUMP0_13
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,de
	jp .exit013
}
%macro JUMP0_14
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,de
	jp .exit014
}
%macro JUMP0_15
{
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla
	add hl,de
	jp .exit015
}
 
%macro JUMP0
{
	JUMP0_@1
}
%macro JUMP1
{
	ld h,d
	ld l,e ;HL = DE
	jp multby@1
	NOP3
}
%macro JUMP2
{
	ld h,d
	ld l,e ;HL = DE 
	add hl,hl
	rla 	;AHL <<= 1
	jp multby@1
	nop
}
%macro JUMP3
{
;;;;if n * 0x3F...
  %if (@1)=15
  {
	ld a,d
	ld h,e
	rra
	rr h
	jp .exit63
  }
  %if (@1)<15
  {
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla ;AHL <<= 1
	add hl,de
	jp multby@1_3
  }
}
%macro JUMP4
{
;;;;if n * 0x40...
 %if (@1)=0
 {
	ld a,d
	ld h,e
	rra
	rr h
	jp .exit64
 }
;;;;if n * 0x41...
 %if (@1)=1
 {
	ld a,d
	ld h,e
	rra
	rr h 
	jp .exit65
 }
 %if (@1)>1
 {
	ld h,d
	ld l,e ;HL = DE
	add hl,hl
	rla ;AHL <<= 1
	add hl,hl
	jp multby@1_4
 }
}
%macro JUMP5
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,hl 
	jp multby@1_5
}
%macro JUMP6
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,de
	jp multby@1_6 
}
%macro JUMP7
{
;;;;if n * 0x7F...
  %if (@1)=15
  {
	ld a,d
	ld h,e 
	rra
	rr h
	jp .exit127
  }
  %if (@1)<15
  {
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,de
	jp multby@1_7
  }
}
%macro JUMP8
{
;;;;;if n * 0x80...
 %if (@1)=0
 {
	ld a,d
	ld h,e
	rra
	rr h
	jp .exit128
 }
;;;;if n * 0x81...
 %if (@1)=1
 {
	ld a,d
	ld h,e
	rra
	rr h
	jp .exit129
 }
;;;;if n * 0x82... 
 %if (@1)=2
 {
	ld a,d
	ld h,e
	rra
	rr h
	jp .exit130
 }
 %if (@1)>2
 {
	ld h,d
	ld l,e
	add hl,hl 
	rla
	add hl,hl
	jp multby@1_8
 }
}
%macro JUMP9
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,hl
	jp multby@1_9
}
%macro JUMP10
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,hl 
	jp multby@1_10
}
%macro JUMP11
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,hl
	jp multby@1_11 
}
%macro JUMP12
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,de
	jp multby@1_12
}
%macro JUMP13
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,de
	jp multby@1_13
}
%macro JUMP14 
{
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,de
	jp multby@1_14
}
%macro JUMP15
{
;;;;if n * 255...
 %if (@1)=15
 {
	;xor a ;A = 0 already!
	sub e
	ld l,a 
	ld a,e
	sbc d
	ld h,a
	ld a,d
	sbc b
	ret ;AHL = DEB - BDE
 }
;;;;if n * 254...
 %if (@1)=14
 {
	ld c,d
	ld h,e
	sla e
	jp .exit254
	nop
 }
 %if (@1)<14
 {
	ld h,d
	ld l,e
	add hl,hl
	rla
	add hl,de
	jp multby@1_15
 }
}

%macro JUMPSET { JUMP0 @1 JUMP0 @2 JUMP1 @1 JUMP1 @2 JUMP2 @1 JUMP2 @2 JUMP3 @1 JUMP3 @2 JUMP4 @1 JUMP4 @2 JUMP5 @1 JUMP5 @2 JUMP6 @1 JUMP6 @2 JUMP7 @1 JUMP7 @2 JUMP8 @1 JUMP8 @2 JUMP9 @1 JUMP9 @2 JUMP10 @1 JUMP10 @2 JUMP11 @1 JUMP11 @2 JUMP12 @1 JUMP12 @2 JUMP13 @1 JUMP13 @2 JUMP14 @1 JUMP14 @2 JUMP15 @1 JUMP15 @2 } A256 ;CODE GENERATION STARTS, ALIGN FOLLOWING TO PAGE BOUNDARY multiplyjumptable public JUMPSET 0,8 JUMPSET 1,9 JUMPSET 2,10 JUMPSET 3,11 JUMPSET 4,12 JUMPSET 5,13 JUMPSET 6,14 JUMPSET 7,15 .exit012 adc b add hl,hl .exit08 rla add hl,hl rla ret .exit013 adc b add hl,hl .exit09 rla add hl,hl rla add hl,de adc b ret .exit014 adc b add hl,hl .exit010 rla add hl,de .exit06 adc b add hl,hl rla ret .exit015 adc b add hl,hl .exit011 rla add hl,de .exit07 adc b add hl,hl .exit05 rla add hl,de adc b ret .exit63 ld c,a ld a,b rra ;CHA = DE * 128 rr c rr h rra ;CHA = DE * 64 sub e ld l,a ld a,h sbc d ld h,a ld a,c sbc b ;AHL = CHA - BDE ret .exit64 ld l,b rr l rra rr h rr l ret .exit65 ld l,b rr l rra rr h rr l add hl,de adc b ret .exit127 ld c,a ld a,b rra ;CHA = DE * 128 sub e ld l,a ld a,h sbc d ld h,a ld a,c sbc b ;AHL = CHA - BDE ret .exit128 ld l,b rr l ret .exit129 ld l,b rr l add hl,de adc b ret .exit130 ld l,b rr l add hl,de adc b add hl,de adc b ret .exit254 rl d rl b ;CHA = DE * 256, BDE = DE * 2 ;xor a ;A = 0 already! sub e ld l,a ld a,h sbc d ld h,a ld a,c sbc b ret ;AHL = DEB - (DE * 2) ;---------------------------------------------------------------------------- ; FUNCTION: MulU16x8 ; DOES... ; Multiplies a 16 bit value by an 8 bit one (both unsigned). ; IN: DE = the 16 bit value ; A = the 8 bit one ; OUT: AHL = the product ; Everything else trashed ;---------------------------------------------------------------------------- MulU16x8 proc public ld h,a and $f8 ld l,a xor h add a,>multiplyjumptable ld h,a xor a ld b,a jp (hl) MulU16x8 endproc ;--------------------------- ;--------------------------- %macro INTRO { @1_14 adc b add hl,hl @1_10 rla add hl,de @1_6 adc b add hl,hl @1_4 rla jr @1 @1_12 adc b add hl,hl @1_8 rla add hl,hl rla jr @1 @1_13 adc b add hl,hl @1_9 rla jr @1_x @1_15 adc b add hl,hl @1_11 rla add hl,de @1_7 adc b @1_x add hl,hl @1_5 rla add hl,de @1_3 adc b @1 @2 ;call the shiftN macro @1_0 }

%macro SHIFT1
{
	add hl,hl
	rla
}
%macro SHIFT2 
{
	SHIFT1
	SHIFT1
}
%macro SHIFT3
{
	SHIFT2
	SHIFT1
}
%macro SHIFT4
{
	SHIFT2
	SHIFT2 
}

%macro MYRET ;edit this macro to change the return register convention { ;RETURN VALUES ARE NOW IN AHL SO... ;ld c,a ;we're expecting value to be returned in CHL, not AHL ret }

	INTRO multby1,SHIFT4
	add hl,de
	adc b
	MYRET
	INTRO multby2,SHIFT3
	add hl,de
	adc b
	SHIFT1
	MYRET
	INTRO multby3,SHIFT3
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	MYRET
	INTRO multby4,SHIFT2
	add hl,de
	adc b
	SHIFT2
	MYRET
	INTRO multby5,SHIFT2
	add hl,de
	adc b
	SHIFT2
	add hl,de
	adc b
	MYRET
	INTRO multby6,SHIFT2
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT1
	MYRET
	INTRO multby7,SHIFT2
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT1
	add hl,de 
	adc b
	MYRET
	INTRO multby8,SHIFT1
	add hl,de
	adc b
	SHIFT3
	MYRET
	INTRO multby9,SHIFT1
	add hl,de
	adc b
	SHIFT3
	add hl,de
	adc b
	MYRET
	INTRO multby10,SHIFT1 
	add hl,de
	adc b
	SHIFT2
	add hl,de
	adc b
	SHIFT1
	MYRET
	INTRO multby11,SHIFT1
	add hl,de
	adc b
	SHIFT2
	add hl,de
	adc b
	SHIFT1
	add hl,de 
	adc b
	MYRET
	INTRO multby12,SHIFT1
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT2
	MYRET
	INTRO multby13,SHIFT1 
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT2
	add hl,de
	adc b
	MYRET
	INTRO multby14,SHIFT1
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT1
	add hl,de 
	adc b
	SHIFT1
	MYRET
	INTRO multby15,SHIFT1
	add hl,de 
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	SHIFT1
	add hl,de
	adc b
	MYRET 
?