Faster 32-bit mul for IAR 78K0 3.34B
CASTalk.com Forum Index CASTalk.com
Discussion of DSP, FPGA, storage and embedded system.
 
 FAQFAQ   MemberlistMemberlist     RegisterRegister 
 ProfileProfile   Log in to check your private messagesLog in to check your private messages   Log inLog in 
 
Google
 
Web castalk.com
Faster 32-bit mul for IAR 78K0 3.34B

 
Post new topic   Reply to topic    CASTalk.com Forum Index -> Embedded System
Author Message
Jyrki Holopainen
Guest





Posted: Sat Dec 10, 2005 1:15 am    Post subject: Faster 32-bit mul for IAR 78K0 3.34B Reply with quote

Hello world,

Browsed through the IAR 78K0 C 3.34B libraries a couple of days ago and was
somewhat disappointed to notice that the 32-bit multiplication was a
'traditional' one, i.e. one that does not use the microcontroller's own
multiply instruction. I ported a routine that I originally wrote for
Intel 805X and the Dunfield DDS C compiler, and here is the result.

My version is 3-5 times faster than the library multiplication. The real
life speed improvement factor is probably around 3.7. The C library
routine can make ~2600 - 4300 multiplications/sec (8 MHz system), while
this one handles ~13000 muls/sec. The numbers are not the absolute
truth, but give an estimate what these routines can do.

The function prototype is: uint32 uint32mul(uint32 left, uint32 right);
(typedef unsigned long uint32;) I think that it should be possible to
override the library function with this one with a linker-file definition.

I have not tested this with the new IAR C/C++ -compiler, but it
should(?) work.

File mul.s26:
;-----------------------------------------------------------------------------
;
; COPYRIGHT (c) 2005 Jyrki Holopainen, all rights reserved
;
; The copyright to the computer program(s) herein is the property of
; Jyrki Holopainen. The code may be used, copied or edited freely as
; long as the original copyright message is retained.
;
;-----------------------------------------------------------------------------
;
; Multiplies two unsigned 32-bit operands and sets status.
; For NEC 78K0 microcontrollers with MULU-instruction.
; Compiler: IAR 3.34B C-compiler/assembler
;
; Revision history:
; Original: 07.12.2005 Jyrki Holopainen jyrki_mul32 AT halo.pp.fi
; Last modified: 08.12.2005 Jyrki Holopainen
;
;-----------------------------------------------------------------------------
;
; Input: AX, BC Operand 1
; [SP+0:1] Return address
; [SP+2:5] Operand 2
;
; Output: AX, BC Op1 mul Op2
; Z-flag ?
;
;
; Function prototype:
; uint32 uint32mul(uint32 left, uint32 right);
;
;-----------------------------------------------------------------------------
;
; Compare compiler 32 * 32 -> 32 multiplication with this one
; (IAR 3.34, NEC 78K0 8 Mhz):
;
; lib* this Ratio Operation
; -------------------------------------------------------
; ops/sec 4286 13100 3.1 0 * 0 (var * var)
; 3673 13057 3.6 999 * 999 (var * var)
; 3546 13036 3.7 99999 * 99999 (var * var)
; 3313 13015 3.9 9999999 * 9999999 (var * var)
; 2932 12909 4.4 999999999 * 999999999 (var * var)
;
; 3211 13057 4.1 0xffff * 0xffff (var * var)
; 2599 12960 5.0 0x7fffffff * 0x7fff...(var * var)
;
;------------------------------------------------------------------------------
MODULE LONG_MUL_L03_Fast

PUBLIC uint32mul

EXTERN ?L_F_DEALLOC_L06

RSEG RCODE

;-----------------------------------------------------------------------------
;
; Before starting the multiplication, the stack is set up the following way:
; 14-17 Operand 2 ('right')
; 12-13 Return address
; 8 -11 Saved working registers
; 4 - 7 Operand 1 ('left')
; 0 - 3 Result
; . . . Saved base pointer (popped into hl by the outer loop)
;
; At the beginning the [hl] pointer points to byte 0 of the result.
; The pointer is incremented (byte 0 -> 3) in the outer loop as the
; multiplication proceeds.
;
; On the outer loop the 'result' and 'right' bytes are processed
; at the same rate, so the hl pointer and pointer + fixed offset
; can be used to access a byte from both of the variables.
;
; On the inner loop the 'result' and 'left' bytes are processed
; at the same rate, so the hl pointer and pointer + variable offset
; can be used to access a byte from both of the variables.
;
; Outer multiply loop registers
; Work:
; hl: base pointer to Nth byte of result
; b: Offset between base and 'left'. Used also as a loop counter.
;
;
; Inner multiply loop registers
; In:
; hl: Base pointer to Nth byte of result
; b: Distance between the base pointer and the left operand
; e: Current byte from the 'right'
; c: Number of bytes to process, used as loop counter
;
; Work:
; ax: Calc accumulator
; d: Carry byte
;
;
; The high byte on the add sequence on the inner loop cannot overflow:
;
; Worst case:
; 1) 0xff * 0xff = 0xfe01 temp = *left * rightByte;
; 2) 0xfe01 + 0xff = 0xff00 temp += mulCarry;
; 3) 0xff00 + 0xff = 0xffff temp += *sum
;
;

;
; Distance of 'result' and 'right' on the stack.
;
#define BASE_OP1_OFFSET 14

uint32mul:
push de ; Save working registers
push hl

push bc ; Push operand 1 (left) to stack
push ax

movw ax, #0 ; Allocate stack for the result & clear it
push ax
push ax

movw ax, sp
decw ax ; Point one byte past result (fixed later)
push ax ; Store the base pointer

mov b,#4 ; Difference between left (operand 1) and
; base pointer in memory

?mulOutLoop:
mov a, b ; bytes
mov c, a

pop hl ; Get the base pointer
incw hl ; Step to next
push hl

mov a, [hl + BASE_OP1_OFFSET] ; right
mov e, a ; e: rightByte


mov d, #0 ; d: mulCarry = 0;

?mulInLoop:
mov a, e ; right
mov x, a
mov a, [hl + b] ; left

mulu x ; temp = *left * rightByte;

xch a,x

add a, d ; temp += mulCarry;
bnc ?skipMulCarryAdd1
inc x
?skipMulCarryAdd1:

add a, [hl] ; temp += *sum
bnc ?skipMulCarryAdd2
inc x
?skipMulCarryAdd2:

mov [hl], a ; *sum = (uint8)temp;

mov a, x
mov d, a ; mulCarry = temp >> 8;

incw hl ; sum++, left++

dbnz c, ?mulInLoop ; while (--bytes);


dbnz b, ?mulOutLoop ; Decrement diff or quit

;
; Get the result & clean up
;
pop hl ; Drop the base pointer

pop bc ; Get the result
pop de

pop hl ; Drop left
pop hl

br ?L_F_DEALLOC_L06 ; Deallocate params and set status

;-----------------------------------------------------------------------------

#if 0

/*
** The above multiplication written in C
*/

typedef unsigned char uint8;
typedef unsigned int uint16;
typedef unsigned long uint32;

void
mul1(uint8* sum,
uint8* left,
uint8 rightByte,
uint8 bytes)
{
uint8 mulCarry;

mulCarry = 0;

do
{
uint16 temp;

temp = *left * rightByte; /* 8 * 8 -> 16 */
temp += *sum; /* 16 + 8 -> 16 */
temp += mulCarry; /* 16 + 8 -> 16 */
*sum = (uint8)temp; /* low byte */
mulCarry = temp >> 8; /* high byte */

left++;
sum++;
}
while (--bytes);
}


uint32
longmulC(uint32 left, uint32 right)
{
uint32 result32;
uint8* result;
uint8* rig;

result32 = 0;
result = (uint8*)&result32;
rig = (uint8*)&right;

mul1(&result[0], (uint8*)&left, rig[0], 4);
mul1(&result[1], (uint8*)&left, rig[1], 3);
mul1(&result[2], (uint8*)&left, rig[2], 2);
mul1(&result[3], (uint8*)&left, rig[3], 1);

return result32;
}


#endif

;-----------------------------------------------------------------------------

END
Back to top
 
Post new topic   Reply to topic    CASTalk.com Forum Index -> Embedded System All times are GMT
Page 1 of 1

 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum




VoIP Electronics Powered by phpBB