kernel_optimize_test/arch/microblaze/lib/muldi3.S
Michal Simek 4e07dba7cb microblaze: Add libgcc function directly to kernel
Replaced libgcc functions with asm optimized implementation.

Signed-off-by: Michal Simek <monstr@monstr.eu>
2010-10-21 15:51:42 +10:00

122 lines
2.7 KiB
ArmAsm

#include <linux/linkage.h>
/*
* Multiply operation for 64 bit integers, for devices with hard multiply
* Input : Operand1[H] in Reg r5
* Operand1[L] in Reg r6
* Operand2[H] in Reg r7
* Operand2[L] in Reg r8
* Output: Result[H] in Reg r3
* Result[L] in Reg r4
*
* Explanation:
*
* Both input numbers are split into four 16-bit digits as follows
* op1 = A B C D
* op2 = E F G H
* result = D * H
* + (C * H + D * G) << 16
* + (B * H + C * G + D * F) << 32
* + (A * H + B * G + C * F + D * E) << 48
*
* Only the low 64 bits of the product are kept
*/
.text
/*
 * __muldi3: 64-bit by 64-bit multiply that returns the low 64 bits of the
 * product, assembled from 16 x 16 bit `mul` partial products.
 *
 * In : r5:r6 = operand 1 (high:low), r7:r8 = operand 2 (high:low)
 * Out: r3:r4 = result (high:low)
 * Temporaries written: r7-r12. r20-r27 hold the eight 16-bit digits A..H
 * and are saved on entry and restored on exit.
 *
 * Stack layout once the 40-byte frame has been allocated:
 *   r1 +  0 .. r1 + 28   saved r20-r27
 *   r1 + 32              scratch word, finally Result[H] (Pos0 at +32, Pos1 at +34)
 *   r1 + 36              scratch word, finally Result[L] (Pos2 at +36, Pos3 at +38)
 *   r1 + 44 .. r1 + 56   copies of r5-r8, stored above this frame
 *
 * The 16-bit digits are split out of, and merged back into, 32-bit words by
 * half-word loads/stores (lhui/shi) on these stack words.
 *
 * NOTE(review): the half-word offsets treat the half-word at the lower
 * address as the more significant one (big-endian layout) - confirm before
 * building this for a little-endian configuration.
 */
.globl __muldi3
.type __muldi3, @function
.ent __muldi3
__muldi3:
addi r1, r1, -40
/* Save the input operands on the caller's stack */
/* (offsets 44..56 lie above the 40 bytes allocated by this function) */
swi r5, r1, 44
swi r6, r1, 48
swi r7, r1, 52
swi r8, r1, 56
/* Store all the callee saved registers */
/* The first store uses the register-indexed form with r0, i.e. address r1 + 0 */
sw r20, r1, r0
swi r21, r1, 4
swi r22, r1, 8
swi r23, r1, 12
swi r24, r1, 16
swi r25, r1, 20
swi r26, r1, 24
swi r27, r1, 28
/* Load all the 16 bit values for A thru H */
/* Each digit is read back zero-extended from the operand copies saved above */
lhui r20, r1, 44 /* A */
lhui r21, r1, 46 /* B */
lhui r22, r1, 48 /* C */
lhui r23, r1, 50 /* D */
lhui r24, r1, 52 /* E */
lhui r25, r1, 54 /* F */
lhui r26, r1, 56 /* G */
lhui r27, r1, 58 /* H */
/* D * H ==> LSB of the result on stack ==> Store1 */
mul r9, r23, r27
swi r9, r1, 36 /* Pos2 and Pos3 */
/* Hi (Store1) + C * H + D * G ==> Store2 ==> Pos1 and Pos2 */
/* Store the carry generated in position 2 for Pos 3 */
/*
 * r12 accumulates the carry out of bit 31 of Store2 (one addc after each
 * add). It is not used in the next column; it is added into Pos0 at the
 * very end instead.
 */
lhui r11, r1, 36 /* Pos2 */
mul r9, r22, r27 /* C * H */
mul r10, r23, r26 /* D * G */
add r9, r9, r10
addc r12, r0, r0
add r9, r9, r11
addc r12, r12, r0 /* Store the Carry */
shi r9, r1, 36 /* Store Pos2 */
/* Spill Store2 to the scratch word at +32 so that its upper half can be reloaded into r11 */
swi r9, r1, 32
lhui r11, r1, 32
/* This half-word store is provisional: the word store of Store3 below overwrites +32..+35 */
shi r11, r1, 34 /* Store Pos1 */
/* Hi (Store2) + B * H + C * G + D * F ==> Store3 ==> Pos0 and Pos1 */
/* Plain adds: any carry out of bit 31 here is beyond bit 63 of the product and is dropped */
mul r9, r21, r27 /* B * H */
mul r10, r22, r26 /* C * G */
mul r7, r23, r25 /* D * F */
add r9, r9, r11
add r9, r9, r10
add r9, r9, r7
swi r9, r1, 32 /* Pos0 and Pos1 */
/* Hi (Store3) + A * H + B * G + C * F + D * E ==> Store4 ==> Pos0 */
/* Only the low 16 bits of this sum are kept; r7 and r8 are reused as temporaries */
lhui r11, r1, 32 /* Pos0 */
mul r9, r20, r27 /* A * H */
mul r10, r21, r26 /* B * G */
mul r7, r22, r25 /* C * F */
mul r8, r23, r24 /* D * E */
add r9, r9, r11
add r9, r9, r10
add r9, r9, r7
add r9, r9, r8
/* NOTE(review): the sext16 does not change what is stored, because the shi after it writes only the low half-word of r9 */
sext16 r9, r9 /* Sign extend the MSB */
shi r9, r1, 32
/* Move results to r3 and r4 */
/* Pos0 is reloaded, the deferred Store2 carry (r12) is added, and the half-word is written back before both result words are loaded */
lhui r3, r1, 32
add r3, r3, r12
shi r3, r1, 32
lwi r3, r1, 32 /* Hi Part */
lwi r4, r1, 36 /* Lo Part */
/* Restore Callee saved registers */
lw r20, r1, r0
lwi r21, r1, 4
lwi r22, r1, 8
lwi r23, r1, 12
lwi r24, r1, 16
lwi r25, r1, 20
lwi r26, r1, 24
lwi r27, r1, 28
/* Restore Frame and return */
/* rtsd returns to r15 + 8; the addi after it sits in the delay slot and releases the 40-byte frame */
rtsd r15, 8
addi r1, r1, 40
.size __muldi3, . - __muldi3
.end __muldi3