// pico-sdk/src/rp2_common/pico_float/float_v1_rom_shim.S

/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"

#if PICO_FLOAT_SUPPORT_ROM_V1
.syntax unified
.cpu cortex-m0plus
.thumb

// Default: the shims are not placed in RAM unless the build defines
// PICO_FLOAT_IN_RAM to a non-zero value.
#ifndef PICO_FLOAT_IN_RAM
#define PICO_FLOAT_IN_RAM 0
#endif

// float_section name
//   Switches to the section that holds the routines that follow.
//   name: section name fragment, handed to RAM_SECTION_NAME() when
//         PICO_FLOAT_IN_RAM is non-zero and to SECTION_NAME() otherwise.
//   The section is flagged "ax" (allocatable, executable).
//   NOTE(review): both section-name macros are defined outside this file,
//   presumably by pico/asm_helper.S - confirm.
.macro float_section name
// todo separate flag for shims?
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

float_section float_table_shim_on_use_helper
// float_table_shim_on_use_helper
//
// Patches one entry of sf_table on first use and then transfers control to
// the address that was just installed.
//
// On entry: ip (r12) holds an address inside the calling stub. A build
//           without NDEBUG stops on a breakpoint if ip is 0.
// Steps:
//   1. Scan forward from ip, one halfword at a time, until a halfword whose
//      top byte is 0xdf is found. Its low byte is the byte offset into
//      sf_table.
//   2. Read the 32-bit value stored directly after that halfword (it may be
//      only halfword aligned, hence the two-halfword path).
//   3. Store that value to sf_table + offset and over the saved lr on the
//      stack, so the final pop loads it into pc.
// r0-r2 are restored to their entry values before control is transferred.
// NOTE(review): 0xdfNN is the Thumb encoding of "svc #NN"; presumably the
// shimmable table_tail_call macro plants it as the marker - confirm there.
regular_func float_table_shim_on_use_helper
    push {r0-r2, lr}
    mov r0, ip
#ifndef NDEBUG
    // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
    cmp r0, #0
    bne 1f
    bkpt #0
#endif
1:
    // scan loop: r1 = halfword at r0, r2 = its top byte, r0 advances by 2
    ldrh r1, [r0]
    lsrs r2, r1, #8
    adds r0, #2
    cmp r2, #0xdf
    bne 1b
    uxtb r1, r1 // r1 holds table offset
    lsrs r2, r0, #2 // carry = bit 1 of r0, i.e. set when r0 is not word aligned
    bcc 1f
    // unaligned
    ldrh r2, [r0, #0] // low halfword
    ldrh r0, [r0, #2] // high halfword
    lsls r0, #16
    orrs r0, r2
    b 2f
1:
    ldr r0, [r0]
2:
    ldr r2, =sf_table
    str r0, [r2, r1] // install the value in the table entry
    str r0, [sp, #12] // replace the saved lr (fourth pushed word) with it
    pop {r0-r2, pc} // restore r0-r2 and continue at the installed address

float_section 642float_shims

// 64-bit integer / fixed point to float conversions.
//
// Four entry points share one body:
//   uint642float_shim : r0:r1 = unsigned 64-bit value (sets r2 = 0)
//   ufix642float_shim : as above, r2 = number of bits after the point
//   int642float_shim  : r0:r1 = signed 64-bit value (sets r2 = 0)
//   fix642float_shim  : as above, r2 = number of bits after the point
// r0 is the low word and r1 the high word. r4 and r5 are saved and restored.
// The value is normalised here and the final packing is done by the ROM
// routine at 0x29ef (packx).
// NOTE(review): packx is not visible in this file; from the register set-up
// below it presumably takes the mantissa in r0, an exponent in r2 and sticky
// bits in r5, and returns the float in r0 - confirm against the ROM source.

@ convert uint64 to float, rounding
regular_func uint642float_shim
 movs r2,#0       @ fall through

@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float_shim
 push {r4,r5,r14}
 cmp r1,#0
 bpl 3f          @ positive? we can use signed code
 // top bit of r1 is set: halve the value so it fits the signed-style path
 lsls r5,r1,#31  @ contribution to sticky bits
 orrs r5,r0
 lsrs r0,r1,#1
 subs r2,#1
 b 4f

@ convert int64 to float, rounding
regular_func int642float_shim
 movs r2,#0       @ fall through

@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float_shim
 push {r4,r5,r14}
3:
 movs r5,r0
 orrs r5,r1
 beq ret_pop45   @ zero? return +0
 asrs r5,r1,#31  @ sign bits
2:
 // normalise: shift r1:r0 left 7 bits per pass while the top bits of r1 are all sign copies
 asrs r4,r1,#24  @ try shifting 7 bits at a time
 cmp r4,r5
 bne 1f          @ next shift will overflow?
 lsls r1,#7
 lsrs r4,r0,#25
 orrs r1,r4
 lsls r0,#7
 adds r2,#7
 b 2b
1:
 movs r5,r0 // low word becomes the sticky bits
 movs r0,r1 // high word becomes the mantissa
4:
 // r2 = 61 - r2
 rsbs r2,#0
 adds r2,#32+29

 // bl packx
 ldr r1, =0x29ef // packx
 blx r1
ret_pop45:
 pop {r4,r5,r15}

float_section fatan2_shim
// fatan2_shim
//
// On entry: r0 and r1 hold the two float arguments. From the comments and
// the sign tests below, the value that arrives in r1 is treated as x and the
// value that arrives in r0 as y.
// Both arguments are unpacked with the ROM routine at 0x29c1 (unpackx),
// aligned to a common exponent in Q28, and fed to the ROM CORDIC routine at
// 0x2b97 (cordic_vec). The angle is then folded into range and the function
// tail-jumps to the ROM address 0x2b19 with r2 = 0.
// NOTE(review): r4, r5 and lr are pushed here and never popped in this file;
// presumably the ROM code at 0x2b19 packs the result and pops them - confirm.
// NOTE(review): unpackx presumably returns the mantissa in r0 and the
// exponent in r2 while leaving r1 and r3 alone - confirm against the ROM source.
regular_func fatan2_shim
 push {r4,r5,r14}

 ldr r4, =0x29c1 // unpackx
 mov ip, r4
@ unpack arguments and shift one down to have common exponent
 blx ip
 // swap r0 with r1 and r2 with r3, so the second call unpacks the other argument
 mov r4,r0
 mov r0,r1
 mov r1,r4
 mov r4,r2
 mov r2,r3
 mov r3,r4
 blx ip
 // from here: r0 = x mantissa, r1 = y mantissa, r2 = x exponent, r3 = y exponent
 lsls r0,r0,#5  @ Q28
 lsls r1,r1,#5  @ Q28
 adds r4,r2,r3  @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
 asrs r4,#9
 adds r4,#1
 bmi 2f         @ force y to 0 proper, so result will be zero
 subs r4,r2,r3  @ calculate shift
 bge 1f         @ ex>=ey?
 rsbs r4,#0     @ make shift positive
 asrs r0,r4
 cmp r4,#28
 blo 3f
 asrs r0,#31 // shift of 28 or more: reduce x to its sign bits
 b 3f
1:
 asrs r1,r4
 cmp r4,#28
 blo 3f
2:
@ here |x|>>|y| or both x and y are ±0
 cmp r0,#0
 bge 4f         @ x positive, return signed 0
 ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
 ldr r0,[r3]    @ x negative, return +/- pi
 asrs r1,#31
 eors r0,r1
 b 7f
4:
 asrs r0,r1,#31
 b 7f
3:
 movs r2,#0              @ initial angle
 ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
 cmp r0,#0               @ x negative
 bge 5f
 rsbs r0,#0              @ rotate to 1st/4th quadrants
 rsbs r1,#0
 ldr r2,[r3]             @ pi Q29
5:
 movs r4,#1              @ m=1
 ldr r5, =0x2b97         @ cordic_vec
 blx r5                  @ also produces magnitude (with scaling factor 1.646760119), which is discarded
 mov r0,r2               @ result here is -pi/2..3pi/2 Q29
@ asrs r2,#29
@ subs r0,r2
 ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
 ldr r2,[r3]             @ pi Q29
 adds r4,r0,r2           @ attempt to fix -3pi/2..-pi case
 bcs 6f                  @ -pi/2..0? leave result as is
 subs r4,r0,r2           @ <pi? leave as is
 bmi 6f
 subs r0,r4,r2           @ >pi: take off 2pi
6:
 subs r0,#1              @ fiddle factor so atan2(0,1)==0
7:
 movs r2,#0              @ exponent for pack
 ldr r3, =0x2b19
 bx r3 // tail jump into ROM; this function does not return through its own code

float_section float232_shims

// float2int_shim
//   In:  r0 = float bit pattern.
//   Sets r1 = 0 and drops straight into float2fix_shim.
regular_func float2int_shim
    movs    r1, #0

// float2fix_shim
//   In:  r0 = float bit pattern; r1 is handed on unchanged to the ROM routine
//        (presumably the number of fraction bits - confirm against the ROM).
//   Out: r0 = 0 when the sign bit is set and the exponent field is zero
//        (-0 or a negative denormal); otherwise whatever the ROM routine at
//        0x2acd returns, reached by tail jump.
//   Scratch: r2.
regular_func float2fix_shim
    asrs    r2, r0, #23             // r2 = sign and exponent, sign-extended
    adds    r2, #128                // add 256 in two steps (8-bit immediates);
    adds    r2, #128                // the sum is 0 only for sign=1, exponent=0
    beq     .Lfloat2fix_neg_tiny
    ldr     r2, =0x2acd             // every other input: use the ROM implementation
    bx      r2
.Lfloat2fix_neg_tiny:
    movs    r0, #0
    bx      lr

float_section float264_shims

// float to 64-bit integer / fixed point conversions.
//
//   float2int64_shim  : r0 = float, result signed (sets r1 = 0)
//   float2fix64_shim  : r0 = float, r1 = places after the point, result signed
//   float2uint64_shim : r0 = float, result unsigned (sets r1 = 0)
//   float2ufix64_shim : r0 = float, r1 = places after the point, result unsigned
// The 64-bit result is returned in r0 (low word) : r1 (high word).
// The signed entry points call f2fix and then branch to d2f64_a, which clamps
// a result whose sign disagrees with r3. The unsigned entry points return 0
// for any input with the sign bit set and otherwise fall into f2fix, whose
// return goes straight back to the caller.

regular_func float2int64_shim
 movs r1,#0                    @ and fall through
regular_func float2fix64_shim
 push {r14}
 bl f2fix
 b d2f64_a // d2f64_a finishes with pop {r15}, consuming the lr pushed above

regular_func float2uint64_shim
 movs r1,#0                    @ and fall through
regular_func float2ufix64_shim
 asrs r3,r0,#23                @ negative? return 0
 bmi ret_dzero
@ and fall through

@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
f2fix:
 push {r4,r14} // popped again by the zero/Inf exits below or by d2fix_a
 mov r12,r1 // keep the requested places after the point for d2fix_a
 asrs r3,r0,#31 // r3 = 0 for a clear sign bit, -1 for a set one
 lsls r0,#1 // drop the sign bit: exponent field now in bits 31..24
 lsrs r2,r0,#24
 beq 1f                        @ zero?
 cmp r2,#0xff                  @ Inf?
 beq 2f
 subs r1,r2,#1
 subs r2,#0x7f                 @ remove exponent bias
 lsls r1,#24
 subs r0,r1                    @ insert implied 1
 eors r0,r3
 subs r0,r3                    @ top two's complement
 asrs r1,r0,#4                 @ convert to double format
 lsls r0,#28
 ldr r4, =d2fix_a
 bx r4 // tail jump; d2fix_a returns with pop {r4,r15}
1:
 // exponent field 0 (zero or denormal): result is 0 with r3 = 0
 movs r0,#0
 movs r1,r0
 movs r3,r0
 pop {r4,r15}
2:
 // exponent field 0xff: all ones for a positive input, all zeros for a negative one
 mvns r0,r3                    @ return max/min value
 mvns r1,r3
 pop {r4,r15}

ret_dzero:
 movs r0,#0
 movs r1,#0
 bx r14

float_section d2fix_a_float

// d2fix_a: shift a 64-bit two's complement mantissa into fixed-point position.
//   In:  r0:r1 = mantissa (r0 low word, r1 high word)
//        r2    = unbiased exponent
//        r3    = sign extension bits (0 or -1)
//        r12   = places after the point (f2fix copies its r1 there)
//        stack = saved r4 then the return address (f2fix does push {r4,r14})
//   Out: r0:r1 = result, r3 unchanged; every exit is pop {r4,r15}.
//   The shift applied is r2 + r12 - 52: left when positive (clamping once it
//   reaches 12), right when negative.
// NOTE(review): the .weak comment below says the symbol also exists in the
// "float shims", but this file is the float shim; the other definition is
// presumably in the double shim file - confirm.
.weak d2fix_a // weak because it exists in float shims too
.thumb_func
d2fix_a:
@ here
@ r0:r1 two's complement mantissa
@ r2    unbiased exponent
@ r3    mantissa sign extension bits
 add r2,r12                    @ exponent plus offset for required binary point position
 subs r2,#52                   @ required shift
 bmi 1f                        @ shift down?
@ here a shift up by r2 places
 cmp r2,#12                    @ will clamp?
 bge 2f
 movs r4,r0 // keep the low word so its top bits can be carried into r1
 lsls r1,r2
 lsls r0,r2
 rsbs r2,#0
 adds r2,#32                   @ complementary shift
 lsrs r4,r2
 orrs r1,r4
 pop {r4,r15}
2:
 mvns r0,r3
 mvns r1,r3                    @ overflow: clamp to extreme fixed-point values
 pop {r4,r15}
1:
@ here a shift down by -r2 places
 adds r2,#32
 bmi 1f                        @ long shift?
 // shift down by 1..32 places: r4 collects the bits that move from r1 into r0
 mov r4,r1
 lsls r4,r2
 rsbs r2,#0
 adds r2,#32                   @ complementary shift
 asrs r1,r2
 lsrs r0,r2
 orrs r0,r4
 pop {r4,r15}
1:
@ here a long shift down
 movs r0,r1
 asrs r1,#31                   @ shift down 32 places
 adds r2,#32
 bmi 1f                        @ very long shift?
 rsbs r2,#0
 adds r2,#32
 asrs r0,r2
 pop {r4,r15}
1:
 movs r0,r3                    @ result very near zero: use sign extension bits
 movs r1,r3
 pop {r4,r15}
// d2f64_a: final range check for the signed 64-bit conversions.
//   In:  r0:r1 = result from f2fix, r3 = sign extension bits (0 or -1),
//        stack = return address only (float2fix64_shim does push {r14}).
//   If the sign of r1 matches r3 the result is returned unchanged. Otherwise
//   it is replaced by r0 = ~r3 and r1 = 0x80000000 ^ ~r3, i.e.
//   0x7fffffff:0xffffffff for r3 = 0 and 0x80000000:0x00000000 for r3 = -1.
d2f64_a:
 asrs r2,r1,#31
 cmp r2,r3
 bne 1f                        @ sign extension bits fail to match sign of result?
 pop {r15}
1:
 mvns r0,r3
 movs r1,#1
 lsls r1,#31
 eors r1,r1,r0                 @ generate extreme fixed-point values
 pop {r15}

float_section float2double_shim

// float2double_shim
//   In:  r0 = single-precision bit pattern.
//   Out: r0 = low word, r1 = high word of the double-precision result.
//   Scratch: r2, r3.
//   Exponent field 0    (zero or denormal): signed zero is returned.
//   Exponent field 0xff (Inf or NaN):       signed infinity is returned.
//   Anything else: the exponent is rebiased and the mantissa carried over.
regular_func float2double_shim
    lsrs    r3, r0, #31
    lsls    r3, r3, #31             // r3 = sign bit alone, in bit 31
    lsls    r1, r0, #1              // drop the sign: exponent field in bits 31..24
    lsrs    r2, r1, #24             // r2 = exponent field; Z set when it is 0
    beq     .Lf2d_zero
    cmp     r2, #0xff
    beq     .Lf2d_inf
    lsrs    r1, r1, #4              // exponent in bits 27..20, top 20 mantissa bits below
    ldr     r2, =0x38000000         // (0x3ff - 0x7f) << 20: difference in exponent biases
    adds    r1, r1, r2
    orrs    r1, r3                  // reinstate the sign
    lsls    r0, r0, #29             // last 3 mantissa bits head the low word
    bx      lr

.Lf2d_zero:
    movs    r1, r3                  // high word carries only the sign
.Lf2d_clear_low:
    movs    r0, #0
    bx      lr

.Lf2d_inf:
    ldr     r1, =0x7ff00000         // exponent all ones, mantissa zero
    adds    r1, r1, r3              // add in the sign bit
    b       .Lf2d_clear_low

#endif
Initial Release 2021-01-20 16:44:27 +00:00			`/*`
			`* Copyright (c) 2020 Raspberry Pi (Trading) Ltd.`
			`*`
			`* SPDX-License-Identifier: BSD-3-Clause`
			`*/`

			`#include "pico/asm_helper.S"`

			`#if PICO_FLOAT_SUPPORT_ROM_V1`
			`.syntax unified`
			`.cpu cortex-m0plus`
			`.thumb`

			`#ifndef PICO_FLOAT_IN_RAM`
			`#define PICO_FLOAT_IN_RAM 0`
			`#endif`

			`.macro float_section name`
			`// todo separate flag for shims?`
			`#if PICO_FLOAT_IN_RAM`
			`.section RAM_SECTION_NAME(\name), "ax"`
			`#else`
			`.section SECTION_NAME(\name), "ax"`
			`#endif`
			`.endm`

			`float_section float_table_shim_on_use_helper`
			`regular_func float_table_shim_on_use_helper`
			`push {r0-r2, lr}`
			`mov r0, ip`
			`#ifndef NDEBUG`
			`// sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro`
			`cmp r0, #0`
			`bne 1f`
			`bkpt #0`
			`#endif`
			`1:`
			`ldrh r1, [r0]`
			`lsrs r2, r1, #8`
			`adds r0, #2`
			`cmp r2, #0xdf`
			`bne 1b`
			`uxtb r1, r1 // r1 holds table offset`
			`lsrs r2, r0, #2`
			`bcc 1f`
			`// unaligned`
			`ldrh r2, [r0, #0]`
			`ldrh r0, [r0, #2]`
			`lsls r0, #16`
			`orrs r0, r2`
			`b 2f`
			`1:`
			`ldr r0, [r0]`
			`2:`
			`ldr r2, =sf_table`
			`str r0, [r2, r1]`
			`str r0, [sp, #12]`
			`pop {r0-r2, pc}`

			`float_section 642float_shims`

			`@ convert uint64 to float, rounding`
			`regular_func uint642float_shim`
			`movs r2,#0 @ fall through`

			`@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2`
			`regular_func ufix642float_shim`
			`push {r4,r5,r14}`
			`cmp r1,#0`
			`bpl 3f @ positive? we can use signed code`
			`lsls r5,r1,#31 @ contribution to sticky bits`
			`orrs r5,r0`
			`lsrs r0,r1,#1`
			`subs r2,#1`
			`b 4f`

			`@ convert int64 to float, rounding`
			`regular_func int642float_shim`
			`movs r2,#0 @ fall through`

			`@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2`
			`regular_func fix642float_shim`
			`push {r4,r5,r14}`
			`3:`
			`movs r5,r0`
			`orrs r5,r1`
			`beq ret_pop45 @ zero? return +0`
			`asrs r5,r1,#31 @ sign bits`
			`2:`
			`asrs r4,r1,#24 @ try shifting 7 bits at a time`
			`cmp r4,r5`
			`bne 1f @ next shift will overflow?`
			`lsls r1,#7`
			`lsrs r4,r0,#25`
			`orrs r1,r4`
			`lsls r0,#7`
			`adds r2,#7`
			`b 2b`
			`1:`
			`movs r5,r0`
			`movs r0,r1`
			`4:`
			`rsbs r2,#0`
			`adds r2,#32+29`

			`// bl packx`
			`ldr r1, =0x29ef // packx`
			`blx r1`
			`ret_pop45:`
			`pop {r4,r5,r15}`

			`float_section fatan2_shim`
			`regular_func fatan2_shim`
			`push {r4,r5,r14}`

			`ldr r4, =0x29c1 // unpackx`
			`mov ip, r4`
			`@ unpack arguments and shift one down to have common exponent`
			`blx ip`
			`mov r4,r0`
			`mov r0,r1`
			`mov r1,r4`
			`mov r4,r2`
			`mov r2,r3`
			`mov r3,r4`
			`blx ip`
			`lsls r0,r0,#5 @ Q28`
			`lsls r1,r1,#5 @ Q28`
			`adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise`
			`asrs r4,#9`
			`adds r4,#1`
			`bmi 2f @ force y to 0 proper, so result will be zero`
			`subs r4,r2,r3 @ calculate shift`
			`bge 1f @ ex>=ey?`
			`rsbs r4,#0 @ make shift positive`
			`asrs r0,r4`
			`cmp r4,#28`
			`blo 3f`
			`asrs r0,#31`
			`b 3f`
			`1:`
			`asrs r1,r4`
			`cmp r4,#28`
			`blo 3f`
			`2:`
			`@ here \|x\|>>\|y\| or both x and y are ±0`
			`cmp r0,#0`
			`bge 4f @ x positive, return signed 0`
			`ldr r3, =0x2cfc @ &pi_q29, circular coefficients`
			`ldr r0,[r3] @ x negative, return +/- pi`
			`asrs r1,#31`
			`eors r0,r1`
			`b 7f`
			`4:`
			`asrs r0,r1,#31`
			`b 7f`
			`3:`
			`movs r2,#0 @ initial angle`
			`ldr r3, =0x2cfc @ &pi_q29, circular coefficients`
			`cmp r0,#0 @ x negative`
			`bge 5f`
			`rsbs r0,#0 @ rotate to 1st/4th quadrants`
			`rsbs r1,#0`
			`ldr r2,[r3] @ pi Q29`
			`5:`
			`movs r4,#1 @ m=1`
			`ldr r5, =0x2b97 @ cordic_vec`
			`blx r5 @ also produces magnitude (with scaling factor 1.646760119), which is discarded`
			`mov r0,r2 @ result here is -pi/2..3pi/2 Q29`
			`@ asrs r2,#29`
			`@ subs r0,r2`
			`ldr r3, =0x2cfc @ &pi_q29, circular coefficients`
			`ldr r2,[r3] @ pi Q29`
			`adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case`
			`bcs 6f @ -pi/2..0? leave result as is`
			`subs r4,r0,r2 @ <pi? leave as is`
			`bmi 6f`
			`subs r0,r4,r2 @ >pi: take off 2pi`
			`6:`
			`subs r0,#1 @ fiddle factor so atan2(0,1)==0`
			`7:`
			`movs r2,#0 @ exponent for pack`
			`ldr r3, =0x2b19`
			`bx r3`

			`float_section float232_shims`

			`regular_func float2int_shim`
			`movs r1,#0 @ fall through`
			`regular_func float2fix_shim`
			`// check for -0 or -denormal upfront`
			`asrs r2, r0, #23`
			`adds r2, #128`
			`adds r2, #128`
			`beq 1f`
			`// call original`
			`ldr r2, =0x2acd`
			`bx r2`
			`1:`
			`movs r0, #0`
			`bx lr`

			`float_section float264_shims`

			`regular_func float2int64_shim`
			`movs r1,#0 @ and fall through`
			`regular_func float2fix64_shim`
			`push {r14}`
			`bl f2fix`
			`b d2f64_a`

			`regular_func float2uint64_shim`
			`movs r1,#0 @ and fall through`
			`regular_func float2ufix64_shim`
			`asrs r3,r0,#23 @ negative? return 0`
			`bmi ret_dzero`
			`@ and fall through`

			`@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf`
			`@ result clamped so that r3 can only be 0 or -1`
			`@ trashes r12`
			`.thumb_func`
			`f2fix:`
			`push {r4,r14}`
			`mov r12,r1`
			`asrs r3,r0,#31`
			`lsls r0,#1`
			`lsrs r2,r0,#24`
			`beq 1f @ zero?`
			`cmp r2,#0xff @ Inf?`
			`beq 2f`
			`subs r1,r2,#1`
			`subs r2,#0x7f @ remove exponent bias`
			`lsls r1,#24`
			`subs r0,r1 @ insert implied 1`
			`eors r0,r3`
			`subs r0,r3 @ top two's complement`
			`asrs r1,r0,#4 @ convert to double format`
			`lsls r0,#28`
			`ldr r4, =d2fix_a`
			`bx r4`
			`1:`
			`movs r0,#0`
			`movs r1,r0`
			`movs r3,r0`
			`pop {r4,r15}`
			`2:`
			`mvns r0,r3 @ return max/min value`
			`mvns r1,r3`
			`pop {r4,r15}`

			`ret_dzero:`
			`movs r0,#0`
			`movs r1,#0`
			`bx r14`

			`float_section d2fix_a_float`

			`.weak d2fix_a // weak because it exists in float shims too`
			`.thumb_func`
			`d2fix_a:`
			`@ here`
			`@ r0:r1 two's complement mantissa`
			`@ r2 unbaised exponent`
			`@ r3 mantissa sign extension bits`
			`add r2,r12 @ exponent plus offset for required binary point position`
			`subs r2,#52 @ required shift`
			`bmi 1f @ shift down?`
			`@ here a shift up by r2 places`
			`cmp r2,#12 @ will clamp?`
			`bge 2f`
			`movs r4,r0`
			`lsls r1,r2`
			`lsls r0,r2`
			`rsbs r2,#0`
			`adds r2,#32 @ complementary shift`
			`lsrs r4,r2`
			`orrs r1,r4`
			`pop {r4,r15}`
			`2:`
			`mvns r0,r3`
			`mvns r1,r3 @ overflow: clamp to extreme fixed-point values`
			`pop {r4,r15}`
			`1:`
			`@ here a shift down by -r2 places`
			`adds r2,#32`
			`bmi 1f @ long shift?`
			`mov r4,r1`
			`lsls r4,r2`
			`rsbs r2,#0`
			`adds r2,#32 @ complementary shift`
			`asrs r1,r2`
			`lsrs r0,r2`
			`orrs r0,r4`
			`pop {r4,r15}`
			`1:`
			`@ here a long shift down`
			`movs r0,r1`
			`asrs r1,#31 @ shift down 32 places`
			`adds r2,#32`
			`bmi 1f @ very long shift?`
			`rsbs r2,#0`
			`adds r2,#32`
			`asrs r0,r2`
			`pop {r4,r15}`
			`1:`
			`movs r0,r3 @ result very near zero: use sign extension bits`
			`movs r1,r3`
			`pop {r4,r15}`
			`d2f64_a:`
			`asrs r2,r1,#31`
			`cmp r2,r3`
			`bne 1f @ sign extension bits fail to match sign of result?`
			`pop {r15}`
			`1:`
			`mvns r0,r3`
			`movs r1,#1`
			`lsls r1,#31`
			`eors r1,r1,r0 @ generate extreme fixed-point values`
			`pop {r15}`

			`float_section float2double_shim`
			`regular_func float2double_shim`
			`lsrs r3,r0,#31 @ sign bit`
			`lsls r3,#31`
			`lsls r1,r0,#1`
			`lsrs r2,r1,#24 @ exponent`
			`beq 1f @ zero?`
			`cmp r2,#0xff @ Inf?`
			`beq 2f`
			`lsrs r1,#4 @ exponent and top 20 bits of mantissa`
			`ldr r2,=#(0x3ff-0x7f)<<20 @ difference in exponent offsets`
			`adds r1,r2`
			`orrs r1,r3`
			`lsls r0,#29 @ bottom 3 bits of mantissa`
			`bx r14`
			`1:`
			`movs r1,r3 @ return signed zero`
			`3:`
			`movs r0,#0`
			`bx r14`
			`2:`
			`ldr r1,=#0x7ff00000 @ return signed infinity`
			`adds r1,r3`
			`b 3b`

			`#endif`