/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"

#if PICO_FLOAT_SUPPORT_ROM_V1

.syntax unified
.cpu cortex-m0plus
.thumb

#ifndef PICO_FLOAT_IN_RAM
#define PICO_FLOAT_IN_RAM 0
#endif

.macro float_section name
// todo separate flag for shims?
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

float_section float_table_shim_on_use_helper
// first-use helper for shimmed sf_table entries: scans forward from the address in ip
// (set up by the shimmable_table_tail_call macro) for a halfword whose top byte is 0xdf;
// the bottom byte of that halfword is the sf_table offset, and the (possibly unaligned)
// word following it is the address to install. That address is written into sf_table and
// into the stacked lr, so the final pop tail-calls straight into the installed function.
regular_func float_table_shim_on_use_helper
    push {r0-r2, lr}
    mov r0, ip
#ifndef NDEBUG
    // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
    cmp r0, #0
    bne 1f
    bkpt #0
#endif
1:
    ldrh r1, [r0]
    lsrs r2, r1, #8
    adds r0, #2
    cmp r2, #0xdf
    bne 1b
    uxtb r1, r1             // r1 holds table offset
    lsrs r2, r0, #2
    bcc 1f
    // unaligned
    ldrh r2, [r0, #0]
    ldrh r0, [r0, #2]
    lsls r0, #16
    orrs r0, r2
    b 2f
1:
    ldr r0, [r0]
2:
    ldr r2, =sf_table
    str r0, [r2, r1]        // patch the table entry
    str r0, [sp, #12]       // overwrite the stacked lr
    pop {r0-r2, pc}         // return into the newly installed function

float_section 642float_shims

@ convert uint64 to float, rounding
regular_func uint642float_shim
    movs r2,#0              @ fall through

@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float_shim
    push {r4,r5,r14}
    cmp r1,#0
    bpl 3f                  @ positive? we can use signed code
    lsls r5,r1,#31          @ contribution to sticky bits
    orrs r5,r0
    lsrs r0,r1,#1
    subs r2,#1
    b 4f

@ convert int64 to float, rounding
regular_func int642float_shim
    movs r2,#0              @ fall through

@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float_shim
    push {r4,r5,r14}
3:
    movs r5,r0
    orrs r5,r1
    beq ret_pop45           @ zero? return +0
    asrs r5,r1,#31          @ sign bits
2:
    asrs r4,r1,#24          @ try shifting 7 bits at a time
    cmp r4,r5
    bne 1f                  @ next shift will overflow?
    lsls r1,#7
    lsrs r4,r0,#25
    orrs r1,r4
    lsls r0,#7
    adds r2,#7
    b 2b
1:
    movs r5,r0
    movs r0,r1
4:
    rsbs r2,#0
    adds r2,#32+29
//  bl packx
    ldr r1, =0x29ef         // packx
    blx r1
ret_pop45:
    pop {r4,r5,r15}

float_section fatan2_shim
regular_func fatan2_shim
    push {r4,r5,r14}
    ldr r4, =0x29c1         // unpackx
    mov ip, r4
@ unpack arguments and shift one down to have common exponent
    blx ip
    mov r4,r0
    mov r0,r1
    mov r1,r4
    mov r4,r2
    mov r2,r3
    mov r3,r4
    blx ip
    lsls r0,r0,#5           @ Q28
    lsls r1,r1,#5           @ Q28
    adds r4,r2,r3           @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
    asrs r4,#9
    adds r4,#1
    bmi 2f                  @ force y to 0 proper, so result will be zero
    subs r4,r2,r3           @ calculate shift
    bge 1f                  @ ex>=ey?
    rsbs r4,#0              @ make shift positive
    asrs r0,r4
    cmp r4,#28
    blo 3f
    asrs r0,#31
    b 3f
1:
    asrs r1,r4
    cmp r4,#28
    blo 3f
2:
@ here |x|>>|y| or both x and y are ±0
    cmp r0,#0
    bge 4f                  @ x positive, return signed 0
    ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
    ldr r0,[r3]             @ x negative, return +/- pi
    asrs r1,#31
    eors r0,r1
    b 7f
4:
    asrs r0,r1,#31
    b 7f
3:
    movs r2,#0              @ initial angle
    ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
    cmp r0,#0               @ x negative
    bge 5f
    rsbs r0,#0              @ rotate to 1st/4th quadrants
    rsbs r1,#0
    ldr r2,[r3]             @ pi Q29
5:
    movs r4,#1              @ m=1
    ldr r5, =0x2b97         @ cordic_vec
    blx r5                  @ also produces magnitude (with scaling factor 1.646760119), which is discarded
    mov r0,r2               @ result here is -pi/2..3pi/2 Q29
@   asrs r2,#29
@   subs r0,r2
    ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
    ldr r2,[r3]             @ pi Q29
    adds r4,r0,r2           @ attempt to fix -3pi/2..-pi case
    bcs 6f                  @ -pi/2..0? leave result as is
    subs r4,r0,r2           @ pi: take off 2pi
6:
    subs r0,#1              @ fiddle factor so atan2(0,1)==0
7:
    movs r2,#0              @ exponent for pack
    ldr r3, =0x2b19
    bx r3

float_section float232_shims
regular_func float2int_shim
    movs r1,#0              @ fall through
regular_func float2fix_shim
    // check for -0 or -denormal upfront
    asrs r2, r0, #23
    adds r2, #128
    adds r2, #128
    beq 1f
    // call original
    ldr r2, =0x2acd
    bx r2
1:
    movs r0, #0
    bx lr

float_section float264_shims
regular_func float2int64_shim
    movs r1,#0              @ and fall through
regular_func float2fix64_shim
    push {r14}
    bl f2fix
    b d2f64_a

regular_func float2uint64_shim
    movs r1,#0              @ and fall through
regular_func float2ufix64_shim
    asrs r3,r0,#23          @ negative? return 0
    bmi ret_dzero
@ and fall through

@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
f2fix:
    push {r4,r14}
    mov r12,r1
    asrs r3,r0,#31
    lsls r0,#1
    lsrs r2,r0,#24
    beq 1f                  @ zero?
    cmp r2,#0xff            @ Inf?
    beq 2f
    subs r1,r2,#1
    subs r2,#0x7f           @ remove exponent bias
    lsls r1,#24
    subs r0,r1              @ insert implied 1
    eors r0,r3
    subs r0,r3              @ top two's complement
    asrs r1,r0,#4           @ convert to double format
    lsls r0,#28
    ldr r4, =d2fix_a
    bx r4
1:
    movs r0,#0
    movs r1,r0
    movs r3,r0
    pop {r4,r15}
2:
    mvns r0,r3              @ return max/min value
    mvns r1,r3
    pop {r4,r15}

ret_dzero:
    movs r0,#0
    movs r1,#0
    bx r14

float_section d2fix_a_float
.weak d2fix_a               // weak because it exists in the double shims too
.thumb_func
d2fix_a:
@ here
@ r0:r1 two's complement mantissa
@ r2    unbiased exponent
@ r3    mantissa sign extension bits
    add r2,r12              @ exponent plus offset for required binary point position
    subs r2,#52             @ required shift
    bmi 1f                  @ shift down?
@ here a shift up by r2 places
    cmp r2,#12              @ will clamp?
    bge 2f
    movs r4,r0
    lsls r1,r2
    lsls r0,r2
    rsbs r2,#0
    adds r2,#32             @ complementary shift
    lsrs r4,r2
    orrs r1,r4
    pop {r4,r15}
2:
    mvns r0,r3
    mvns r1,r3              @ overflow: clamp to extreme fixed-point values
    pop {r4,r15}
1:
@ here a shift down by -r2 places
    adds r2,#32
    bmi 1f                  @ long shift?
    mov r4,r1
    lsls r4,r2
    rsbs r2,#0
    adds r2,#32             @ complementary shift
    asrs r1,r2
    lsrs r0,r2
    orrs r0,r4
    pop {r4,r15}
1:
@ here a long shift down
    movs r0,r1
    asrs r1,#31             @ shift down 32 places
    adds r2,#32
    bmi 1f                  @ very long shift?
    rsbs r2,#0
    adds r2,#32
    asrs r0,r2
    pop {r4,r15}
1:
    movs r0,r3              @ result very near zero: use sign extension bits
    movs r1,r3
    pop {r4,r15}

d2f64_a:
    asrs r2,r1,#31
    cmp r2,r3
    bne 1f                  @ sign extension bits fail to match sign of result?
    pop {r15}
1:
    mvns r0,r3
    movs r1,#1
    lsls r1,#31
    eors r1,r1,r0           @ generate extreme fixed-point values
    pop {r15}

float_section float2double_shim
regular_func float2double_shim
    lsrs r3,r0,#31          @ sign bit
    lsls r3,#31
    lsls r1,r0,#1
    lsrs r2,r1,#24          @ exponent
    beq 1f                  @ zero?
    cmp r2,#0xff            @ Inf?
    beq 2f
    lsrs r1,#4              @ exponent and top 20 bits of mantissa
    ldr r2,=#(0x3ff-0x7f)<<20 @ difference in exponent offsets
    adds r1,r2
    orrs r1,r3
    lsls r0,#29             @ bottom 3 bits of mantissa
    bx r14
1:
    movs r1,r3              @ return signed zero
3:
    movs r0,#0
    bx r14
2:
    ldr r1,=#0x7ff00000     @ return signed infinity
    adds r1,r3
    b 3b

#endif