Fixup divider save_restore for floating point too; improve tests (#405)

- The divider state needs to be saved for __aeabi_ddiv, __aeabi_fdiv, __aeabi_dtan and __aeabi_ftan, or they won't work in interrupts (probably not used much, you'd hope) or on an RTOS context switch - Refactored the shared save/restore code out for the integer and floating point cases - Improved the floating point 'tests' in passing to check more return values against the GCC implementations - Added floating point usage to the IRQ nesting test case
2021-05-13 07:38:42 -05:00
parent c6c4eeb122
commit 574fdee37b
9 changed files with 501 additions and 158 deletions
--- a/src/rp2_common/hardware_divider/include/hardware/divider_helper.S
+++ b/src/rp2_common/hardware_divider/include/hardware/divider_helper.S
@ -0,0 +1,68 @@
 /*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
 #include "hardware/regs/addressmap.h"
 #include "hardware/regs/sio.h"
 #if SIO_DIV_CSR_READY_LSB == 0
 .equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
 #else
 need to change SHIFT above
 #endif
 #if SIO_DIV_CSR_DIRTY_LSB == 1
 .equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
 #else
 need to change SHIFT above
 #endif
 // save_div_state_and_lr: snapshot the hardware divider registers before the divider is reused.
 //
 // In:    r2 = SIO_BASE ptr
 // Out:   r4 = dividend, r5 = divisor, r6 = quotient, r7 = remainder (as read back from the divider)
 // Stack: pushes r4-r7, lr (5 words); paired with restore_div_state_and_return which pops them again
 //
 // requires that division started at least 2 cycles prior to the start of the macro
 .macro save_div_state_and_lr
 // originally we did this (polling CSR for READY before saving), however a) it uses r3, b) the push
 // takes 6 cycles, and c) any IRQ which uses the divider will necessarily put the data back, which
 // will immediately make it ready
 //
 //    // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
 //    // // wait for results as we can't save signed-ness of operation
 //    // 1:
 //    //     lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
 //    //     bcc 1b
 // 6 cycles
 push {r4, r5, r6, r7, lr}
 // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
 // (NOTE(review): presumably because reading QUOTIENT is what clears the DIRTY flag - confirm against the datasheet)
 // the operands are read back through the unsigned (UDIVI*) aliases
 ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
 ldr r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
 // remainder goes to r7 and quotient to r6, i.e. the read order differs from the register numbering
 ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
 ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
 .endm
 // restore_div_state_and_return: write back a divider snapshot taken by save_div_state_and_lr and return.
 //
 // In:    r2 = SIO_BASE ptr
 //        r4 = dividend, r5 = divisor, r6 = quotient, r7 = remainder (saved values)
 // Stack: pops r4-r7 and pc, so this macro ends the enclosing routine by returning to the address
 //        that was in lr when the matching push was done
 .macro restore_div_state_and_return
 // writing udividend (r4), udivisor (r5), remainder (r7), quotient (r6) in that order
 //
 // it is worth considering what happens if we are interrupted
 //
 // after writing r4: we are DIRTY and !READY
 //    ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
 //        saved/restored correctly and we'll restore the rest ourselves
 // after writing r4, r5: we are DIRTY and !READY
 //    ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
 //        at least will be saved/restored correctly and we'll restore the rest ourselves
 // after writing r4, r5, r7: we are DIRTY and READY
 //    ... interruptor using div will see the dividend, divisor, remainder registers as is (what we just
 //        restored ourselves), and we'll restore the quotient after the fact
 // note we do not use STM; not because it can be restarted due to an interrupt (which is harmless), but because
 //      this is 1 cycle IO space and so 4 individual stores are cheaper (and we don't have to adjust r2)
 // note also, that we must restore via UDIVI* rather than SDIVI* to prevent the quotient/remainder being negated on read based
 //      on the signs of the inputs
 str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
 str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
 // result registers: remainder first, quotient last (the mirror of the read order used when saving)
 str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
 str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
 // restore the caller's r4-r7 and return (pc receives the lr value pushed when the state was saved)
 pop {r4, r5, r6, r7, pc}
 .endm
--- a/src/rp2_common/pico_divider/divider.S
+++ b/src/rp2_common/pico_divider/divider.S
@ -4,8 +4,8 @@
 * SPDX-License-Identifier: BSD-3-Clause
 */
 #include "hardware/regs/sio.h"
 #include "hardware/regs/addressmap.h"
 #include "hardware/divider_helper.S"
 .syntax unified
 .cpu cortex-m0plus
@ -34,17 +34,6 @@
 #endif
 .endm
 #if SIO_DIV_CSR_READY_LSB == 0
 .equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
 #else
 need to change SHIFT above
 #endif
 #if SIO_DIV_CSR_DIRTY_LSB == 1
 .equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
 #else
 need to change SHIFT above
 #endif
@ wait 8-n cycles for the hardware divider
 .macro wait_div n
 .rept (8-\n) / 2
@ -56,58 +45,17 @@ need to change SHIFT above
 .endif
 .endm
 #if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
 #error register layout has changed - we rely on this order to make sure we save/restore in the right order
 #endif
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
 # SIO_BASE ptr in r2
 .macro save_div_state_and_lr
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    # wait for results as we can't save signed-ness of operation
 1:
    lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    push {r4, r5, r6, r7, lr}
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    ldr r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
 .endm
 .macro restore_div_state_and_return
    // writing sdividend (r4), sdivisor (r5), quotient (r6), remainder (r7) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
    //        saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
    //        at least will be saved/restored correctly and and we'll restore the rest ourselves
    // after writing r4, r5, r6: we are DIRTY and READY
    //    ... interruptor using div will dividend, divisor, quotient registers as is (what we just restored ourselves),
    //        and we'll restore the remainder after the fact
    // note we are not use STM not because it can be restarted due to interrupt which is harmless, more because this is 1 cycle IO space
    // and so 4 reads is cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    pop {r4, r5, r6, r7, pc}
 .endm
 .macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
 1:
    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
-    # wait for results as we can't save signed-ness of operation
+    // wait for results as we can't save signed-ness of operation
    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
@ -154,17 +102,18 @@ wrapper_func __aeabi_idivmod
 regular_func div_s32s32
 regular_func divmod_s32s32
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
 regular_func divmod_s32s32_unsafe
 #else
-# to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
+// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
-# in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
+// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
-# are the hardware_divider functions that can be used instead anyway
+// are the hardware_divider functions that can be used instead anyway
 regular_func divmod_s32s32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =(SIO_BASE)
    mrs r3, PRIMASK
    cpsid i
@ -203,6 +152,8 @@ regular_func divmod_s32s32_unsafe
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
 .align 2
 regular_func divmod_s32s32_savestate
    // note that we must be at least 2 cycles into division at this point,
    // which we are because of the dirty check before getting here (and of course the function call before that)
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
@ -215,17 +166,18 @@ regular_func divmod_u32u32
 wrapper_func __aeabi_uidiv
 wrapper_func __aeabi_uidivmod
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
 regular_func divmod_u32u32_unsafe
 #else
-# to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
+// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
-# in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
+// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
-# are the hardware_divider functions that can be used instead anyway
+// are the hardware_divider functions that can be used instead anyway
 regular_func divmod_u32u32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =(SIO_BASE)
    mrs r3, PRIMASK
    cpsid i
@ -273,9 +225,9 @@ wrapper_func __aeabi_ldivmod
 regular_func div_s64s64
 regular_func divmod_s64s64
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
@ -287,6 +239,7 @@ divmod_s64s64_savestate:
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64
 #else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
@ -300,9 +253,9 @@ wrapper_func __aeabi_uldivmod
 regular_func div_u64u64
 regular_func divmod_u64u64
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2
    ldr r2, =(SIO_BASE)
    # to support IRQ usage we must save/restore
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
@ -314,6 +267,7 @@ regular_func divmod_u64u64_savestate
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
 #else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
--- a/src/rp2_common/pico_double/double_aeabi.S
+++ b/src/rp2_common/pico_double/double_aeabi.S
@ -6,6 +6,7 @@
 #include "pico/asm_helper.S"
 #include "pico/bootrom/sf_table.h"
 #include "hardware/divider_helper.S"
 __pre_init __aeabi_double_init, 00020
@ -131,16 +132,16 @@ regular_func pop_r8_r11
 mov r11,r7
 bx r14
-# note generally each function is in a separate section unless there is fall thru or branching between them
+// note generally each function is in a separate section unless there is fall thru or branching between them
-# note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
+// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
-# note functions are word aligned except where they are an odd number of linear instructions
+// note functions are word aligned except where they are an odd number of linear instructions
 // double FUNC_NAME(__aeabi_dadd)(double, double)         double-precision addition
 double_wrapper_section __aeabi_darithmetic
 // double FUNC_NAME(__aeabi_drsub)(double x, double y)    double-precision reverse subtraction, y - x
-# frsub first because it is the only one that needs alignment
+// frsub first because it is the only one that needs alignment
 .align 2
 wrapper_func __aeabi_drsub
    eors r0, r1
@ -177,7 +178,35 @@ wrapper_func_d2 __aeabi_ddiv
    b ddiv_dsub_nan_helper
 1:
 #endif
-   shimmable_table_tail_call SF_TABLE_FDIV ddiv_shim
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2
    ldr r2, =(SIO_BASE)
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs ddiv_save_state
    mov r2, ip
 #else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl ddiv_shim_call
    msr PRIMASK, r4
    pop {r4, pc}
 #endif
 ddiv_shim_call:
    shimmable_table_tail_call SF_TABLE_FDIV ddiv_shim
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
 // Slow path for __aeabi_ddiv, taken when the divider DIRTY check above set the carry flag:
 // save the divider registers, perform the division via ddiv_shim_call, then put the divider
 // registers back and return to the original caller.
 // On entry ip holds the caller's r2 (stashed before the check); r2 itself holds the shifted CSR value.
 ddiv_save_state:
    // r2 was overwritten by the CSR read, so reload the SIO base for the save macro
    ldr r2, =(SIO_BASE)
    // pushes r4-r7 and lr, then reads the divider registers into r4-r7
    save_div_state_and_lr
    // put back the r2 value stashed in ip so ddiv_shim_call sees the original arguments
    mov r2, ip
    // bl overwrites lr; the original return address is already on the stack from the push above
    bl ddiv_shim_call
    // r2 no longer holds the SIO base (it carried an argument), so reload it for the restore macro
    ldr r2, =(SIO_BASE)
    // writes r4-r7 back to the divider, then pops r4-r7 and pc (returning to the original caller)
    restore_div_state_and_return
 #endif
 ddiv_dsub_nan_helper:
 #if PICO_DOUBLE_PROPAGATE_NANS
@ -592,6 +621,8 @@ regular_func sincostan_remainder
    ldr r2, =0x54442D18 // 2 * M_PI
    ldr r3, =0x401921FB
    push {lr}
    // note remainder only uses the divider thru integer divider functions
    // which save and restore themselves
    bl remainder
    pop {pc}
@ -752,13 +783,40 @@ double_wrapper_section tan
 wrapper_func tan
    // rom version only works for -1024 < angle < 1024
    lsls r2, r1, #2
-    bcc 1f
+    bcc dtan_in_range
    lsrs r2, #22
    cmp r2, #9
-    bge 2f
+    bge dtan_angle_out_of_range
-1:
+dtan_in_range:
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2
    ldr r2, =(SIO_BASE)
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs dtan_save_state
    mov r2, ip
 #else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl dtan_shim_call
    msr PRIMASK, r4
    pop {r4, pc}
 #endif
 dtan_shim_call:
    shimmable_table_tail_call SF_TABLE_FTAN dtan_shim
-2:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
 // Slow path for tan, taken when the divider DIRTY check at dtan_in_range set the carry flag:
 // save the divider registers, run dtan_shim_call, then put the divider registers back and
 // return to the original caller.
 // On entry ip holds the r2 value from before the check; r2 itself holds the shifted CSR value.
 dtan_save_state:
    // r2 was overwritten by the CSR read, so reload the SIO base for the save macro
    ldr r2, =(SIO_BASE)
    // pushes r4-r7 and lr, then reads the divider registers into r4-r7
    save_div_state_and_lr
    // put back the r2 value stashed in ip, mirroring the fast path
    mov r2, ip
    // bl overwrites lr; the original return address is already on the stack from the push above
    bl dtan_shim_call
    // r2 no longer holds the SIO base, so reload it for the restore macro
    ldr r2, =(SIO_BASE)
    // writes r4-r7 back to the divider, then pops r4-r7 and pc (returning to the original caller)
    restore_div_state_and_return
 #endif
 dtan_angle_out_of_range:
 #if PICO_DOUBLE_PROPAGATE_NANS
    lsls r2, r1, #1
    asrs r2, #21
@ -775,7 +833,7 @@ wrapper_func tan
    bl sincostan_remainder
    pop {r2}
    mov lr, r2
-    b 1b
+    b dtan_in_range
 double_wrapper_section atan2
 wrapper_func_d2 atan2
--- a/src/rp2_common/pico_float/float_aeabi.S
+++ b/src/rp2_common/pico_float/float_aeabi.S
@ -6,6 +6,7 @@
 #include "pico/asm_helper.S"
 #include "pico/bootrom/sf_table.h"
 #include "hardware/divider_helper.S"
 __pre_init __aeabi_float_init, 00020
@ -104,16 +105,16 @@ __check_nan_f2:
 .endm
-# note generally each function is in a separate section unless there is fall thru or branching between them
+// note generally each function is in a separate section unless there is fall thru or branching between them
-# note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
+// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
-# note functions are word aligned except where they are an odd number of linear instructions
+// note functions are word aligned except where they are an odd number of linear instructions
 // float FUNC_NAME(__aeabi_fadd)(float, float)         single-precision addition
 float_wrapper_section __aeabi_farithmetic
 // float FUNC_NAME(__aeabi_frsub)(float x, float y)    single-precision reverse subtraction, y - x
-# frsub first because it is the only one that needs alignment
+// frsub first because it is the only one that needs alignment
 .align 2
 wrapper_func __aeabi_frsub
    eors r0, r1
@ -146,7 +147,30 @@ wrapper_func_f2 __aeabi_fdiv
    b fdiv_fsub_nan_helper
 1:
 #endif
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =(SIO_BASE)
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs fdiv_save_state
 #else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl fdiv_shim_call
    msr PRIMASK, r4
    pop {r4, pc}
 #endif
 fdiv_shim_call:
    table_tail_call SF_TABLE_FDIV
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
 // Slow path for __aeabi_fdiv, taken when the divider DIRTY check above set the carry flag:
 // save the divider registers, perform the division via fdiv_shim_call, then put the divider
 // registers back and return to the original caller.
 // On entry r2 still holds SIO_BASE (the CSR value was read into r3).
 fdiv_save_state:
    // pushes r4-r7 and lr, then reads the divider registers into r4-r7
    save_div_state_and_lr
    // bl overwrites lr; the original return address is already on the stack from the push above
    bl fdiv_shim_call
    // reload the SIO base for the restore macro (presumably because the callee is free to clobber r2)
    ldr r2, =(SIO_BASE)
    // writes r4-r7 back to the divider, then pops r4-r7 and pc (returning to the original caller)
    restore_div_state_and_return
 #endif
 fdiv_fsub_nan_helper:
 #if PICO_FLOAT_PROPAGATE_NANS
@ -689,10 +713,33 @@ wrapper_func tanf
    lsls r1, r0, #1
    lsrs r1, #24
    cmp r1, #127 + 7
-    bge 1f
+    bge ftan_out_of_range
-2:
+ftan_in_range:
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =(SIO_BASE)
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs ftan_save_state
 #else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl ftan_shim_call
    msr PRIMASK, r4
    pop {r4, pc}
 #endif
 ftan_shim_call:
    table_tail_call SF_TABLE_FTAN
-1:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
 // Slow path for tanf, taken when the divider DIRTY check at ftan_in_range set the carry flag:
 // save the divider registers, run ftan_shim_call, then put the divider registers back and
 // return to the original caller.
 // On entry r2 still holds SIO_BASE (the CSR value was read into r3).
 ftan_save_state:
    // pushes r4-r7 and lr, then reads the divider registers into r4-r7
    save_div_state_and_lr
    // bl overwrites lr; the original return address is already on the stack from the push above
    bl ftan_shim_call
    // reload the SIO base for the restore macro (presumably because the callee is free to clobber r2)
    ldr r2, =(SIO_BASE)
    // writes r4-r7 back to the divider, then pops r4-r7 and pc (returning to the original caller)
    restore_div_state_and_return
 #endif
 ftan_out_of_range:
 #if PICO_FLOAT_PROPAGATE_NANS
    // also check for infinites
    cmp r1, #255
@ -709,7 +756,7 @@ wrapper_func tanf
    bl remainderf
    pop {r1}
    mov lr, r1
-    b 2b
+    b ftan_in_range
 float_wrapper_section atan2f
 wrapper_func_f2 atan2f
--- a/test/pico_divider_test/CMakeLists.txt
+++ b/test/pico_divider_test/CMakeLists.txt
@ -12,7 +12,7 @@ if (PICO_ON_DEVICE)
    pico_add_extra_outputs(pico_divider_test)
    target_compile_definitions(pico_divider_test PRIVATE
-            PICO_DIVIDER_DISABLE_INTERRUPTS=1
+#            PICO_DIVIDER_DISABLE_INTERRUPTS=1
 #            TURBO
    )
--- a/test/pico_divider_test/pico_divider_nesting_test.c
+++ b/test/pico_divider_test/pico_divider_nesting_test.c
@ -15,6 +15,9 @@ volatile bool failed;
 volatile uint32_t count[3];
 volatile bool done;
 #define FAILED() ({ failed = true; })
 //#define FAILED() ({ failed = true; __breakpoint(); })
 bool timer_callback(repeating_timer_t *t) {
    count[0]++;
    static int z;
@ -23,9 +26,27 @@ bool timer_callback(repeating_timer_t *t) {
        int a = z / 7;
        int b = z % 7;
        if (z != a * 7 + b) {
-            failed = true;
+            FAILED();
        }
        a = z / -7;
        b = z % -7;
        if (z != a * -7 + b) {
            FAILED();
        }
    }
    // Exercise single and double precision division and fmod from the timer callback as well.
    //
    // Note x == (x / 11) * 11 + fmod(x, 11) is only an identity for truncating integer division,
    // so for real division we instead check that the quotient round-trips to within rounding
    // error, and separately that the remainder is smaller in magnitude than the divisor and
    // carries the sign of the dividend. The comparisons are written in negated form so that a
    // NaN result is reported as a failure too.
    float fz = z;
    float fa = fz / 11.0f;
    float fb = fmodf(fz, 11.0f);
    if (!(fabsf(fz - fa * 11.0f) <= fabsf(fz) * 1e-6f)) {
        FAILED();
    }
    if (!(fabsf(fb) < 11.0f) || (fb != 0.0f && (fb < 0.0f) != (fz < 0.0f))) {
        FAILED();
    }
    double dz = z;
    double da = dz / 11.0;
    double db = fmod(dz, 11.0);
    if (!(fabs(dz - da * 11.0) <= fabs(dz) * 1e-12)) {
        FAILED();
    }
    if (!(fabs(db) < 11.0) || (db != 0.0 && (db < 0.0) != (dz < 0.0))) {
        FAILED();
    }
    return !done;
 }
@ -41,16 +62,20 @@ void do_dma_start(uint ch) {
    dma_channel_configure(ch, &c, &word[ch], &word[ch], 513 + ch * 23, true);
 }
 double d0c, d0s, d0t, dz;
 float f0c, f0s, f0t, fz;
 void test_irq_handler0() {
    count[1]++;
    dma_hw->ints0 |= 1u;
    static uint z;
    static uint dz;
    for (int i=0; i<80;i++) {
        z += 31;
        uint a = z / 11;
        uint b = z % 11;
        if (z != a * 11 + b) {
-            failed = true;
+            FAILED();
        }
    }
    if (done) dma_channel_abort(0);
@ -66,16 +91,17 @@ void test_irq_handler1() {
        uint a = z / -13;
        uint b = z % -13;
        if (z != a * -13 + b) {
-            failed = true;
+            FAILED();
        }
        static uint64_t z64;
        z64 -= 47;
        uint64_t a64 = z64 / -13;
        uint64_t b64 = z64 % -13;
        if (z64 != a64 * -13 + b64) {
-            failed = true;
+            FAILED();
        }
    }
    if (done) dma_channel_abort(1);
    else      do_dma_start(1);
 }
@ -89,7 +115,7 @@ void test_nesting() {
    // They all busily make use of the dividers, to expose any issues with nested use
    repeating_timer_t timer;
-    add_repeating_timer_us(529, timer_callback, NULL, &timer);
+    add_repeating_timer_us(929, timer_callback, NULL, &timer);
    irq_set_exclusive_handler(DMA_IRQ_0, test_irq_handler0);
    irq_set_exclusive_handler(DMA_IRQ_1, test_irq_handler1);
@ -101,7 +127,7 @@ void test_nesting() {
    irq_set_enabled(DMA_IRQ_1, 1);
    do_dma_start(0);
    do_dma_start(1);
-    absolute_time_t end = delayed_by_ms(get_absolute_time(), 2000);
+    absolute_time_t end = delayed_by_ms(get_absolute_time(), 10000);
    int count_local=0;
    while (!time_reached(end)) {
        for(uint i=0;i<100;i++) {
@ -109,8 +135,39 @@ void test_nesting() {
            uint a = z / 11;
            uint b = z % 11;
            if (z != a * 11 + b) {
-                failed = true;
+                FAILED();
            }
            int zz = (int)z;
            int aa = zz / -11;
            int bb = zz % -11;
            if (zz != aa * -11 + bb) {
                FAILED();
            }
            aa = -zz / -11;
            bb = -zz % -11;
            if (-zz != aa * -11 + bb) {
                FAILED();
            }
            aa = -zz / 11;
            bb = -zz % 11;
            if (-zz != aa * 11 + bb) {
                FAILED();
            }
            a = 0xffffffffu / 11;
            b = 0xffffffffu % 11;
            if (0xffffffffu != a * 11 + b) {
                FAILED();
            }
        }
        // these use the divider
        for(uint i=0;i<=100;i+=20) {
            // both in and out of bootrom range (we perform mod in wrapper code if necessary)
            f0t = tanf(i * 50);
            f0c = cosf(i * 50);
            f0s = sinf(i * 50);
            d0t = tan(i * 1000);
            d0c = cos(i * 1000);
            d0s = sin(i * 1000);
        }
        count_local++;
    }
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@ -11,15 +11,19 @@ add_executable(pico_double_test
        )
 #todo split out variants with different flags
 target_compile_definitions(pico_float_test PRIVATE
        PICO_USE_CRT_PRINTF=1 # want full precision output
 #        PICO_FLOAT_PROPAGATE_NANS=1
 #        PICO_DIVIDER_DISABLE_INTERRUPTS=1
 )
 #todo split out variants with different flags
 target_compile_definitions(pico_double_test PRIVATE
        PICO_USE_CRT_PRINTF=1 # want full precision output
                PICO_FLOAT_PROPAGATE_NANS=1
-                PICO_DOUBLE_PROPAGATE_NANS=1
+                #PICO_DOUBLE_PROPAGATE_NANS=1
                #PICO_DIVIDER_DISABLE_INTERRUPTS=1
        )
 # handy for testing we aren't pulling in extra stuff
--- a/test/pico_float_test/pico_double_test.c
+++ b/test/pico_float_test/pico_double_test.c
@ -282,6 +282,51 @@ int test_dcmpun() {
    return 0;
 }
 #define assert_nan(a) assert(isnan(a))
 #define check_nan(a) ({ assert_nan(a); a; })
 double __aeabi_i2d(int32_t);
 double __aeabi_ui2d(int32_t);
 double __aeabi_l2d(int64_t);
 double __aeabi_ul2d(int64_t);
 int32_t __aeabi_d2iz(double);
 int64_t __aeabi_d2lz(double);
 double __aeabi_dmul(double, double);
 double __aeabi_ddiv(double, double);
 #if LIB_PICO_DOUBLE_PICO
 double __real___aeabi_i2d(int);
 double __real___aeabi_ui2d(int);
 double __real___aeabi_l2d(int64_t);
 double __real___aeabi_ul2d(int64_t);
 double __real___aeabi_dmul(double, double);
 double __real___aeabi_ddiv(double, double);
 int32_t __real___aeabi_d2iz(double);
 int64_t __real___aeabi_d2lz(double);
 double __real_sqrt(double);
 double __real_cos(double);
 double __real_sin(double);
 double __real_tan(double);
 double __real_exp(double);
 double __real_log(double);
 double __real_atan2(double, double);
 double __real_pow(double, double);
 double __real_trunc(double);
 double __real_ldexp(double, int);
 double __real_fmod(double, double);
 // absolute tolerance used when comparing a result against the __real_ implementation
 #define EPSILON 1e-9
 // Asserts that a and b differ by less than EPSILON, or are infinities of the same sign (for which
 // a - b is NaN and so never compares less than EPSILON).
 // The previous form, (b - a) < EPSILON || (a - b) < EPSILON, held for every non-NaN pair because one
 // of the two differences is always <= 0, so it could never fail.
 // Note the arguments are evaluated more than once; pass plain variables.
 #define assert_close(a, b) assert(fabs((a) - (b)) < EPSILON || (isinf(a) && isinf(b) && ((a) < 0) == ((b) < 0)))
 #define check1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); assert(r == r2); r; })
 #define check2(func,p0,p1) ({ typeof(p0) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); assert(r == r2); r; })
 #define check_close1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); if (isnan(p0)) assert_nan(r); else assert_close(r, r2); r; })
 #define check_close2(func,p0,p1) ({ typeof(p0) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); if (isnan(p0) || isnan(p1)) assert_nan(r); else assert_close(r, r2); r; })
 #else
 #define check1(func,p0) func(p0)
 #define check2(func,p0,p1) func(p0,p1)
 #define check_close1(func,p0) func(p0)
 #define check_close2(func,p0,p1) func(p0,p1)
 #endif
 double aa = 0.5;
 double bb = 1;
@ -305,14 +350,18 @@ int main() {
 #if 1
    for (double x = 0; x < 3; x++) {
        printf("\n ----- %g\n", x);
-        printf("SQRT %10.18g\n", sqrt(x));
+        printf("SQRT %10.18g\n", check_close1(sqrt, x));
-        printf("COS %10.18g\n", cos(x));
+        printf("COS %10.18g\n", check_close1(cos, x));
-        printf("SIN %10.18g\n", sin(x));
+        printf("SIN %10.18g\n", check_close1(sin, x));
-        printf("TAN %10.18g\n", tan(x));
+        printf("TAN %10.18g\n", check_close1(tan, x));
-        printf("ATAN2 %10.18g\n", atan2(x, 10));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, x, 10.0));
-        printf("ATAN2 %10.18g\n", atan2(10, x));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, 10.0, x));
-        printf("EXP %10.18g\n", exp(x));
+        printf("EXP %10.18g\n", check_close1(exp, x));
-        printf("LN %10.18g\n", log(x));
+        printf("LN %10.18g\n", check_close1(log, x));
        printf("POW %10.18f\n", check_close2(pow, x, x));
        printf("TRUNC %10.18f\n", check_close1(trunc, x));
        printf("LDEXP %10.18f\n", check_close2(ldexp, x, x));
        printf("FMOD %10.18f\n", check_close2(fmod, x, 3.0f));
        double s, c;
        sincos(x, &s, &c);
        printf("SINCOS %10.18f %10.18f\n", s, c);
@ -325,22 +374,21 @@ int main() {
 #if PICO_DOUBLE_PROPAGATE_NANS
    {
        float x = NAN;
-        printf("NANO %10.18f\n", x);
+        printf("SQRT %10.18g\n", check_close1(sqrt, x));
-        printf("SQRT %10.18f\n", sqrt(x));
+        printf("COS %10.18g\n", check_close1(cos, x));
-        printf("COS %10.18f\n", cos(x));
+        printf("SIN %10.18g\n", check_close1(sin, x));
-        printf("SIN %10.18f\n", sin(x));
+        printf("TAN %10.18g\n", check_close1(tan, x));
-        printf("TAN %10.18f\n", tan(x));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, x, 10.0));
-        printf("ATAN2 %10.18f\n", atan2(x, 10));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, 10.0, x));
-        printf("ATAN2 %10.18f\n", atan2(10, x));
+        printf("EXP %10.18g\n", check_close1(exp, x));
-        printf("EXP %10.18f\n", exp(x));
+        printf("LN %10.18g\n", check_close1(log, x));
-        printf("LN %10.18f\n", log(x));
+        printf("POW %10.18f\n", check_nan(pow(x, x)));
-        printf("POW %10.18f\n", pow(x, x));
+        printf("TRUNC %10.18f\n", check_nan(trunc(x)));
-        printf("TRUNC %10.18f\n", trunc(x));
+        printf("LDEXP %10.18f\n", check_nan(ldexp(x, x)));
-        printf("LDEXP %10.18f\n", ldexp(x, x));
+        printf("FMOD %10.18f\n", check_nan(fmod(x, 3.0f)));
        printf("FMOD %10.18f\n", fmod(x, 3.0f));
        double s, c;
        sincos(x, &s, &c);
-        printf("SINCOS %10.18f %10.18f\n", s, c);
+        printf("SINCOS %10.18f %10.18f\n", check_nan(s), check_nan(c));
        for(int j=0;j<2;j++) {
            for (int i = 1; i < 4; i++) {
@ -372,17 +420,21 @@ int main() {
 //        }
        for (int32_t x = -1; x; x <<= 1) {
            printf("i %d->%f\n", x, (double) x);
            check1(__aeabi_i2d, x);
        }
        for (int32_t x = 1; x; x <<= 1) {
            printf("i %d->%f\n", x, (double) x);
            check1(__aeabi_i2d, x);
            y = x << 1;
        }
        for (int64_t x = 1; x; x <<= 1) {
            printf("i %lld->%f\n", x, (double) x);
            check1(__aeabi_l2d, x);
            y = x << 1;
        }
        for (int64_t x = -1; x; x <<= 1) {
            printf("i %lld->%f\n", x, (double) x);
            check1(__aeabi_l2d, x);
            y = x << 1;
        }
        printf("d %d->%f\n", y, (float) y);
@ -392,24 +444,40 @@ int main() {
        uint32_t y;
        for(uint32_t x = 1; x; x <<= 1) {
            printf("u %u->%f\n", x, (double)x);
            check1(__aeabi_ui2d, x);
            y = x << 1;
        }
        printf("u %u->%f\n", y, (double)y);
    }
    for(int64_t x = 1; x !=0; x <<= 1u) {
        printf("%lld->%f\n", x, (double)x);
        check1(__aeabi_l2d, x);
    }
-    for(double x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
+    for(double x = -4294967296.f * 4294967296.f * 2.f; x<=-0.5f; x/=2.f) {
        printf("d2i64 %f->%lld\n", x, (int64_t)x);
        if (x < INT64_MIN) {
            // seems like there is a bug in the gcc version!
            assert(__aeabi_d2lz(x) == INT64_MIN);
        } else {
            check1(__aeabi_d2lz, x);
        }
    }
-    for(double x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
+    for(double x = 4294967296.f * 4294967296.f * 2.f; x>=0.5f; x/=2.f) {
        printf("d2i64 %f->%lld\n", x, (int64_t)x);
        if (x >= INT64_MAX) {
            // seems like there is a bug in the gcc version!
            assert(__aeabi_d2lz(x) == INT64_MAX);
        } else {
            check1(__aeabi_d2lz, x);
        }
    }
    for(double x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
        printf("d2i32 %f->%d\n", x, (int32_t)x);
        check1(__aeabi_d2iz, x);
    }
    for(double x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
        printf("d2i32 %f->%d\n", x, (int32_t)x);
        check1(__aeabi_d2iz, x);
    }
    for (double x = 1; x < 11; x += 2) {
@ -417,6 +485,8 @@ int main() {
        double g = 1.0 / x;
        printf("%g %10.18g %10.18g, %10.18g, %10.18g %10.18g\n", x, f, x + 0.37777777777777777777777777777,
               x - 0.377777777777777777777777777777, g, 123456789.0 / x);
        check2(__aeabi_dmul, x, x);
        check2(__aeabi_ddiv, 1.0, x);
    }
    if (fail ||
--- a/test/pico_float_test/pico_float_test.c
+++ b/test/pico_float_test/pico_float_test.c
@ -16,7 +16,6 @@
 #include <stdlib.h>
 #include <math.h>
 #include <pico/float.h>
 //#include <pico/float.h>
 #include "pico/stdlib.h"
 #include "inttypes.h"
@ -283,12 +282,58 @@ int test_fcmpun() {
    return 0;
 }
 // Assert that the floating point value a is a NaN.
 #define assert_nan(a) assert(isnan(a))
 // Assert that a is a NaN and yield a as the value of the expression (a is expanded twice).
 #define check_nan(a) ({ assert_nan(a); a; })
 // Prototypes for the AEABI single precision helpers that this test calls by name.
 // The unsigned conversions (ui2f, ul2f) are declared with unsigned parameters so that
 // values with the top bit set are passed without a signed conversion.
 float __aeabi_i2f(int32_t);
 float __aeabi_ui2f(uint32_t);
 float __aeabi_l2f(int64_t);
 float __aeabi_ul2f(uint64_t);
 int32_t __aeabi_f2iz(float);
 int64_t __aeabi_f2lz(float);
 float __aeabi_fmul(float, float);
 float __aeabi_fdiv(float, float);
 #if LIB_PICO_FLOAT_PICO
 // Reference versions of the functions under test. The check macros paste the
 // __real_ prefix onto the function name and compare both results.
 // NOTE(review): presumably these resolve to the toolchain implementations via
 // linker wrapping - confirm against the build configuration.
 // The unsigned conversions take unsigned arguments.
 float __real___aeabi_i2f(int);
 float __real___aeabi_ui2f(uint32_t);
 float __real___aeabi_l2f(int64_t);
 float __real___aeabi_ul2f(uint64_t);
 float __real___aeabi_fmul(float, float);
 float __real___aeabi_fdiv(float, float);
 int32_t __real___aeabi_f2iz(float);
 int64_t __real___aeabi_f2lz(float);
 float __real_sqrtf(float);
 float __real_cosf(float);
 float __real_sinf(float);
 float __real_tanf(float);
 float __real_expf(float);
 float __real_logf(float);
 float __real_atan2f(float, float);
 float __real_powf(float, float);
 float __real_truncf(float);
 float __real_ldexpf(float, int);
 float __real_fmodf(float, float);
 // Tolerance for assert_close, applied relative to the larger magnitude (with a floor of 1.0).
 // NOTE(review): 1e-5 is sized for single precision results; tighten it once the
 // accuracy of the implementations under test is confirmed.
 #define EPSILON 1e-5
 // Passes when a and b agree to within the tolerance, or are infinities of the same sign.
 // The earlier form ((b - a) < EPSILON || (a - b) < EPSILON) held for every pair of
 // finite values, so it never rejected anything.
 #define assert_close(a, b) assert((fabs((a) - (b)) < EPSILON * fmax(1.0, fmax(fabs(a), fabs(b)))) || (isinf(a) && isinf(b) && ((a) < 0) == ((b) < 0)))
 // Each macro calls func and its __real_ counterpart with the same arguments,
 // asserts on the comparison, and yields the result of func.
 // Results are held in the function's return type (not the argument type), so
 // conversions such as float -> int64_t are compared without an extra rounding step.
 // check1/check2 require equal results; check_close1/check_close2 require a NaN
 // result for NaN inputs and otherwise use assert_close.
 #define check1(func,p0) ({ typeof(func(p0)) r = func(p0), r2 = __CONCAT(__real_, func)(p0); assert(r == r2); r; })
 #define check2(func,p0,p1) ({ typeof(func(p0,p1)) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); assert(r == r2); r; })
 #define check_close1(func,p0) ({ typeof(func(p0)) r = func(p0), r2 = __CONCAT(__real_, func)(p0); if (isnan(p0)) assert_nan(r); else assert_close(r, r2); r; })
 #define check_close2(func,p0,p1) ({ typeof(func(p0,p1)) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); if (isnan(p0) || isnan(p1)) assert_nan(r); else assert_close(r, r2); r; })
 #else
 // Fallback used when LIB_PICO_FLOAT_PICO is not set: each check macro simply
 // calls func with the given arguments and performs no comparison.
 #define check1(func,p0) func(p0)
 #define check2(func,p0,p1) func(p0,p1)
 #define check_close1(func,p0) func(p0)
 #define check_close2(func,p0,p1) func(p0,p1)
 #endif
 // Operands of the double comparison (aa < bb) that main() prints first.
 // NOTE(review): presumably file-scope so the comparison is not folded at compile time - confirm.
 double aa = 0.5;
 double bb = 1;
 int main() {
    setup_default_uart();
    bool fail = false;
    printf("%d\n", aa < bb);
    for(float a = -1; a <= 1; a++) {
        for(float b = -1; b <= 1; b++) {
@ -341,21 +386,27 @@ int main() {
 #if 1
    for (float x = 0; x < 3; x++) {
        printf("\n ----- %f\n", x);
-        printf("FSQRT %10.18f\n", sqrtf(x));
+        printf("FSQRT %10.18f\n", check_close1(sqrtf, x));
-        printf("FCOS %10.18f\n", cosf(x));
+        printf("FCOS %10.18f\n", check_close1(cosf, x));
-        printf("FSIN %10.18f\n", sinf(x));
+        printf("FSIN %10.18f\n", check_close1(sinf, x));
        float s, c;
        sincosf(x, &s, &c);
        printf("FSINCOS %10.18f %10.18f\n", s, c);
-        printf("FTAN %10.18f\n", tanf(x));
+        printf("FTAN %10.18f\n", check_close1(tanf, x));
-        printf("FATAN2 %10.18f\n", atan2f(x, 10));
+        printf("FATAN2 %10.18f\n", check_close2(atan2f, x, 10.f));
-        printf("FATAN2 %10.18f\n", atan2f(10, x));
+        printf("FATAN2 %10.18f\n", check_close2(atan2f, 10.f, x));
-        printf("FEXP %10.18f\n", expf(x));
+        printf("FEXP %10.18f\n", check_close1(expf, x));
-        printf("FLN %10.18f\n", logf(x));
+        printf("FLN %10.18f\n", check_close1(logf, x));
-        printf("POWF %10.18f\n", powf(x, x));
+        printf("POWF %10.18f\n", check_close2(powf, x, x));
-        printf("TRUNCF %10.18f\n", truncf(x));
+        printf("TRUNCF %10.18f\n", check_close1(truncf, x));
-        printf("LDEXPF %10.18f\n", ldexpf(x, x));
+        printf("LDEXPF %10.18f\n", check_close2(ldexpf, x, x));
-        printf("FMODF %10.18f\n", fmodf(x, 3.0f));
+        printf("FMODF %10.18f\n", check_close2(fmodf, x, 3.0f));
        // sincosf must agree exactly with the separate single precision calls.
        // Compare against sinf/cosf: the double precision sin/cos results are not
        // representable as float in general, so comparing with them always mismatched.
        sincosf(x, &s, &c);
        printf("SINCOS %10.18f %10.18f\n", s, c);
        if (s != sinf(x) || c != cosf(x)) {
            printf("SINCOS mismatch\n");
            fail = true;
        }
    }
    for (double x = 0; x < 3; x++) {
@ -390,18 +441,25 @@ int main() {
 //        sincosf(x, &s, &c);
        printf("FSINCOS %10.18f %10.18f\n", s, c);
-        for(int i=1; i<4; i++) {
+        for(int j=0;j<2;j++) {
-            char buf[4];
+            for (int i = 1; i < 4; i++) {
-            sprintf(buf, "%d", i);
+                char buf[4];
-            float f0 = -nanf(buf);
+                sprintf(buf, "%d", i);
-            double d0 = -nan(buf);
+                float f0 = -nanf(buf);
-            // hmm
+                double d0 = -nan(buf);
-            *(uint64_t *)&d0 |= i;
+                // hmm nanf/nan seem to ignore payload
-            *(uint32_t *)&f0 |= i;
+                *(uint64_t *) &d0 |= i;
-            float f = (float)d0;
+                *(uint32_t *) &f0 |= i;
-            double d = (double)f0;
+                if (j) {
-            printf("f2d %08"PRIx32" -> %g %016"PRIx64"\n", *(uint32_t*)&f0, d, *(uint64_t*)&d);
+                    // try without top bit set
-            printf("d2f %016"PRIx64" -> %f %08"PRIx32"\n", *(uint64_t*)&d0, f, *(uint32_t*)&f);
+                    *(uint64_t *) &d0 &= ~0x0008000000000000ull;
                    *(uint32_t *) &f0 &= ~0x00400000u;
                }
                float f = (float) d0;
                double d = (double) f0;
                printf("f2d %f %08"PRIx32" -> %g %016"PRIx64"\n", f0, *(uint32_t *) &f0, d, *(uint64_t *) &d);
                printf("d2f %f %016"PRIx64" -> %f %08"PRIx32"\n", d0, *(uint64_t *) &d0, f, *(uint32_t *) &f);
            }
        }
    }
 #endif
@ -413,17 +471,21 @@ int main() {
 //        }
        for (int32_t x = -1; x; x <<= 1) {
            printf("i %d->%f\n", x, (float) x);
            check1(__aeabi_i2f, x);
        }
        for (int32_t x = 1; x; x <<= 1) {
            printf("i %d->%f\n", x, (float) x);
            check1(__aeabi_i2f, x);
            y = x << 1;
        }
        for (int64_t x = 1; x; x <<= 1) {
            printf("i %lld->%f\n", x, (float) x);
            check1(__aeabi_l2f, x);
            y = x << 1;
        }
        for (int64_t x = -1; x; x <<= 1) {
            printf("i %lld->%f\n", x, (float) x);
            check1(__aeabi_l2f, x);
            y = x << 1;
        }
        printf("d %d->%f\n", y, (float) y);
@ -433,40 +495,63 @@ int main() {
        uint32_t y;
        for(uint32_t x = 1; x; x <<= 1) {
            printf("u %u->%f\n", x, (float)x);
            check1(__aeabi_ui2f, x);
            y = x << 1;
        }
        printf("u %u->%f\n", y, (float)y);
    }
    for(int64_t x = 1; x !=0; x <<= 1u) {
        printf("%lld->%f\n", x, (float)x);
        check1(__aeabi_l2f, x);
    }
    for(float x = -4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
        printf("f %f->%lld\n", x, (int64_t)x);
        if (x < INT64_MIN) {
            // seems like there is a bug in the gcc version!
            assert(__aeabi_f2lz(x) == INT64_MIN);
        } else {
            check1(__aeabi_f2lz, x);
        }
    }
    for(float x = 4294967296.f * 4294967296.f * 2.f; x>=0.5f; x/=2.f) {
        printf("f2i64 %f->%lld\n", x, (int64_t)x);
        if (x >= INT64_MAX) {
            // seems like there is a bug in the gcc version!
            assert(__aeabi_f2lz(x) == INT64_MAX);
        } else {
            check1(__aeabi_f2lz, x);
        }
    }
    for(float x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
        printf("d2i32 %f->%d\n", x, (int32_t)x);
        check1(__aeabi_f2iz, x);
    }
    for(float x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
-        printf("f %f->%lld\n", x, (int64_t)x);
+        printf("d2i32 %f->%d\n", x, (int32_t)x);
        check1(__aeabi_f2iz, x);
    }
-    for (double x = 1; x < 11; x += 2) {
+
-        double f = x * x;
+    for (float x = 1; x < 11; x += 2) {
-        double g = 1.0 / x;
+        float f = x * x;
-        printf("%g %10.18g %10.18g, %10.18g, %10.18g %10.18g\n", x, f, x + 0.37777777777777777777777777777,
+        float g = 1.0f / x;
-               x - 0.377777777777777777777777777777, g, 123456789.0 / x);
+        printf("%g %10.18g %10.18g, %10.18g, %10.18g %10.18g\n", x, f, x + 0.37777777777777777777777777777f,
               x - 0.377777777777777777777777777777f, g, 123456789.0f / x);
        check2(__aeabi_fmul, x, x);
        check2(__aeabi_fdiv, 1.0f, x);
    }
-    if (test_cfcmpeq() || test_cfcmple() ||
+
-        test_fcmpun() || test_cmple_gt() || test_cmplt_ge()) {
+    if (fail ||
        test_cfcmpeq() ||
        test_cfcmple() ||
        test_fcmpun() ||
        test_cmple_gt() ||
        test_cmplt_ge()) {
        printf("FAILED\n");
        return 1;
    } else {
        printf("PASSED\n");
        return 0;
    }
    if (test_cfcmpeq() || test_cfcmple() ||
        test_fcmpun() || test_cmple_gt() || test_cmplt_ge()) {
        printf("FAILED\n");
        return 1;
    } else {
        printf("PASSED\n");
        return 0;
    }
 #endif
 }