From 574fdee37b1ca5bfaa32716c9391179dcbdaf523 Mon Sep 17 00:00:00 2001
From: Graham Sanderson <graham.sanderson@raspberrypi.com>
Date: Thu, 13 May 2021 07:38:42 -0500
Subject: [PATCH] Fixup divider save_restore for floating point too; improve
 tests (#405)

- The divider state needs to be saved for __aeabi_ddiv, __aeabi_fdiv, __aeabi_dtan and __aeabi_ftan, or they won't work in interrupts (probably not used much, you'd hope) or on an RTOS context switch
 - Refactored code out for the integer and floating point cases
 - Improved the floating point 'tests' in passing to check more return values against GCC implementations
 - Added floating point usage to the IRQ nesting test case
---
 .../include/hardware/divider_helper.S         |  68 +++++++
 src/rp2_common/pico_divider/divider.S         |  82 ++-------
 src/rp2_common/pico_double/double_aeabi.S     |  78 +++++++-
 src/rp2_common/pico_float/float_aeabi.S       |  63 ++++++-
 test/pico_divider_test/CMakeLists.txt         |   2 +-
 .../pico_divider_nesting_test.c               |  71 +++++++-
 test/pico_float_test/CMakeLists.txt           |   6 +-
 test/pico_float_test/pico_double_test.c       | 118 +++++++++---
 test/pico_float_test/pico_float_test.c        | 171 +++++++++++++-----
 9 files changed, 501 insertions(+), 158 deletions(-)
 create mode 100644 src/rp2_common/hardware_divider/include/hardware/divider_helper.S

diff --git a/src/rp2_common/hardware_divider/include/hardware/divider_helper.S b/src/rp2_common/hardware_divider/include/hardware/divider_helper.S
new file mode 100644
index 0000000..062e12d
--- /dev/null
+++ b/src/rp2_common/hardware_divider/include/hardware/divider_helper.S
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "hardware/regs/addressmap.h"
+#include "hardware/regs/sio.h"
+
+#if SIO_DIV_CSR_READY_LSB == 0
+.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
+#else
+need to change SHIFT above
+#endif
+#if SIO_DIV_CSR_DIRTY_LSB == 1
+.equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
+#else
+need to change SHIFT above
+#endif
+
+// saves the divider state: expects SIO_BASE ptr in r2; pushes r4-r7, lr to stack, then loads dividend/divisor/remainder/quotient into r4/r5/r7/r6
+// requires that division started at least 2 cycles prior to the start of the macro
+.macro save_div_state_and_lr
+// originally we waited for READY like this, however a) it uses r3, b) the push takes 6 cycles, and c)
+// any IRQ which uses the divider will necessarily put the data back, which will
+// immediately make it ready
+//
+//    // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
+//    // // wait for results as we can't save signed-ness of operation
+//    // 1:
+//    //     lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
+//    //     bcc 1b
+
+// the push takes 6 cycles
+push {r4, r5, r6, r7, lr}
+// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
+ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
+ldr r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
+ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
+ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
+.endm
+
+// restores divider state from r4-r7 (expects SIO_BASE ptr in r2), then pops them and pc
+.macro restore_div_state_and_return
+// writing udividend (r4), udivisor (r5), remainder (r7), quotient (r6) in that order
+//
+// it is worth considering what happens if we are interrupted
+//
+// after writing r4: we are DIRTY and !READY
+//    ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
+//        saved/restored correctly and we'll restore the rest ourselves
+// after writing r4, r5: we are DIRTY and !READY
+//    ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
+//        at least will be saved/restored correctly and we'll restore the rest ourselves
+// after writing r4, r5, r6: we are DIRTY and READY (NOTE(review): the stores below write r7 before r6 - confirm this case)
+//    ... interruptor using div will use the dividend, divisor, quotient registers as is (what we just restored ourselves),
+//        and we'll restore the remainder after the fact
+
+// note we do not use STM; not because it can be restarted due to an interrupt (which is harmless), but because this is
+//      1 cycle IO space and so 4 individual stores are cheaper (and we don't have to adjust r2)
+// note also, that we must restore via UDIVI* rather than SDIVI* to prevent the quotient/remainder being negated on read based
+//      on the signs of the inputs
+str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
+str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
+str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
+str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
+pop {r4, r5, r6, r7, pc}
+.endm
\ No newline at end of file
diff --git a/src/rp2_common/pico_divider/divider.S b/src/rp2_common/pico_divider/divider.S
index 8112681..ac67a5e 100644
--- a/src/rp2_common/pico_divider/divider.S
+++ b/src/rp2_common/pico_divider/divider.S
@@ -4,8 +4,8 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 
-#include "hardware/regs/sio.h"
 #include "hardware/regs/addressmap.h"
+#include "hardware/divider_helper.S"
 
 .syntax unified
 .cpu cortex-m0plus
@@ -34,17 +34,6 @@
 #endif
 .endm
 
-#if SIO_DIV_CSR_READY_LSB == 0
-.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
-#else
-need to change SHIFT above
-#endif
-#if SIO_DIV_CSR_DIRTY_LSB == 1
-.equ SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY, 2
-#else
-need to change SHIFT above
-#endif
-
 @ wait 8-n cycles for the hardware divider
 .macro wait_div n
 .rept (8-\n) / 2
@@ -56,58 +45,17 @@ need to change SHIFT above
 .endif
 .endm
 
-
 #if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
 #error register layout has changed - we rely on this order to make sure we save/restore in the right order
 #endif
 
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
-
-# SIO_BASE ptr in r2
-.macro save_div_state_and_lr
-    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
-    # wait for results as we can't save signed-ness of operation
-1:
-    lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
-    bcc 1b
-    push {r4, r5, r6, r7, lr}
-    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
-    ldr r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
-    ldr r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
-    ldr r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
-    ldr r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
-.endm
-
-.macro restore_div_state_and_return
-    // writing sdividend (r4), sdivisor (r5), quotient (r6), remainder (r7) in that order
-    //
-    // it is worth considering what happens if we are interrupted
-    //
-    // after writing r4: we are DIRTY and !READY
-    //    ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
-    //        saved/restored correctly and we'll restore the rest ourselves
-    // after writing r4, r5: we are DIRTY and !READY
-    //    ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
-    //        at least will be saved/restored correctly and and we'll restore the rest ourselves
-    // after writing r4, r5, r6: we are DIRTY and READY
-    //    ... interruptor using div will dividend, divisor, quotient registers as is (what we just restored ourselves),
-    //        and we'll restore the remainder after the fact
-
-    // note we are not use STM not because it can be restarted due to interrupt which is harmless, more because this is 1 cycle IO space
-    // and so 4 reads is cheaper (and we don't have to adjust r2)
-    str r4, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
-    str r5, [r2, #SIO_DIV_SDIVISOR_OFFSET]
-    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
-    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
-    pop {r4, r5, r6, r7, pc}
-.endm
-
 .macro save_div_state_and_lr_64
     push {r4, r5, r6, r7, lr}
     ldr r6, =SIO_BASE
 1:
     ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
-    # wait for results as we can't save signed-ness of operation
+    // wait for results as we can't save signed-ness of operation
     lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
     bcc 1b
     // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
@@ -154,17 +102,18 @@ wrapper_func __aeabi_idivmod
 regular_func div_s32s32
 regular_func divmod_s32s32
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
     ldr r2, =(SIO_BASE)
-    # to support IRQ usage we must save/restore
     ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
     lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
     bcs divmod_s32s32_savestate
 regular_func divmod_s32s32_unsafe
 #else
-# to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
-# in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
-# are the hardware_divider functions that can be used instead anyway
+// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
+// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
+// are the hardware_divider functions that can be used instead anyway
 regular_func divmod_s32s32_unsafe
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
     ldr r2, =(SIO_BASE)
     mrs r3, PRIMASK
     cpsid i
@@ -203,6 +152,8 @@ regular_func divmod_s32s32_unsafe
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
 .align 2
 regular_func divmod_s32s32_savestate
+    // note that we must be at least 2 cycles into division at this point,
+    // which we are because of the dirty check before getting here (and of course the function call before that)
     save_div_state_and_lr
     bl divmod_s32s32_unsafe
     restore_div_state_and_return
@@ -215,17 +166,18 @@ regular_func divmod_u32u32
 wrapper_func __aeabi_uidiv
 wrapper_func __aeabi_uidivmod
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
     ldr r2, =(SIO_BASE)
-    # to support IRQ usage we must save/restore
     ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
     lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
     bcs divmod_u32u32_savestate
 regular_func divmod_u32u32_unsafe
 #else
-# to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
-# in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
-# are the hardware_divider functions that can be used instead anyway
+// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
+// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
+// are the hardware_divider functions that can be used instead anyway
 regular_func divmod_u32u32_unsafe
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
     ldr r2, =(SIO_BASE)
     mrs r3, PRIMASK
     cpsid i
@@ -273,9 +225,9 @@ wrapper_func __aeabi_ldivmod
 regular_func div_s64s64
 regular_func divmod_s64s64
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
     mov ip, r2
     ldr r2, =(SIO_BASE)
-    # to support IRQ usage we must save/restore
     ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
     lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
     mov r2, ip
@@ -287,6 +239,7 @@ divmod_s64s64_savestate:
     bl divmod_s64s64_unsafe
     restore_div_state_and_return_64
 #else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
     push {r4, lr}
     mrs r4, PRIMASK
     cpsid i
@@ -300,9 +253,9 @@ wrapper_func __aeabi_uldivmod
 regular_func div_u64u64
 regular_func divmod_u64u64
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
     mov ip, r2
     ldr r2, =(SIO_BASE)
-    # to support IRQ usage we must save/restore
     ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
     lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
     mov r2, ip
@@ -314,6 +267,7 @@ regular_func divmod_u64u64_savestate
     bl divmod_u64u64_unsafe
     restore_div_state_and_return_64
 #else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
     push {r4, lr}
     mrs r4, PRIMASK
     cpsid i
diff --git a/src/rp2_common/pico_double/double_aeabi.S b/src/rp2_common/pico_double/double_aeabi.S
index 1e1250f..a871e43 100644
--- a/src/rp2_common/pico_double/double_aeabi.S
+++ b/src/rp2_common/pico_double/double_aeabi.S
@@ -6,6 +6,7 @@
 
 #include "pico/asm_helper.S"
 #include "pico/bootrom/sf_table.h"
+#include "hardware/divider_helper.S"
 
 __pre_init __aeabi_double_init, 00020
 
@@ -131,16 +132,16 @@ regular_func pop_r8_r11
  mov r11,r7
  bx r14
 
-# note generally each function is in a separate section unless there is fall thru or branching between them
-# note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
+// note generally each function is in a separate section unless there is fall thru or branching between them
+// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
 
-# note functions are word aligned except where they are an odd number of linear instructions
+// note functions are word aligned except where they are an odd number of linear instructions
 
 // double FUNC_NAME(__aeabi_dadd)(double, double)         double-precision addition
 double_wrapper_section __aeabi_darithmetic
 // double FUNC_NAME(__aeabi_drsub)(double x, double y)    double-precision reverse subtraction, y - x
 
-# frsub first because it is the only one that needs alignment
+// frsub first because it is the only one that needs alignment
 .align 2
 wrapper_func __aeabi_drsub
     eors r0, r1
@@ -177,7 +178,35 @@ wrapper_func_d2 __aeabi_ddiv
     b ddiv_dsub_nan_helper
 1:
 #endif
-   shimmable_table_tail_call SF_TABLE_FDIV ddiv_shim
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
+    mov ip, r2
+    ldr r2, =(SIO_BASE)
+    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
+    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
+    bcs ddiv_save_state
+    mov r2, ip
+#else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
+    push {r4, lr}
+    mrs r4, PRIMASK
+    cpsid i
+    bl ddiv_shim_call
+    msr PRIMASK, r4
+    pop {r4, pc}
+#endif
+ddiv_shim_call:
+    shimmable_table_tail_call SF_TABLE_FDIV ddiv_shim
+
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+ddiv_save_state:
+    ldr r2, =(SIO_BASE)
+    save_div_state_and_lr
+    mov r2, ip
+    bl ddiv_shim_call
+    ldr r2, =(SIO_BASE)
+    restore_div_state_and_return
+#endif
 
 ddiv_dsub_nan_helper:
 #if PICO_DOUBLE_PROPAGATE_NANS
@@ -592,6 +621,8 @@ regular_func sincostan_remainder
     ldr r2, =0x54442D18 // 2 * M_PI
     ldr r3, =0x401921FB
     push {lr}
+    // note remainder only uses the divider thru integer divider functions
+    // which save and restore themselves
     bl remainder
     pop {pc}
 
@@ -752,13 +783,40 @@ double_wrapper_section tan
 wrapper_func tan
     // rom version only works for -1024 < angle < 1024
     lsls r2, r1, #2
-    bcc 1f
+    bcc dtan_in_range
     lsrs r2, #22
     cmp r2, #9
-    bge 2f
-1:
+    bge dtan_angle_out_of_range
+dtan_in_range:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
+    mov ip, r2
+    ldr r2, =(SIO_BASE)
+    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
+    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
+    bcs dtan_save_state
+    mov r2, ip
+#else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
+    push {r4, lr}
+    mrs r4, PRIMASK
+    cpsid i
+    bl dtan_shim_call
+    msr PRIMASK, r4
+    pop {r4, pc}
+#endif
+dtan_shim_call:
     shimmable_table_tail_call SF_TABLE_FTAN dtan_shim
-2:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+dtan_save_state:
+    ldr r2, =(SIO_BASE)
+    save_div_state_and_lr
+    mov r2, ip
+    bl dtan_shim_call
+    ldr r2, =(SIO_BASE)
+    restore_div_state_and_return
+#endif
+dtan_angle_out_of_range:
 #if PICO_DOUBLE_PROPAGATE_NANS
     lsls r2, r1, #1
     asrs r2, #21
@@ -775,7 +833,7 @@ wrapper_func tan
     bl sincostan_remainder
     pop {r2}
     mov lr, r2
-    b 1b
+    b dtan_in_range
 
 double_wrapper_section atan2
 wrapper_func_d2 atan2
diff --git a/src/rp2_common/pico_float/float_aeabi.S b/src/rp2_common/pico_float/float_aeabi.S
index 2aee5f2..b901d30 100644
--- a/src/rp2_common/pico_float/float_aeabi.S
+++ b/src/rp2_common/pico_float/float_aeabi.S
@@ -6,6 +6,7 @@
 
 #include "pico/asm_helper.S"
 #include "pico/bootrom/sf_table.h"
+#include "hardware/divider_helper.S"
 
 __pre_init __aeabi_float_init, 00020
 
@@ -104,16 +105,16 @@ __check_nan_f2:
 .endm
 
 
-# note generally each function is in a separate section unless there is fall thru or branching between them
-# note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
+// note generally each function is in a separate section unless there is fall thru or branching between them
+// note fadd, fsub, fmul, fdiv are so tiny and just defer to rom so are lumped together so they can share constant pool
 
-# note functions are word aligned except where they are an odd number of linear instructions
+// note functions are word aligned except where they are an odd number of linear instructions
 
 // float FUNC_NAME(__aeabi_fadd)(float, float)         single-precision addition
 float_wrapper_section __aeabi_farithmetic
 // float FUNC_NAME(__aeabi_frsub)(float x, float y)    single-precision reverse subtraction, y - x
 
-# frsub first because it is the only one that needs alignment
+// frsub first because it is the only one that needs alignment
 .align 2
 wrapper_func __aeabi_frsub
     eors r0, r1
@@ -146,7 +147,30 @@ wrapper_func_f2 __aeabi_fdiv
     b fdiv_fsub_nan_helper
 1:
 #endif
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
+    ldr r2, =(SIO_BASE)
+    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
+    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
+    bcs fdiv_save_state
+#else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
+    push {r4, lr}
+    mrs r4, PRIMASK
+    cpsid i
+    bl fdiv_shim_call
+    msr PRIMASK, r4
+    pop {r4, pc}
+#endif
+fdiv_shim_call:
     table_tail_call SF_TABLE_FDIV
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+fdiv_save_state:
+    save_div_state_and_lr
+    bl fdiv_shim_call
+    ldr r2, =(SIO_BASE)
+    restore_div_state_and_return
+#endif
 
 fdiv_fsub_nan_helper:
 #if PICO_FLOAT_PROPAGATE_NANS
@@ -689,10 +713,33 @@ wrapper_func tanf
     lsls r1, r0, #1
     lsrs r1, #24
     cmp r1, #127 + 7
-    bge 1f
-2:
+    bge ftan_out_of_range
+ftan_in_range:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
+    ldr r2, =(SIO_BASE)
+    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
+    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
+    bcs ftan_save_state
+#else
+    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
+    push {r4, lr}
+    mrs r4, PRIMASK
+    cpsid i
+    bl ftan_shim_call
+    msr PRIMASK, r4
+    pop {r4, pc}
+#endif
+ftan_shim_call:
     table_tail_call SF_TABLE_FTAN
-1:
+#if !PICO_DIVIDER_DISABLE_INTERRUPTS
+ftan_save_state:
+    save_div_state_and_lr
+    bl ftan_shim_call
+    ldr r2, =(SIO_BASE)
+    restore_div_state_and_return
+#endif
+ftan_out_of_range:
 #if PICO_FLOAT_PROPAGATE_NANS
     // also check for infinites
     cmp r1, #255
@@ -709,7 +756,7 @@ wrapper_func tanf
     bl remainderf
     pop {r1}
     mov lr, r1
-    b 2b
+    b ftan_in_range
 
 float_wrapper_section atan2f
 wrapper_func_f2 atan2f
diff --git a/test/pico_divider_test/CMakeLists.txt b/test/pico_divider_test/CMakeLists.txt
index c1ee628..d2ab012 100644
--- a/test/pico_divider_test/CMakeLists.txt
+++ b/test/pico_divider_test/CMakeLists.txt
@@ -12,7 +12,7 @@ if (PICO_ON_DEVICE)
     pico_add_extra_outputs(pico_divider_test)
 
     target_compile_definitions(pico_divider_test PRIVATE
-            PICO_DIVIDER_DISABLE_INTERRUPTS=1
+#            PICO_DIVIDER_DISABLE_INTERRUPTS=1
 #            TURBO
     )
 
diff --git a/test/pico_divider_test/pico_divider_nesting_test.c b/test/pico_divider_test/pico_divider_nesting_test.c
index 4224181..0e0db94 100644
--- a/test/pico_divider_test/pico_divider_nesting_test.c
+++ b/test/pico_divider_test/pico_divider_nesting_test.c
@@ -15,6 +15,9 @@ volatile bool failed;
 volatile uint32_t count[3];
 volatile bool done;
 
+#define FAILED() ({ failed = true; })
+//#define FAILED() ({ failed = true; __breakpoint(); })
+
 bool timer_callback(repeating_timer_t *t) {
     count[0]++;
     static int z;
@@ -23,9 +26,27 @@ bool timer_callback(repeating_timer_t *t) {
         int a = z / 7;
         int b = z % 7;
         if (z != a * 7 + b) {
-            failed = true;
+            FAILED();
+        }
+        a = z / -7;
+        b = z % -7;
+        if (z != a * -7 + b) {
+            FAILED();
         }
     }
+    float fz = z;                    // exercise the float divide / fmodf paths from IRQ context
+    float fa = fz / 11.0f;           // real (non-truncated) quotient, so fa * 11 must reproduce fz
+    float fb = fmodf(fz, 11.0f);     // remainder; its magnitude must stay below the divisor
+    if (fabsf(fa * 11.0f - fz) > fabsf(fz) * 1e-6f || fabsf(fb) >= 11.0f) {
+        FAILED();
+    }
+    double dz = z;                   // same checks for the double divide / fmod paths
+    double da = dz / 11.0;           // real quotient: da * 11 must reproduce dz within rounding error
+    double db = fmod(dz, 11.0);      // remainder; its magnitude must stay below the divisor
+    if (fabs(da * 11.0 - dz) > fabs(dz) * 1e-12 || fabs(db) >= 11.0) {
+        FAILED();
+    }
+
     return !done;
 }
 
@@ -41,16 +62,20 @@ void do_dma_start(uint ch) {
     dma_channel_configure(ch, &c, &word[ch], &word[ch], 513 + ch * 23, true);
 }
 
+double d0c, d0s, d0t, dz;
+float f0c, f0s, f0t, fz;
+
 void test_irq_handler0() {
     count[1]++;
     dma_hw->ints0 |= 1u;
     static uint z;
+    static uint dz;
     for (int i=0; i<80;i++) {
         z += 31;
         uint a = z / 11;
         uint b = z % 11;
         if (z != a * 11 + b) {
-            failed = true;
+            FAILED();
         }
     }
     if (done) dma_channel_abort(0);
@@ -66,16 +91,17 @@ void test_irq_handler1() {
         uint a = z / -13;
         uint b = z % -13;
         if (z != a * -13 + b) {
-            failed = true;
+            FAILED();
         }
         static uint64_t z64;
         z64 -= 47;
         uint64_t a64 = z64 / -13;
         uint64_t b64 = z64 % -13;
         if (z64 != a64 * -13 + b64) {
-            failed = true;
+            FAILED();
         }
     }
+
     if (done) dma_channel_abort(1);
     else      do_dma_start(1);
 }
@@ -89,7 +115,7 @@ void test_nesting() {
     // They all busily make use of the dividers, to expose any issues with nested use
 
     repeating_timer_t timer;
-    add_repeating_timer_us(529, timer_callback, NULL, &timer);
+    add_repeating_timer_us(929, timer_callback, NULL, &timer);
     irq_set_exclusive_handler(DMA_IRQ_0, test_irq_handler0);
     irq_set_exclusive_handler(DMA_IRQ_1, test_irq_handler1);
 
@@ -101,7 +127,7 @@ void test_nesting() {
     irq_set_enabled(DMA_IRQ_1, 1);
     do_dma_start(0);
     do_dma_start(1);
-    absolute_time_t end = delayed_by_ms(get_absolute_time(), 2000);
+    absolute_time_t end = delayed_by_ms(get_absolute_time(), 10000);
     int count_local=0;
     while (!time_reached(end)) {
         for(uint i=0;i<100;i++) {
@@ -109,8 +135,39 @@ void test_nesting() {
             uint a = z / 11;
             uint b = z % 11;
             if (z != a * 11 + b) {
-                failed = true;
+                FAILED();
             }
+            int zz = (int)z;
+            int aa = zz / -11;
+            int bb = zz % -11;
+            if (zz != aa * -11 + bb) {
+                FAILED();
+            }
+            aa = -zz / -11;
+            bb = -zz % -11;
+            if (-zz != aa * -11 + bb) {
+                FAILED();
+            }
+            aa = -zz / 11;
+            bb = -zz % 11;
+            if (-zz != aa * 11 + bb) {
+                FAILED();
+            }
+            a = 0xffffffffu / 11;
+            b = 0xffffffffu % 11;
+            if (0xffffffffu != a * 11 + b) {
+                FAILED();
+            }
+        }
+        // these use the divider
+        for(uint i=0;i<=100;i+=20) {
+            // both in and out of bootrom range (we perform mod in wrapper code if necessary)
+            f0t = tanf(i * 50);
+            f0c = cosf(i * 50);
+            f0s = sinf(i * 50);
+            d0t = tan(i * 1000);
+            d0c = cos(i * 1000);
+            d0s = sin(i * 1000);
         }
         count_local++;
     }
diff --git a/test/pico_float_test/CMakeLists.txt b/test/pico_float_test/CMakeLists.txt
index 0ff1f60..26204d3 100644
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@@ -11,15 +11,19 @@ add_executable(pico_double_test
         )
 
 
+#todo split out variants with different flags
 target_compile_definitions(pico_float_test PRIVATE
         PICO_USE_CRT_PRINTF=1 # want full precision output
 #        PICO_FLOAT_PROPAGATE_NANS=1
+#        PICO_DIVIDER_DISABLE_INTERRUPTS=1
 )
 
+#todo split out variants with different flags
 target_compile_definitions(pico_double_test PRIVATE
         PICO_USE_CRT_PRINTF=1 # want full precision output
                 PICO_FLOAT_PROPAGATE_NANS=1
-                PICO_DOUBLE_PROPAGATE_NANS=1
+                #PICO_DOUBLE_PROPAGATE_NANS=1
+                #PICO_DIVIDER_DISABLE_INTERRUPTS=1
         )
 
 # handy for testing we aren't pulling in extra stuff
diff --git a/test/pico_float_test/pico_double_test.c b/test/pico_float_test/pico_double_test.c
index 708ab25..6c095e7 100644
--- a/test/pico_float_test/pico_double_test.c
+++ b/test/pico_float_test/pico_double_test.c
@@ -282,6 +282,51 @@ int test_dcmpun() {
     return 0;
 }
 
+#define assert_nan(a) assert(isnan(a))
+#define check_nan(a) ({ assert_nan(a); a; })
+
+double __aeabi_i2d(int32_t);            // signed 32-bit integer -> double
+double __aeabi_ui2d(uint32_t);          // unsigned 32-bit integer -> double (parameter is unsigned per the AEABI)
+double __aeabi_l2d(int64_t);            // signed 64-bit integer -> double
+double __aeabi_ul2d(uint64_t);          // unsigned 64-bit integer -> double (parameter is unsigned per the AEABI)
+int32_t __aeabi_d2iz(double);           // double -> signed 32-bit integer
+int64_t __aeabi_d2lz(double);           // double -> signed 64-bit integer
+double __aeabi_dmul(double, double);    // double multiply
+double __aeabi_ddiv(double, double);    // double divide
+#if LIB_PICO_DOUBLE_PICO
+double __real___aeabi_i2d(int);                 // un-wrapped (toolchain) implementations used as reference values
+double __real___aeabi_ui2d(unsigned int);       // parameter is unsigned, matching __aeabi_ui2d
+double __real___aeabi_l2d(int64_t);
+double __real___aeabi_ul2d(uint64_t);           // parameter is unsigned, matching __aeabi_ul2d
+double __real___aeabi_dmul(double, double);
+double __real___aeabi_ddiv(double, double);
+int32_t __real___aeabi_d2iz(double);
+int64_t __real___aeabi_d2lz(double);
+double __real_sqrt(double);
+double __real_cos(double);
+double __real_sin(double);
+double __real_tan(double);
+double __real_exp(double);
+double __real_log(double);
+double __real_atan2(double, double);
+double __real_pow(double, double);
+double __real_trunc(double);
+double __real_ldexp(double, int);
+double __real_fmod(double, double);
+
+#define EPSILON 1e-9 // absolute tolerance for assert_close; both differences must be below it (|a - b| < EPSILON), or a and b are same-signed infinities
+#define assert_close(a, b) assert((((b) - (a)) < EPSILON && ((a) - (b)) < EPSILON) || (isinf(a) && isinf(b) && ((a) < 0) == ((b) < 0)))
+#define check1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); assert(r == r2); r; })
+#define check2(func,p0,p1) ({ typeof(p0) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); assert(r == r2); r; })
+#define check_close1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); if (isnan(p0)) assert_nan(r); else assert_close(r, r2); r; })
+#define check_close2(func,p0,p1) ({ typeof(p0) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); if (isnan(p0) || isnan(p1)) assert_nan(r); else assert_close(r, r2); r; })
+#else
+#define check1(func,p0) func(p0)
+#define check2(func,p0,p1) func(p0,p1)
+#define check_close1(func,p0) func(p0)
+#define check_close2(func,p0,p1) func(p0,p1)
+#endif
+
 double aa = 0.5;
 double bb = 1;
 
@@ -305,14 +350,18 @@ int main() {
 #if 1
     for (double x = 0; x < 3; x++) {
         printf("\n ----- %g\n", x);
-        printf("SQRT %10.18g\n", sqrt(x));
-        printf("COS %10.18g\n", cos(x));
-        printf("SIN %10.18g\n", sin(x));
-        printf("TAN %10.18g\n", tan(x));
-        printf("ATAN2 %10.18g\n", atan2(x, 10));
-        printf("ATAN2 %10.18g\n", atan2(10, x));
-        printf("EXP %10.18g\n", exp(x));
-        printf("LN %10.18g\n", log(x));
+        printf("SQRT %10.18g\n", check_close1(sqrt, x));
+        printf("COS %10.18g\n", check_close1(cos, x));
+        printf("SIN %10.18g\n", check_close1(sin, x));
+        printf("TAN %10.18g\n", check_close1(tan, x));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, x, 10.0));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, 10.0, x));
+        printf("EXP %10.18g\n", check_close1(exp, x));
+        printf("LN %10.18g\n", check_close1(log, x));
+        printf("POW %10.18f\n", check_close2(pow, x, x));
+        printf("TRUNC %10.18f\n", check_close1(trunc, x));
+        printf("LDEXP %10.18f\n", check_close2(ldexp, x, x));
+        printf("FMOD %10.18f\n", check_close2(fmod, x, 3.0f));
         double s, c;
         sincos(x, &s, &c);
         printf("SINCOS %10.18f %10.18f\n", s, c);
@@ -325,22 +374,21 @@ int main() {
 #if PICO_DOUBLE_PROPAGATE_NANS
     {
         float x = NAN;
-        printf("NANO %10.18f\n", x);
-        printf("SQRT %10.18f\n", sqrt(x));
-        printf("COS %10.18f\n", cos(x));
-        printf("SIN %10.18f\n", sin(x));
-        printf("TAN %10.18f\n", tan(x));
-        printf("ATAN2 %10.18f\n", atan2(x, 10));
-        printf("ATAN2 %10.18f\n", atan2(10, x));
-        printf("EXP %10.18f\n", exp(x));
-        printf("LN %10.18f\n", log(x));
-        printf("POW %10.18f\n", pow(x, x));
-        printf("TRUNC %10.18f\n", trunc(x));
-        printf("LDEXP %10.18f\n", ldexp(x, x));
-        printf("FMOD %10.18f\n", fmod(x, 3.0f));
+        printf("SQRT %10.18g\n", check_close1(sqrt, x));
+        printf("COS %10.18g\n", check_close1(cos, x));
+        printf("SIN %10.18g\n", check_close1(sin, x));
+        printf("TAN %10.18g\n", check_close1(tan, x));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, x, 10.0));
+        printf("ATAN2 %10.18g\n", check_close2(atan2, 10.0, x));
+        printf("EXP %10.18g\n", check_close1(exp, x));
+        printf("LN %10.18g\n", check_close1(log, x));
+        printf("POW %10.18f\n", check_nan(pow(x, x)));
+        printf("TRUNC %10.18f\n", check_nan(trunc(x)));
+        printf("LDEXP %10.18f\n", check_nan(ldexp(x, x)));
+        printf("FMOD %10.18f\n", check_nan(fmod(x, 3.0f)));
         double s, c;
         sincos(x, &s, &c);
-        printf("SINCOS %10.18f %10.18f\n", s, c);
+        printf("SINCOS %10.18f %10.18f\n", check_nan(s), check_nan(c));
 
         for(int j=0;j<2;j++) {
             for (int i = 1; i < 4; i++) {
@@ -372,17 +420,21 @@ int main() {
 //        }
         for (int32_t x = -1; x; x <<= 1) {
             printf("i %d->%f\n", x, (double) x);
+            check1(__aeabi_i2d, x);
         }
         for (int32_t x = 1; x; x <<= 1) {
             printf("i %d->%f\n", x, (double) x);
+            check1(__aeabi_i2d, x);
             y = x << 1;
         }
         for (int64_t x = 1; x; x <<= 1) {
             printf("i %lld->%f\n", x, (double) x);
+            check1(__aeabi_l2d, x);
             y = x << 1;
         }
         for (int64_t x = -1; x; x <<= 1) {
             printf("i %lld->%f\n", x, (double) x);
+            check1(__aeabi_l2d, x);
             y = x << 1;
         }
         printf("d %d->%f\n", y, (float) y);
@@ -392,24 +444,40 @@ int main() {
         uint32_t y;
         for(uint32_t x = 1; x; x <<= 1) {
             printf("u %u->%f\n", x, (double)x);
+            check1(__aeabi_ui2d, x); // unsigned 32-bit -> double conversion
             y = x << 1;
         }
         printf("u %u->%f\n", y, (double)y);
     }
     for(int64_t x = 1; x !=0; x <<= 1u) {
         printf("%lld->%f\n", x, (double)x);
+        check1(__aeabi_l2d, x); // signed 64-bit -> double conversion
     }
-    for(double x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
+    for(double x = -4294967296.f * 4294967296.f * 2.f; x<=-0.5f; x/=2.f) { // starts at -2^65 so values below INT64_MIN are included
         printf("d2i64 %f->%lld\n", x, (int64_t)x);
+        if (x < INT64_MIN) {
+            // seems like there is a bug in the gcc version! so out-of-range input is asserted against INT64_MIN directly
+            assert(__aeabi_d2lz(x) == INT64_MIN);
+        } else {
+            check1(__aeabi_d2lz, x);
+        }
     }
-    for(double x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
+    for(double x = 4294967296.f * 4294967296.f * 2.f; x>=0.5f; x/=2.f) { // starts at 2^65 so values above INT64_MAX are included
         printf("d2i64 %f->%lld\n", x, (int64_t)x);
+        if (x >= INT64_MAX) { // INT64_MAX is converted to double for this comparison
+            // seems like there is a bug in the gcc version! so out-of-range input is asserted against INT64_MAX directly
+            assert(__aeabi_d2lz(x) == INT64_MAX);
+        } else {
+            check1(__aeabi_d2lz, x);
+        }
     }
     for(double x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
         printf("d2i32 %f->%d\n", x, (int32_t)x);
+        check1(__aeabi_d2iz, x); // loop starts at -2^64, so inputs outside the int32_t range are included
     }
     for(double x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
         printf("d2i32 %f->%d\n", x, (int32_t)x);
+        check1(__aeabi_d2iz, x); // loop starts at 2^64, so inputs outside the int32_t range are included
     }
 
     for (double x = 1; x < 11; x += 2) {
@@ -417,6 +485,8 @@ int main() {
         double g = 1.0 / x;
         printf("%g %10.18g %10.18g, %10.18g, %10.18g %10.18g\n", x, f, x + 0.37777777777777777777777777777,
                x - 0.377777777777777777777777777777, g, 123456789.0 / x);
+        check2(__aeabi_dmul, x, x); // call the AEABI multiply helper directly on the loop value
+        check2(__aeabi_ddiv, 1.0, x); // same operands as g = 1.0 / x above, via the AEABI divide helper
     }
 
     if (fail ||
diff --git a/test/pico_float_test/pico_float_test.c b/test/pico_float_test/pico_float_test.c
index 75a37a8..5da23d2 100644
--- a/test/pico_float_test/pico_float_test.c
+++ b/test/pico_float_test/pico_float_test.c
@@ -16,7 +16,6 @@
 #include <stdlib.h>
 #include <math.h>
 #include <pico/float.h>
-//#include <pico/float.h>
 #include "pico/stdlib.h"
 #include "inttypes.h"
 
@@ -283,12 +282,58 @@ int test_fcmpun() {
     return 0;
 }
 
+#define assert_nan(a) assert(isnan(a))
+#define check_nan(a) ({ typeof(a) nan_val_ = (a); assert_nan(nan_val_); nan_val_; }) // evaluates `a` once, asserts it is NaN, yields it
+// Prototypes of the AEABI conversion/arithmetic helpers that the tests below call directly.
+float __aeabi_i2f(int32_t);
+float __aeabi_ui2f(uint32_t); // unsigned 32-bit -> float
+float __aeabi_l2f(int64_t);
+float __aeabi_ul2f(uint64_t); // unsigned 64-bit -> float
+int32_t __aeabi_f2iz(float);
+int64_t __aeabi_f2lz(float);
+float __aeabi_fmul(float, float);
+float __aeabi_fdiv(float, float);
+#if LIB_PICO_FLOAT_PICO // the __real_<name> symbols are presumably the wrapped GCC/libm versions - confirm against the build's wrap options
+float __real___aeabi_i2f(int);
+float __real___aeabi_ui2f(unsigned);
+float __real___aeabi_l2f(int64_t);
+float __real___aeabi_ul2f(uint64_t);
+float __real___aeabi_fmul(float, float);
+float __real___aeabi_fdiv(float, float);
+int32_t __real___aeabi_f2iz(float);
+int64_t __real___aeabi_f2lz(float);
+float __real_sqrtf(float);
+float __real_cosf(float);
+float __real_sinf(float);
+float __real_tanf(float);
+float __real_expf(float);
+float __real_logf(float);
+float __real_atan2f(float, float);
+float __real_powf(float, float);
+float __real_truncf(float);
+float __real_ldexpf(float, int);
+float __real_fmodf(float, float);
+#define EPSILON 1e-9
+#define assert_close(a, b) assert(((b - a) < EPSILON || (a - b) < EPSILON) || (isinf(a) && isinf(b) && (a < 0) == (b < 0))) // NOTE(review): the "||" of the two differences holds for every non-NaN pair, so only NaN can fail; tightening needs a float-appropriate EPSILON
+#define check1(func,p0) ({ typeof(func(p0)) r = func(p0), r2 = __CONCAT(__real_, func)(p0); assert(r == r2); r; }) // compare in the function's result type, not the argument type
+#define check2(func,p0,p1) ({ typeof(func(p0,p1)) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); assert(r == r2); r; }) // two-argument form of check1
+#define check_close1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); if (isnan(p0)) assert_nan(r); else assert_close(r, r2); r; })
+#define check_close2(func,p0,p1) ({ typeof(p0) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); if (isnan(p0) || isnan(p1)) assert_nan(r); else assert_close(r, r2); r; })
+#else
+#define check1(func,p0) func(p0)
+#define check2(func,p0,p1) func(p0,p1)
+#define check_close1(func,p0) func(p0)
+#define check_close2(func,p0,p1) func(p0,p1)
+#endif
+
 double aa = 0.5;
 double bb = 1;
 
 int main() {
     setup_default_uart();
 
+    bool fail = false;
+
     printf("%d\n", aa < bb);
     for(float a = -1; a <= 1; a++) {
         for(float b = -1; b <= 1; b++) {
@@ -341,21 +386,27 @@ int main() {
 #if 1
     for (float x = 0; x < 3; x++) {
         printf("\n ----- %f\n", x);
-        printf("FSQRT %10.18f\n", sqrtf(x));
-        printf("FCOS %10.18f\n", cosf(x));
-        printf("FSIN %10.18f\n", sinf(x));
+        printf("FSQRT %10.18f\n", check_close1(sqrtf, x)); // with LIB_PICO_FLOAT_PICO, check_close1 also calls the __real_ version for comparison
+        printf("FCOS %10.18f\n", check_close1(cosf, x));
+        printf("FSIN %10.18f\n", check_close1(sinf, x));
         float s, c;
         sincosf(x, &s, &c);
         printf("FSINCOS %10.18f %10.18f\n", s, c);
-        printf("FTAN %10.18f\n", tanf(x));
-        printf("FATAN2 %10.18f\n", atan2f(x, 10));
-        printf("FATAN2 %10.18f\n", atan2f(10, x));
-        printf("FEXP %10.18f\n", expf(x));
-        printf("FLN %10.18f\n", logf(x));
-        printf("POWF %10.18f\n", powf(x, x));
-        printf("TRUNCF %10.18f\n", truncf(x));
-        printf("LDEXPF %10.18f\n", ldexpf(x, x));
-        printf("FMODF %10.18f\n", fmodf(x, 3.0f));
+        printf("FTAN %10.18f\n", check_close1(tanf, x));
+        printf("FATAN2 %10.18f\n", check_close2(atan2f, x, 10.f));
+        printf("FATAN2 %10.18f\n", check_close2(atan2f, 10.f, x));
+        printf("FEXP %10.18f\n", check_close1(expf, x));
+        printf("FLN %10.18f\n", check_close1(logf, x));
+        printf("POWF %10.18f\n", check_close2(powf, x, x));
+        printf("TRUNCF %10.18f\n", check_close1(truncf, x));
+        printf("LDEXPF %10.18f\n", check_close2(ldexpf, x, x)); // the float x is implicitly converted to ldexpf's int exponent
+        printf("FMODF %10.18f\n", check_close2(fmodf, x, 3.0f));
+        sincosf(x, &s, &c); // the combined routine must agree exactly with the separate float routines
+        printf("SINCOS %10.18f %10.18f\n", s, c);
+        if (s != sinf(x) || c != cosf(x)) { // compare float with float: the double sin()/cos() results differ from s/c for x = 1 and 2
+            printf("SINCOS mismatch\n");
+            fail = true;
+        }
     }
 
     for (double x = 0; x < 3; x++) {
@@ -390,18 +441,25 @@ int main() {
 //        sincosf(x, &s, &c);
         printf("FSINCOS %10.18f %10.18f\n", s, c);
 
-        for(int i=1; i<4; i++) {
-            char buf[4];
-            sprintf(buf, "%d", i);
-            float f0 = -nanf(buf);
-            double d0 = -nan(buf);
-            // hmm
-            *(uint64_t *)&d0 |= i;
-            *(uint32_t *)&f0 |= i;
-            float f = (float)d0;
-            double d = (double)f0;
-            printf("f2d %08"PRIx32" -> %g %016"PRIx64"\n", *(uint32_t*)&f0, d, *(uint64_t*)&d);
-            printf("d2f %016"PRIx64" -> %f %08"PRIx32"\n", *(uint64_t*)&d0, f, *(uint32_t*)&f);
+        for(int j=0;j<2;j++) { // two passes: j == 0 keeps the top mantissa bit, j == 1 clears it
+            for (int i = 1; i < 4; i++) { // i is used as the NaN payload (1..3)
+                char buf[4];
+                sprintf(buf, "%d", i);
+                float f0 = -nanf(buf);
+                double d0 = -nan(buf);
+                // hmm nanf/nan seem to ignore payload, so OR the payload into the raw bits by hand
+                *(uint64_t *) &d0 |= i;
+                *(uint32_t *) &f0 |= i;
+                if (j) {
+                    // try without top bit set (mantissa bit 51 of the double, bit 22 of the float)
+                    *(uint64_t *) &d0 &= ~0x0008000000000000ull;
+                    *(uint32_t *) &f0 &= ~0x00400000u;
+                }
+                float f = (float) d0; // double -> float conversion of the NaN
+                double d = (double) f0; // float -> double conversion of the NaN
+                printf("f2d %f %08"PRIx32" -> %g %016"PRIx64"\n", f0, *(uint32_t *) &f0, d, *(uint64_t *) &d); // input value and raw bits, then converted value and raw bits
+                printf("d2f %f %016"PRIx64" -> %f %08"PRIx32"\n", d0, *(uint64_t *) &d0, f, *(uint32_t *) &f);
+            }
         }
     }
 #endif
@@ -413,17 +471,21 @@ int main() {
 //        }
         for (int32_t x = -1; x; x <<= 1) {
             printf("i %d->%f\n", x, (float) x);
+            check1(__aeabi_i2f, x); // with LIB_PICO_FLOAT_PICO, check1 asserts equality with the __real_ version; otherwise it only calls the function
         }
         for (int32_t x = 1; x; x <<= 1) {
             printf("i %d->%f\n", x, (float) x);
+            check1(__aeabi_i2f, x); // same check for the values produced by shifting 1 left
             y = x << 1;
         }
         for (int64_t x = 1; x; x <<= 1) {
             printf("i %lld->%f\n", x, (float) x);
+            check1(__aeabi_l2f, x); // 64-bit inputs go through __aeabi_l2f
             y = x << 1;
         }
         for (int64_t x = -1; x; x <<= 1) {
             printf("i %lld->%f\n", x, (float) x);
+            check1(__aeabi_l2f, x); // 64-bit values produced by shifting -1 left
             y = x << 1;
         }
         printf("d %d->%f\n", y, (float) y);
@@ -433,40 +495,63 @@ int main() {
         uint32_t y;
         for(uint32_t x = 1; x; x <<= 1) {
             printf("u %u->%f\n", x, (float)x);
+            check1(__aeabi_ui2f, x); // unsigned 32-bit -> float conversion
             y = x << 1;
         }
         printf("u %u->%f\n", y, (float)y);
     }
     for(int64_t x = 1; x !=0; x <<= 1u) {
         printf("%lld->%f\n", x, (float)x);
+        check1(__aeabi_l2f, x);
+    }
+    for(float x = -4294967296.f * 4294967296.f * 2.f; x<=-0.5f; x/=2.f) { // negative inputs from -2^65 up to -0.5, mirroring the positive loop below
+        printf("f %f->%lld\n", x, (int64_t)x);
+        if (x < INT64_MIN) {
+            // seems like there is a bug in the gcc version! so out-of-range input is asserted against INT64_MIN directly
+            assert(__aeabi_f2lz(x) == INT64_MIN);
+        } else {
+            check1(__aeabi_f2lz, x);
+        }
+    }
+    for(float x = 4294967296.f * 4294967296.f * 2.f; x>=0.5f; x/=2.f) { // positive inputs from 2^65 down to 0.5
+        printf("f2i64 %f->%lld\n", x, (int64_t)x);
+        if (x >= INT64_MAX) { // INT64_MAX is converted to float for this comparison
+            // seems like there is a bug in the gcc version! so out-of-range input is asserted against INT64_MAX directly
+            assert(__aeabi_f2lz(x) == INT64_MAX);
+        } else {
+            check1(__aeabi_f2lz, x);
+        }
+    }
+    for(float x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
+        printf("f2i32 %f->%d\n", x, (int32_t)x);
+        check1(__aeabi_f2iz, x); // loop starts at -2^64, so inputs outside the int32_t range are included
     }
     for(float x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
-        printf("f %f->%lld\n", x, (int64_t)x);
+        printf("f2i32 %f->%d\n", x, (int32_t)x);
+        check1(__aeabi_f2iz, x); // loop starts at 2^64, so inputs outside the int32_t range are included
     }
-    for (double x = 1; x < 11; x += 2) {
-        double f = x * x;
-        double g = 1.0 / x;
-        printf("%g %10.18g %10.18g, %10.18g, %10.18g %10.18g\n", x, f, x + 0.37777777777777777777777777777,
-               x - 0.377777777777777777777777777777, g, 123456789.0 / x);
+
+    for (float x = 1; x < 11; x += 2) {
+        float f = x * x;
+        float g = 1.0f / x;
+        printf("%g %10.18g %10.18g, %10.18g, %10.18g %10.18g\n", x, f, x + 0.37777777777777777777777777777f,
+               x - 0.377777777777777777777777777777f, g, 123456789.0f / x);
+        check2(__aeabi_fmul, x, x); // same operands as f = x * x, via the AEABI multiply helper
+        check2(__aeabi_fdiv, 1.0f, x); // same operands as g = 1.0f / x, via the AEABI divide helper
     }
-    if (test_cfcmpeq() || test_cfcmple() ||
-        test_fcmpun() || test_cmple_gt() || test_cmplt_ge()) {
+
+    if (fail || // set above when the SINCOS comparison mismatches
+        test_cfcmpeq() ||
+        test_cfcmple() ||
+        test_fcmpun() ||
+        test_cmple_gt() ||
+        test_cmplt_ge()) {
         printf("FAILED\n");
         return 1;
     } else {
         printf("PASSED\n");
         return 0;
     }
-
-    if (test_cfcmpeq() || test_cfcmple() ||
-        test_fcmpun() || test_cmple_gt() || test_cmplt_ge()) {
-        printf("FAILED\n");
-        return 1;
-    } else {
-        printf("PASSED\n");
-        return 0;
-    }
-
 #endif
 }