Some optimizations for the hardware divider (#1033)

* Remove unnecessary wait in pico_divider. There is no need to wait if there are more than 8 cycles between setup and result readout. Dividend/divisor readout should be correct without delay. Update comment to reflect that.
* Optimize hw_divider_save_state/hw_divider_restore_state. Doing multiple pushes to avoid stack usage is faster. The wait loop in hw_divider_save_state contained an incorrect branch (it jumped back to a label placed after the CSR load, so the status was never re-read). This didn't matter since the wait wasn't necessary to begin with.
* Remove pointless aligns in hardware_divider. The regular_func_with_section macro inserts a new section, so if aligning is desired it should be placed in the macro after the section start.
* Save a few bytes in hardware_divider. Signed and unsigned code can use the same exit code. Branching to the common code is free since we need the 8 cycle delay anyway.
2022-10-17 00:40:22 +02:00
parent 2d4e3baa82
commit 3bd7a829db
3 changed files with 35 additions and 61 deletions
--- a/src/rp2_common/hardware_divider/divider.S
+++ b/src/rp2_common/hardware_divider/divider.S
@@ -7,70 +7,51 @@
 .thumb
 // tag::hw_div_s32[]
 .macro __divider_delay
    // delay 8 cycles
    b 1f
 1:  b 1f
 1:  b 1f
 1:  b 1f
 1:
 .endm
 .align 2
 regular_func_with_section hw_divider_divmod_s32
    ldr r3, =(SIO_BASE)
    str r0, [r3, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r3, #SIO_DIV_SDIVISOR_OFFSET]
-    __divider_delay
+    b hw_divider_divmod_return
-    // return 64 bit value so we can efficiently return both (note quotient must be read last)
-    ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
-    ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
-    bx lr
 // end::hw_div_s32[]
 .align 2
 // tag::hw_div_u32[]
 regular_func_with_section hw_divider_divmod_u32
    ldr r3, =(SIO_BASE)
    str r0, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r3, #SIO_DIV_UDIVISOR_OFFSET]
-    __divider_delay
+    b hw_divider_divmod_return
-    // return 64 bit value so we can efficiently return both (note quotient must be read last)
+// end::hw_div_u32[]
 // Common delay and return section for s32 and u32
 .section .text.hw_divider_divmod_return
 hw_divider_divmod_return:
    // Branching here is 2 cycles, delay another 6
    b 1f
 1:  b 1f
 1:  b 1f
 1:  // return 64 bit value so we can efficiently return both (note quotient must be read last)
    ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
    bx lr
 // end::hw_div_u32[]
 #if SIO_DIV_CSR_READY_LSB == 0
 .equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
 #else
 #error need to change SHIFT above
 #endif
 regular_func_with_section hw_divider_save_state
-    push {r4, r5, lr}
+    ldr r3, =SIO_BASE
-    ldr r5, =SIO_BASE
+    ldr r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
-    ldr r4, [r5, #SIO_DIV_CSR_OFFSET]
+    ldr r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
-    # wait for results as we can't save signed-ness of operation
+    stmia r0!, {r1-r2}
-1:
+    // The 8 cycles needed to guarantee that the result is ready are ensured by the preceding
-    lsrs r4, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
+    // code of 7 cycles together with any branch to it taking at least 2 cycles.
-    bcc 1b
+    ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
-    ldr r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
+    ldr r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
-    ldr r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
+    stmia r0!, {r1-r2}
-    ldr r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
+    bx lr
-    ldr r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
-    stmia r0!, {r1-r4}
-    pop {r4, r5, pc}
 regular_func_with_section hw_divider_restore_state
-    push {r4, r5, lr}
+    ldr r3, =SIO_BASE
-    ldr r5, =SIO_BASE
+    ldmia r0!, {r1-r2}
-    ldmia r0!, {r1-r4}
+    str r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
-    str r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
+    str r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
-    str r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
+    ldmia r0!, {r1-r2}
-    str r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
+    str r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
-    str r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
+    str r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
-    pop {r4, r5, pc}
+    bx lr
--- a/src/rp2_common/hardware_divider/include/hardware/divider_helper.S
+++ b/src/rp2_common/hardware_divider/include/hardware/divider_helper.S
@@ -19,11 +19,10 @@ need to change SHIFT above
 #endif
 // SIO_BASE ptr in r2; pushes r4-r7, lr to stack
 // requires that division started at least 2 cycles prior to the start of the macro
 .macro save_div_state_and_lr
-// originally we did this, however a) it uses r3, and b) the push takes 6 cycles, b)
+// originally we did this, however a) it uses r3, and b) the push and dividend/divisor
-// any IRQ which uses the divider will necessarily put the data back, which will
+// readout takes 8 cycles, c) any IRQ which uses the divider will necessarily put the
-// immediately make it ready
+// data back, which will immediately make it ready
 //
 //    // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
 //    // // wait for results as we can't save signed-ness of operation
@@ -31,7 +30,7 @@ need to change SHIFT above
 //    //     lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
 //    //     bcc 1b
-// 6 cycles
+// 6 cycle push + 2 ldr ensures the 8 cycle delay before remainder and quotient are ready
 push {r4, r5, r6, r7, lr}
 // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
 ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
--- a/src/rp2_common/pico_divider/divider.S
+++ b/src/rp2_common/pico_divider/divider.S
@@ -53,14 +53,10 @@
 .macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
-1:
-    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
-    // wait for results as we can't save signed-ness of operation
-    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
-    bcc 1b
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
+    // No need to wait before reading result as long as preceding code takes more than 8 cycles
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
 .endm
@@ -152,8 +148,6 @@ regular_func divmod_s32s32_unsafe
 #if !PICO_DIVIDER_DISABLE_INTERRUPTS
 .align 2
 regular_func divmod_s32s32_savestate
    // note that we must be at least 2 cycles into division at this point,
    // which we are because of the dirty check before getting here (and of course the function call before that)
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return