Some optimizations for the hardware divider (#1033)
* Remove unnecessary wait in pico_divider. There is no need to wait if there are more than 8 cycles between setup and result readout. Dividend/divisor readout should be correct without delay. Update comment to reflect that. * Optimize hw_divider_save_state/hw_divider_restore_state. Doing multiple pushes to avoid stack usage is faster. The wait loop in hw_divider_save_state had an incorrect branch. This didn't matter since the wait wasn't necessary to begin with. * Remove pointless aligns in hardware_divider. The regular_func_with_section inserts a new section so if aligning is desired it should be placed in the macro after section start. * Save a few bytes in hardware_divider. Signed and unsigned code can use the same exit code. Branching to the common code is free since we need the 8 cycle delay anyway.
This commit is contained in:
parent
2d4e3baa82
commit
3bd7a829db
@ -7,70 +7,51 @@
|
||||
.thumb
|
||||
|
||||
// tag::hw_div_s32[]
|
||||
|
||||
.macro __divider_delay
|
||||
// delay 8 cycles
|
||||
b 1f
|
||||
1: b 1f
|
||||
1: b 1f
|
||||
1: b 1f
|
||||
1:
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
|
||||
regular_func_with_section hw_divider_divmod_s32
|
||||
ldr r3, =(SIO_BASE)
|
||||
str r0, [r3, #SIO_DIV_SDIVIDEND_OFFSET]
|
||||
str r1, [r3, #SIO_DIV_SDIVISOR_OFFSET]
|
||||
__divider_delay
|
||||
// return 64 bit value so we can efficiently return both (note quotient must be read last)
|
||||
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
bx lr
|
||||
b hw_divider_divmod_return
|
||||
// end::hw_div_s32[]
|
||||
|
||||
.align 2
|
||||
|
||||
// tag::hw_div_u32[]
|
||||
regular_func_with_section hw_divider_divmod_u32
|
||||
ldr r3, =(SIO_BASE)
|
||||
str r0, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
str r1, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
||||
__divider_delay
|
||||
// return 64 bit value so we can efficiently return both (note quotient must be read last)
|
||||
b hw_divider_divmod_return
|
||||
// end::hw_div_u32[]
|
||||
|
||||
// Common delay and return section for s32 and u32
|
||||
.section .text.hw_divider_divmod_return
|
||||
hw_divider_divmod_return:
|
||||
// Branching here is 2 cycles, delay another 6
|
||||
b 1f
|
||||
1: b 1f
|
||||
1: b 1f
|
||||
1: // return 64 bit value so we can efficiently return both (note quotient must be read last)
|
||||
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
bx lr
|
||||
// end::hw_div_u32[]
|
||||
|
||||
#if SIO_DIV_CSR_READY_LSB == 0
|
||||
.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
|
||||
#else
|
||||
#error need to change SHIFT above
|
||||
#endif
|
||||
|
||||
regular_func_with_section hw_divider_save_state
|
||||
push {r4, r5, lr}
|
||||
ldr r5, =SIO_BASE
|
||||
ldr r4, [r5, #SIO_DIV_CSR_OFFSET]
|
||||
# wait for results as we can't save signed-ness of operation
|
||||
1:
|
||||
lsrs r4, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
||||
bcc 1b
|
||||
ldr r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
ldr r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
|
||||
ldr r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
|
||||
ldr r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
stmia r0!, {r1-r4}
|
||||
pop {r4, r5, pc}
|
||||
ldr r3, =SIO_BASE
|
||||
ldr r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
ldr r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
||||
stmia r0!, {r1-r2}
|
||||
// The 8 cycles needed to guarantee that the result is ready are ensured by the preceding
|
||||
// code of 7 cycles together with any branch to it taking at least 2 cycles.
|
||||
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||
ldr r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
stmia r0!, {r1-r2}
|
||||
bx lr
|
||||
|
||||
regular_func_with_section hw_divider_restore_state
|
||||
push {r4, r5, lr}
|
||||
ldr r5, =SIO_BASE
|
||||
ldmia r0!, {r1-r4}
|
||||
str r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
str r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
|
||||
str r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
|
||||
str r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
pop {r4, r5, pc}
|
||||
ldr r3, =SIO_BASE
|
||||
ldmia r0!, {r1-r2}
|
||||
str r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
str r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
||||
ldmia r0!, {r1-r2}
|
||||
str r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||
str r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
bx lr
|
||||
|
@ -19,11 +19,10 @@ need to change SHIFT above
|
||||
#endif
|
||||
|
||||
// SIO_BASE ptr in r2; pushes r4-r7, lr to stack
|
||||
// requires that division started at least 2 cycles prior to the start of the macro
|
||||
.macro save_div_state_and_lr
|
||||
// originally we did this, however a) it uses r3, and b) the push takes 6 cycles, b)
|
||||
// any IRQ which uses the divider will necessarily put the data back, which will
|
||||
// immediately make it ready
|
||||
// originally we did this, however a) it uses r3, and b) the push and dividend/divisor
|
||||
// readout takes 8 cycles, c) any IRQ which uses the divider will necessarily put the
|
||||
// data back, which will immediately make it ready
|
||||
//
|
||||
// // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
|
||||
// // // wait for results as we can't save signed-ness of operation
|
||||
@ -31,7 +30,7 @@ need to change SHIFT above
|
||||
// // lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
||||
// // bcc 1b
|
||||
|
||||
// 6 cycles
|
||||
// 6 cycle push + 2 ldr ensures the 8 cycle delay before remainder and quotient are ready
|
||||
push {r4, r5, r6, r7, lr}
|
||||
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
|
||||
ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
|
@ -53,14 +53,10 @@
|
||||
.macro save_div_state_and_lr_64
|
||||
push {r4, r5, r6, r7, lr}
|
||||
ldr r6, =SIO_BASE
|
||||
1:
|
||||
ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
|
||||
// wait for results as we can't save signed-ness of operation
|
||||
lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
||||
bcc 1b
|
||||
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
|
||||
ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||
ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
|
||||
// No need to wait before reading result as long as preceding code takes more than 8 cycles
|
||||
ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
|
||||
ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
|
||||
.endm
|
||||
@ -152,8 +148,6 @@ regular_func divmod_s32s32_unsafe
|
||||
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
||||
.align 2
|
||||
regular_func divmod_s32s32_savestate
|
||||
// note that we must be at least 2 cycles into division at this point,
|
||||
// which we are because of the dirty check before getting here (and of course the function call before that)
|
||||
save_div_state_and_lr
|
||||
bl divmod_s32s32_unsafe
|
||||
restore_div_state_and_return
|
||||
|
Loading…
Reference in New Issue
Block a user