Some optimizations for the hardware divider (#1033)
* Remove unnecessary wait in pico_divider. There is no need to wait if there are at least 8 cycles between setup and result readout. Dividend/divisor readout should be correct without any delay. Update the comments to reflect that.
* Optimize hw_divider_save_state/hw_divider_restore_state. Using two 2-register stmia/ldmia transfers instead of one 4-register transfer means r4/r5 no longer have to be saved, which avoids stack usage and is faster. The wait loop in hw_divider_save_state also had an incorrect branch target: it looped back to the shift instead of re-reading the CSR. This didn't matter, since the wait wasn't necessary to begin with.
* Remove pointless aligns in hardware_divider. The regular_func_with_section macro starts a new section, so if alignment is desired it should be placed in the macro, after the section start.
* Save a few bytes in hardware_divider. The signed and unsigned code can share the same exit code. Branching to the common code is free, since we need the 8 cycle delay anyway.
This commit is contained in:
parent
2d4e3baa82
commit
3bd7a829db
@ -7,70 +7,51 @@
|
|||||||
.thumb
|
.thumb
|
||||||
|
|
||||||
// tag::hw_div_s32[]
|
// tag::hw_div_s32[]
|
||||||
|
|
||||||
.macro __divider_delay
|
|
||||||
// delay 8 cycles
|
|
||||||
b 1f
|
|
||||||
1: b 1f
|
|
||||||
1: b 1f
|
|
||||||
1: b 1f
|
|
||||||
1:
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.align 2
|
|
||||||
|
|
||||||
regular_func_with_section hw_divider_divmod_s32
|
regular_func_with_section hw_divider_divmod_s32
|
||||||
ldr r3, =(SIO_BASE)
|
ldr r3, =(SIO_BASE)
|
||||||
str r0, [r3, #SIO_DIV_SDIVIDEND_OFFSET]
|
str r0, [r3, #SIO_DIV_SDIVIDEND_OFFSET]
|
||||||
str r1, [r3, #SIO_DIV_SDIVISOR_OFFSET]
|
str r1, [r3, #SIO_DIV_SDIVISOR_OFFSET]
|
||||||
__divider_delay
|
b hw_divider_divmod_return
|
||||||
// return 64 bit value so we can efficiently return both (note quotient must be read last)
|
|
||||||
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
|
||||||
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
|
||||||
bx lr
|
|
||||||
// end::hw_div_s32[]
|
// end::hw_div_s32[]
|
||||||
|
|
||||||
.align 2
|
|
||||||
|
|
||||||
// tag::hw_div_u32[]
|
// tag::hw_div_u32[]
|
||||||
regular_func_with_section hw_divider_divmod_u32
|
regular_func_with_section hw_divider_divmod_u32
|
||||||
ldr r3, =(SIO_BASE)
|
ldr r3, =(SIO_BASE)
|
||||||
str r0, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
str r0, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||||
str r1, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
str r1, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
||||||
__divider_delay
|
b hw_divider_divmod_return
|
||||||
// return 64 bit value so we can efficiently return both (note quotient must be read last)
|
// end::hw_div_u32[]
|
||||||
|
|
||||||
|
// Common delay and return section for s32 and u32
|
||||||
|
.section .text.hw_divider_divmod_return
|
||||||
|
hw_divider_divmod_return:
|
||||||
|
// Branching here is 2 cycles, delay another 6
|
||||||
|
b 1f
|
||||||
|
1: b 1f
|
||||||
|
1: b 1f
|
||||||
|
1: // return 64 bit value so we can efficiently return both (note quotient must be read last)
|
||||||
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||||
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||||
bx lr
|
bx lr
|
||||||
// end::hw_div_u32[]
|
|
||||||
|
|
||||||
#if SIO_DIV_CSR_READY_LSB == 0
|
|
||||||
.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
|
|
||||||
#else
|
|
||||||
#error need to change SHIFT above
|
|
||||||
#endif
|
|
||||||
|
|
||||||
regular_func_with_section hw_divider_save_state
|
regular_func_with_section hw_divider_save_state
|
||||||
push {r4, r5, lr}
|
ldr r3, =SIO_BASE
|
||||||
ldr r5, =SIO_BASE
|
ldr r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||||
ldr r4, [r5, #SIO_DIV_CSR_OFFSET]
|
ldr r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
||||||
# wait for results as we can't save signed-ness of operation
|
stmia r0!, {r1-r2}
|
||||||
1:
|
// The 8 cycles needed to guarantee that the result is ready are ensured by the preceding
|
||||||
lsrs r4, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
// code of 7 cycles together with any branch to it taking at least 2 cycles.
|
||||||
bcc 1b
|
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||||
ldr r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
|
ldr r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||||
ldr r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
|
stmia r0!, {r1-r2}
|
||||||
ldr r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
|
bx lr
|
||||||
ldr r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
|
|
||||||
stmia r0!, {r1-r4}
|
|
||||||
pop {r4, r5, pc}
|
|
||||||
|
|
||||||
regular_func_with_section hw_divider_restore_state
|
regular_func_with_section hw_divider_restore_state
|
||||||
push {r4, r5, lr}
|
ldr r3, =SIO_BASE
|
||||||
ldr r5, =SIO_BASE
|
ldmia r0!, {r1-r2}
|
||||||
ldmia r0!, {r1-r4}
|
str r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||||
str r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
|
str r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
|
||||||
str r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
|
ldmia r0!, {r1-r2}
|
||||||
str r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
|
str r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
|
||||||
str r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
|
str r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
|
||||||
pop {r4, r5, pc}
|
bx lr
|
||||||
|
@ -19,11 +19,10 @@ need to change SHIFT above
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// SIO_BASE ptr in r2; pushes r4-r7, lr to stack
|
// SIO_BASE ptr in r2; pushes r4-r7, lr to stack
|
||||||
// requires that division started at least 2 cycles prior to the start of the macro
|
|
||||||
.macro save_div_state_and_lr
|
.macro save_div_state_and_lr
|
||||||
// originally we did this, however a) it uses r3, and b) the push takes 6 cycles, b)
|
// originally we did this, however a) it uses r3, and b) the push and dividend/divisor
|
||||||
// any IRQ which uses the divider will necessarily put the data back, which will
|
// readout takes 8 cycles, c) any IRQ which uses the divider will necessarily put the
|
||||||
// immediately make it ready
|
// data back, which will immediately make it ready
|
||||||
//
|
//
|
||||||
// // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
|
// // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
|
||||||
// // // wait for results as we can't save signed-ness of operation
|
// // // wait for results as we can't save signed-ness of operation
|
||||||
@ -31,7 +30,7 @@ need to change SHIFT above
|
|||||||
// // lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
// // lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
||||||
// // bcc 1b
|
// // bcc 1b
|
||||||
|
|
||||||
// 6 cycles
|
// 6 cycle push + 2 ldr ensures the 8 cycle delay before remainder and quotient are ready
|
||||||
push {r4, r5, r6, r7, lr}
|
push {r4, r5, r6, r7, lr}
|
||||||
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
|
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
|
||||||
ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
|
ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||||
|
@ -53,14 +53,10 @@
|
|||||||
.macro save_div_state_and_lr_64
|
.macro save_div_state_and_lr_64
|
||||||
push {r4, r5, r6, r7, lr}
|
push {r4, r5, r6, r7, lr}
|
||||||
ldr r6, =SIO_BASE
|
ldr r6, =SIO_BASE
|
||||||
1:
|
|
||||||
ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
|
|
||||||
// wait for results as we can't save signed-ness of operation
|
|
||||||
lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
|
|
||||||
bcc 1b
|
|
||||||
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
|
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
|
||||||
ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
|
ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
|
||||||
ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
|
ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
|
||||||
|
// No need to wait before reading result as long as preceding code takes more than 8 cycles
|
||||||
ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
|
ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
|
||||||
ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
|
ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
|
||||||
.endm
|
.endm
|
||||||
@ -152,8 +148,6 @@ regular_func divmod_s32s32_unsafe
|
|||||||
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
|
||||||
.align 2
|
.align 2
|
||||||
regular_func divmod_s32s32_savestate
|
regular_func divmod_s32s32_savestate
|
||||||
// note that we must be at least 2 cycles into division at this point,
|
|
||||||
// which we are because of the dirty check before getting here (and of course the function call before that)
|
|
||||||
save_div_state_and_lr
|
save_div_state_and_lr
|
||||||
bl divmod_s32s32_unsafe
|
bl divmod_s32s32_unsafe
|
||||||
restore_div_state_and_return
|
restore_div_state_and_return
|
||||||
|
Loading…
Reference in New Issue
Block a user