Some optimizations for the hardware divider (#1033)

* Remove unnecessary wait in pico_divider.

There is no need to wait if there are more than 8 cycles between setup and result readout.
Dividend/divisor readout should be correct without delay. Update comment to reflect that.

* Optimize hw_divider_save_state/hw_divider_restore_state.

Doing several smaller multi-register transfers (stmia/ldmia), which avoids stack usage, is faster.
The wait loop in hw_divider_save_state had an incorrect branch: it looped back without re-reading the CSR register.
This didn't matter since the wait wasn't necessary to begin with.

* Remove pointless aligns in hardware_divider.

The regular_func_with_section macro starts a new section, so if alignment
is desired it should be placed inside the macro, after the section start.

* Save a few bytes in hardware_divider.

Signed and unsigned code can use the same exit code.
Branching to the common code is free since we need the 8 cycle
delay anyway.
This commit is contained in:
Peter Pettersson 2022-10-17 00:40:22 +02:00 committed by GitHub
parent 2d4e3baa82
commit 3bd7a829db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 61 deletions

View File

@ -7,70 +7,51 @@
.thumb
// tag::hw_div_s32[]
// __divider_delay: inline delay placed between writing the divider operands and
// reading back the results. Expands to a chain of four branches, each targeting
// the numeric local label `1:` that immediately follows it, so control simply
// steps through the chain. Only branch instructions are emitted; no general
// purpose register is touched.
.macro __divider_delay
// delay 8 cycles (four taken branches; presumably 2 cycles each on this core - confirm)
b 1f
1: b 1f
1: b 1f
1: b 1f
1:
.endm
.align 2
// hw_divider_divmod_s32: signed divide-with-remainder using the SIO divider registers.
//   In:       r0 = dividend, r1 = divisor
//   Out:      r0 = quotient, r1 = remainder (a register pair, i.e. a 64 bit return value)
//   Clobbers: r3 (holds SIO_BASE)
// NOTE(review): the body below returns with `bx lr`, so the trailing
// `b hw_divider_divmod_return` can never execute as written. This looks like the
// pre-change and post-change endings listed together - confirm which is intended.
regular_func_with_section hw_divider_divmod_s32
ldr r3, =(SIO_BASE)
// writing the signed dividend/divisor registers is what starts the operation
// whose results are read back below
str r0, [r3, #SIO_DIV_SDIVIDEND_OFFSET]
str r1, [r3, #SIO_DIV_SDIVISOR_OFFSET]
// give the divider time to finish before the results are read
__divider_delay
// return 64 bit value so we can efficiently return both (note quotient must be read last)
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
bx lr
b hw_divider_divmod_return
// end::hw_div_s32[]
.align 2
// tag::hw_div_u32[]
// hw_divider_divmod_u32: unsigned divide-with-remainder using the SIO divider registers.
//   In:       r0 = dividend, r1 = divisor
//   Out:      produced by the shared tail hw_divider_divmod_return, which loads
//             r0 = quotient and r1 = remainder and returns to the original caller
//   Clobbers: r3 (holds SIO_BASE; the shared tail relies on it)
// NOTE(review): as written this runs __divider_delay and then branches to a tail
// that performs its own delay, so the wait is applied twice. This looks like
// pre-change and post-change lines listed together - confirm which is intended.
regular_func_with_section hw_divider_divmod_u32
ldr r3, =(SIO_BASE)
str r0, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
str r1, [r3, #SIO_DIV_UDIVISOR_OFFSET]
__divider_delay
// return 64 bit value so we can efficiently return both (note quotient must be read last)
// plain branch (not bl): lr is left untouched so the tail's `bx lr` returns to our caller
b hw_divider_divmod_return
// end::hw_div_u32[]
// Common delay and return section for s32 and u32
// Entered with a plain branch from hw_divider_divmod_s32 / hw_divider_divmod_u32, after
// the operands have been written.
//   In:  r3 = SIO_BASE (loaded by the function that branched here)
//        lr = return address of that function's caller
//   Out: r0 = quotient, r1 = remainder
.section .text.hw_divider_divmod_return
hw_divider_divmod_return:
// Branching here is 2 cycles, delay another 6
// (three more branches, each to the numeric local label right after it)
b 1f
1: b 1f
1: b 1f
1: // return 64 bit value so we can efficiently return both (note quotient must be read last)
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET]
bx lr
// end::hw_div_u32[]
#if SIO_DIV_CSR_READY_LSB == 0
.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1
#else
#error need to change SHIFT above
#endif
// hw_divider_save_state: copy the divider's current register contents to memory.
//   In:  r0 = destination for four words, stored in the order
//             dividend, divisor, remainder, quotient
//   Out: r0 is advanced past the stored words (stmia with writeback)
// NOTE(review): two complete bodies appear back to back below. The first ends in
// `pop {r4, r5, pc}`, so as written the second one (r1-r3 only, no stack use) is
// never reached. This looks like pre-change and post-change code listed together -
// confirm which one is intended.
regular_func_with_section hw_divider_save_state
push {r4, r5, lr}
ldr r5, =SIO_BASE
ldr r4, [r5, #SIO_DIV_CSR_OFFSET]
# wait for results as we can't save signed-ness of operation
1:
// NOTE(review): CSR is loaded once, before the label, so every pass of this loop
// shifts the same register value again instead of re-reading the hardware.
lsrs r4, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
bcc 1b
// quotient is loaded after remainder, matching the "quotient must be read last"
// rule noted elsewhere in this file
ldr r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
ldr r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
ldr r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
ldr r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
stmia r0!, {r1-r4}
pop {r4, r5, pc}
// Second body: same memory layout, written as two 2-word stores so that only
// r1-r3 are needed and nothing is saved on the stack.
ldr r3, =SIO_BASE
ldr r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
ldr r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
stmia r0!, {r1-r2}
// The 8 cycles needed to guarantee that the result is ready is ensured by the preceding
// code of 7 cycles together with any branch to it taking at least 2 cycles.
ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
ldr r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
stmia r0!, {r1-r2}
bx lr
// hw_divider_restore_state: write a previously saved divider state back to the hardware.
//   In:  r0 = source of four words in the order
//             dividend, divisor, remainder, quotient
//   Out: r0 is advanced past the words that were read (ldmia with writeback)
// The quotient register is written last in both bodies below.
// NOTE(review): two complete bodies appear back to back. The first ends in
// `pop {r4, r5, pc}`, so as written the second one (r1-r3 only, no stack use) is
// never reached. This looks like pre-change and post-change code listed together -
// confirm which one is intended.
regular_func_with_section hw_divider_restore_state
push {r4, r5, lr}
ldr r5, =SIO_BASE
ldmia r0!, {r1-r4}
str r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET]
str r2, [r5, #SIO_DIV_UDIVISOR_OFFSET]
str r3, [r5, #SIO_DIV_REMAINDER_OFFSET]
str r4, [r5, #SIO_DIV_QUOTIENT_OFFSET]
pop {r4, r5, pc}
// Second body: the four words are fetched as two pairs so r4/r5 need not be saved.
ldr r3, =SIO_BASE
ldmia r0!, {r1-r2}
str r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET]
str r2, [r3, #SIO_DIV_UDIVISOR_OFFSET]
ldmia r0!, {r1-r2}
str r1, [r3, #SIO_DIV_REMAINDER_OFFSET]
str r2, [r3, #SIO_DIV_QUOTIENT_OFFSET]
bx lr

View File

@ -19,11 +19,10 @@ need to change SHIFT above
#endif
// SIO_BASE ptr in r2; pushes r4-r7, lr to stack
// requires that division started at least 2 cycles prior to the start of the macro
.macro save_div_state_and_lr
// originally we did this, however a) it uses r3, and b) the push takes 6 cycles, b)
// any IRQ which uses the divider will necessarily put the data back, which will
// immediately make it ready
// originally we did this, however a) it uses r3, and b) the push and dividend/divisor
// readout takes 8 cycles, c) any IRQ which uses the divider will necessarily put the
// data back, which will immediately make it ready
//
// // ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
// // // wait for results as we can't save signed-ness of operation
@ -31,7 +30,7 @@ need to change SHIFT above
// // lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
// // bcc 1b
// 6 cycles
// 6 cycle push + 2 ldr ensures the 8 cycle delay before remainder and quotient are ready
push {r4, r5, r6, r7, lr}
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]

View File

@ -53,14 +53,10 @@
// save_div_state_and_lr_64: push r4-r7 and lr, then capture the divider state in r4-r7.
//   After the macro: r4 = dividend, r5 = divisor, r7 = remainder, r6 = quotient
//   r6 first holds SIO_BASE and is overwritten by the final (quotient) load.
//   The lsrs in the wait loop modifies the condition flags.
// NOTE(review): the macro contains a CSR polling loop and, further down, a comment
// saying no wait is needed. This looks like pre-change and post-change lines
// listed together - confirm which variant is intended.
.macro save_div_state_and_lr_64
push {r4, r5, r6, r7, lr}
ldr r6, =SIO_BASE
1:
// CSR is re-read on every pass, so the loop observes the ready bit changing
ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
// wait for results as we can't save signed-ness of operation
lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
bcc 1b
// note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
// No need to wait before reading result as long as preceding code takes more than 8 cycles
ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm
@ -152,8 +148,6 @@ regular_func divmod_s32s32_unsafe
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
// divmod_s32s32_savestate: runs divmod_s32s32_unsafe bracketed by the
// save_div_state_and_lr and restore_div_state_and_return macros.
// NOTE(review): save_div_state_and_lr is documented as expecting the SIO_BASE
// pointer in r2; nothing here loads it, so presumably it is set up before this
// entry point is reached - confirm against the caller.
regular_func divmod_s32s32_savestate
// note that we must be at least 2 cycles into division at this point,
// which we are because of the dirty check before getting here (and of course the function call before that)
save_div_state_and_lr
bl divmod_s32s32_unsafe
restore_div_state_and_return