diff --git a/src/rp2_common/hardware_divider/divider.S b/src/rp2_common/hardware_divider/divider.S index b9389c5..55134f2 100644 --- a/src/rp2_common/hardware_divider/divider.S +++ b/src/rp2_common/hardware_divider/divider.S @@ -7,70 +7,51 @@ .thumb // tag::hw_div_s32[] - -.macro __divider_delay - // delay 8 cycles - b 1f -1: b 1f -1: b 1f -1: b 1f -1: -.endm - -.align 2 - regular_func_with_section hw_divider_divmod_s32 ldr r3, =(SIO_BASE) str r0, [r3, #SIO_DIV_SDIVIDEND_OFFSET] str r1, [r3, #SIO_DIV_SDIVISOR_OFFSET] - __divider_delay - // return 64 bit value so we can efficiently return both (note quotient must be read last) - ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET] - ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET] - bx lr + b hw_divider_divmod_return // end::hw_div_s32[] -.align 2 - // tag::hw_div_u32[] regular_func_with_section hw_divider_divmod_u32 ldr r3, =(SIO_BASE) str r0, [r3, #SIO_DIV_UDIVIDEND_OFFSET] str r1, [r3, #SIO_DIV_UDIVISOR_OFFSET] - __divider_delay - // return 64 bit value so we can efficiently return both (note quotient must be read last) + b hw_divider_divmod_return +// end::hw_div_u32[] + +// Common delay and return section for s32 and u32 +.section .text.hw_divider_divmod_return +hw_divider_divmod_return: + // Branching here is 2 cycles, delay another 6 + b 1f +1: b 1f +1: b 1f +1: // return 64 bit value so we can efficiently return both (note quotient must be read last) ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET] ldr r0, [r3, #SIO_DIV_QUOTIENT_OFFSET] bx lr -// end::hw_div_u32[] - -#if SIO_DIV_CSR_READY_LSB == 0 -.equ SIO_DIV_CSR_READY_SHIFT_FOR_CARRY, 1 -#else -#error need to change SHIFT above -#endif regular_func_with_section hw_divider_save_state - push {r4, r5, lr} - ldr r5, =SIO_BASE - ldr r4, [r5, #SIO_DIV_CSR_OFFSET] - # wait for results as we can't save signed-ness of operation -1: - lsrs r4, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY - bcc 1b - ldr r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET] - ldr r2, [r5, #SIO_DIV_UDIVISOR_OFFSET] - ldr r3, [r5, 
#SIO_DIV_REMAINDER_OFFSET] - ldr r4, [r5, #SIO_DIV_QUOTIENT_OFFSET] - stmia r0!, {r1-r4} - pop {r4, r5, pc} + ldr r3, =SIO_BASE + ldr r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET] + ldr r2, [r3, #SIO_DIV_UDIVISOR_OFFSET] + stmia r0!, {r1-r2} + // The 8 cycles needed to guarantee that the result is ready is ensured by the preceding + // code of 7 cycles together with any branch to it taking at least 2 cycles. + ldr r1, [r3, #SIO_DIV_REMAINDER_OFFSET] + ldr r2, [r3, #SIO_DIV_QUOTIENT_OFFSET] + stmia r0!, {r1-r2} + bx lr regular_func_with_section hw_divider_restore_state - push {r4, r5, lr} - ldr r5, =SIO_BASE - ldmia r0!, {r1-r4} - str r1, [r5, #SIO_DIV_UDIVIDEND_OFFSET] - str r2, [r5, #SIO_DIV_UDIVISOR_OFFSET] - str r3, [r5, #SIO_DIV_REMAINDER_OFFSET] - str r4, [r5, #SIO_DIV_QUOTIENT_OFFSET] - pop {r4, r5, pc} + ldr r3, =SIO_BASE + ldmia r0!, {r1-r2} + str r1, [r3, #SIO_DIV_UDIVIDEND_OFFSET] + str r2, [r3, #SIO_DIV_UDIVISOR_OFFSET] + ldmia r0!, {r1-r2} + str r1, [r3, #SIO_DIV_REMAINDER_OFFSET] + str r2, [r3, #SIO_DIV_QUOTIENT_OFFSET] + bx lr diff --git a/src/rp2_common/hardware_divider/include/hardware/divider_helper.S b/src/rp2_common/hardware_divider/include/hardware/divider_helper.S index 062e12d..e69fab5 100644 --- a/src/rp2_common/hardware_divider/include/hardware/divider_helper.S +++ b/src/rp2_common/hardware_divider/include/hardware/divider_helper.S @@ -19,11 +19,10 @@ need to change SHIFT above #endif // SIO_BASE ptr in r2; pushes r4-r7, lr to stack -// requires that division started at least 2 cycles prior to the start of the macro .macro save_div_state_and_lr -// originally we did this, however a) it uses r3, and b) the push takes 6 cycles, b) -// any IRQ which uses the divider will necessarily put the data back, which will -// immediately make it ready +// originally we did this, however a) it uses r3, and b) the push and dividend/divisor +// readout takes 8 cycles, c) any IRQ which uses the divider will necessarily put the +// data back, which will immediately
make it ready // // // ldr r3, [r2, #SIO_DIV_CSR_OFFSET] // // // wait for results as we can't save signed-ness of operation @@ -31,7 +30,7 @@ need to change SHIFT above // // lsrs r3, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY // // bcc 1b -// 6 cycles +// 6 cycle push + 2 ldr ensures the 8 cycle delay before remainder and quotient are ready push {r4, r5, r6, r7, lr} // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia! ldr r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET] diff --git a/src/rp2_common/pico_divider/divider.S b/src/rp2_common/pico_divider/divider.S index ba42662..c095f35 100644 --- a/src/rp2_common/pico_divider/divider.S +++ b/src/rp2_common/pico_divider/divider.S @@ -53,14 +53,10 @@ .macro save_div_state_and_lr_64 push {r4, r5, r6, r7, lr} ldr r6, =SIO_BASE -1: - ldr r5, [r6, #SIO_DIV_CSR_OFFSET] - // wait for results as we can't save signed-ness of operation - lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY - bcc 1b // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia! ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET] ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET] + // No need to wait before reading result as long as preceding code takes more than 8 cycles ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET] ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET] .endm @@ -152,8 +148,6 @@ regular_func divmod_s32s32_unsafe #if !PICO_DIVIDER_DISABLE_INTERRUPTS .align 2 regular_func divmod_s32s32_savestate - // note that we must be at least 2 cycles into division at this point, - // which we are because of the firty check before getting here (and of course the function call before that) save_div_state_and_lr bl divmod_s32s32_unsafe restore_div_state_and_return