From 0732d0c2a31b42c9c64d5751ed32c34ab7c4b2df Mon Sep 17 00:00:00 2001 From: Graham Sanderson Date: Fri, 19 Feb 2021 07:11:56 -0600 Subject: [PATCH] Add more memory barriers to avoid code re-ordering issues with DMA (#155) * Add more memory barriers to avoid code re-ordering issues with DMA * Comment typos * Fix Wstrict-prototype on __compiler_memory_barrier * Remove now-redundant __compiler_barrier macro from hardware_flash Co-authored-by: Luke Wren --- .../pico_platform/include/pico/platform.h | 4 +- .../hardware_dma/include/hardware/dma.h | 2 + src/rp2_common/hardware_flash/flash.c | 10 +- .../hardware_sync/include/hardware/sync.h | 13 ++- .../pico_platform/include/pico/platform.h | 99 +++++++++++++++---- 5 files changed, 100 insertions(+), 28 deletions(-) diff --git a/src/host/pico_platform/include/pico/platform.h b/src/host/pico_platform/include/pico/platform.h index 0e994d9..da060a6 100644 --- a/src/host/pico_platform/include/pico/platform.h +++ b/src/host/pico_platform/include/pico/platform.h @@ -126,11 +126,13 @@ void *decode_host_safe_hw_ptr(uint32_t ptr); typedef unsigned int uint; -inline static int32_t __mul_instruction(int32_t a,int32_t b) +static inline int32_t __mul_instruction(int32_t a,int32_t b) { return a*b; } +static inline void __compiler_memory_barrier(void) { +} #ifdef __cplusplus } #endif diff --git a/src/rp2_common/hardware_dma/include/hardware/dma.h b/src/rp2_common/hardware_dma/include/hardware/dma.h index 9b67674..dfb2de5 100644 --- a/src/rp2_common/hardware_dma/include/hardware/dma.h +++ b/src/rp2_common/hardware_dma/include/hardware/dma.h @@ -540,6 +540,8 @@ inline static bool dma_channel_is_busy(uint channel) { */ inline static void dma_channel_wait_for_finish_blocking(uint channel) { while (dma_channel_is_busy(channel)) tight_loop_contents(); + // stop the compiler hoisting a non volatile buffer access above the DMA completion. + __compiler_memory_barrier(); } /*! \brief Enable the DMA sniffing targeting the specified channel diff --git a/src/rp2_common/hardware_flash/flash.c b/src/rp2_common/hardware_flash/flash.c index 622f08c..c4cf830 100644 --- a/src/rp2_common/hardware_flash/flash.c +++ b/src/rp2_common/hardware_flash/flash.c @@ -18,8 +18,6 @@ #define FLASH_RUID_DATA_BYTES 8 #define FLASH_RUID_TOTAL_BYTES (1 + FLASH_RUID_DUMMY_BYTES + FLASH_RUID_DATA_BYTES) -#define __compiler_barrier() asm volatile("" ::: "memory") - //----------------------------------------------------------------------------- // Infrastructure for reentering XIP mode after exiting for programming (take // a copy of boot2 before XIP exit). Calling boot2 as a function works because @@ -38,7 +36,7 @@ static void __no_inline_not_in_flash_func(flash_init_boot2_copyout)(void) { return; for (int i = 0; i < BOOT2_SIZE_WORDS; ++i) boot2_copyout[i] = ((uint32_t *)XIP_BASE)[i]; - __compiler_barrier(); + __compiler_memory_barrier(); boot2_copyout_valid = true; } @@ -77,7 +75,7 @@ void __no_inline_not_in_flash_func(flash_range_erase)(uint32_t flash_offs, size_ flash_init_boot2_copyout(); // No flash accesses after this point - __compiler_barrier(); + __compiler_memory_barrier(); connect_internal_flash(); flash_exit_xip(); @@ -100,7 +98,7 @@ void __no_inline_not_in_flash_func(flash_range_program)(uint32_t flash_offs, con assert(connect_internal_flash && flash_exit_xip && flash_range_program && flash_flush_cache); flash_init_boot2_copyout(); - __compiler_barrier(); + __compiler_memory_barrier(); connect_internal_flash(); flash_exit_xip(); @@ -133,7 +131,7 @@ static void __no_inline_not_in_flash_func(flash_do_cmd)(const uint8_t *txbuf, ui void (*flash_flush_cache)(void) = (void(*)(void))rom_func_lookup(rom_table_code('F', 'C')); assert(connect_internal_flash && flash_exit_xip && flash_flush_cache); flash_init_boot2_copyout(); - __compiler_barrier(); + __compiler_memory_barrier(); connect_internal_flash(); flash_exit_xip(); diff --git a/src/rp2_common/hardware_sync/include/hardware/sync.h b/src/rp2_common/hardware_sync/include/hardware/sync.h index 6d46a5b..a2c0d5c 100644 --- a/src/rp2_common/hardware_sync/include/hardware/sync.h +++ b/src/rp2_common/hardware_sync/include/hardware/sync.h @@ -111,7 +111,18 @@ inline static void __wfi(void) { * instruction will be observed before any explicit access after the instruction. */ inline static void __dmb(void) { - __asm volatile ("dmb"); + __asm volatile ("dmb" : : : "memory"); +} + +/*! \brief Insert a DSB instruction in to the code path. + * \ingroup hardware_sync + * + * The DSB (data synchronization barrier) acts as a special kind of data + * memory barrier (DMB). The DSB operation completes when all explicit memory + * accesses before this instruction complete. + */ +inline static void __dsb(void) { + __asm volatile ("dsb" : : : "memory"); } /*! \brief Insert a ISB instruction in to the code path. diff --git a/src/rp2_common/pico_platform/include/pico/platform.h b/src/rp2_common/pico_platform/include/pico/platform.h index 2ed5e21..ef013d5 100644 --- a/src/rp2_common/pico_platform/include/pico/platform.h +++ b/src/rp2_common/pico_platform/include/pico/platform.h @@ -22,18 +22,31 @@ extern "C" { #define __isr +// Section naming macros +#define __after_data(group) __attribute__((section(".after_data." group))) #define __not_in_flash(group) __attribute__((section(".time_critical." group))) -#define __not_in_flash_func(x) __not_in_flash(__STRING(x)) x -#define __no_inline_not_in_flash_func(x) __attribute__((noinline)) __not_in_flash_func(x) - +#define __scratch_x(group) __attribute__((section(".scratch_x." group))) +#define __scratch_y(group) __attribute__((section(".scratch_y." group))) +#define __uninitialized_ram(group) __attribute__((section(".uninitialized_ram." #group))) group // For use with PICO_COPY_TO_RAM: #define __in_flash(group) __attribute__((section(".flashdata" group))) -#define __scratch_x(group) __attribute__((section(".scratch_x." group))) -#define __scratch_y(group) __attribute__((section(".scratch_y." group))) +/** + * Decorates a function name, such that the function will execute from RAM (assuming it is not inlined + * into a flash function by the compiler) + */ +#define __not_in_flash_func(func_name) __not_in_flash(__STRING(func_name)) func_name +/** + * Historical synonym for __not_in_flash_func() + */ +#define __time_critical_func(func_name) __not_in_flash_func(func_name) + +/** + * Decorates a function name, such that the function will execute from RAM, explicitly marking it as + * noinline to prevent it being inlined into a flash function by the compiler + */ +#define __no_inline_not_in_flash_func(func_name) __attribute__((noinline)) __not_in_flash_func(func_name) -#define __time_critical_func(x) __not_in_flash_func(x) -#define __after_data(group) __attribute__((section(".after_data." group))) #define __packed_aligned __packed __aligned(4) #ifndef count_of @@ -48,19 +61,37 @@ extern "C" { #define MIN(a, b) ((b)>(a)?(a):(b)) #endif -#define __uninitialized_ram(group) __attribute__((section(".uninitialized_ram." #group))) group - -inline static void __breakpoint(void) { +/** + * Execute a breakpoint instruction + */ +static inline void __breakpoint(void) { __asm__("bkpt #0"); } +/** + * Ensure that the compiler does not move memory access across this method call + */ +static inline void __compiler_memory_barrier(void) { + __asm__ volatile ("" : : : "memory"); +} + // return a 32 bit handle for a raw ptr; DMA chaining for example embeds pointers in 32 bit values // which of course does not work if we're running the code natively on a 64 bit platforms. Therefore // we provide this macro which allows that code to provide a 64->32 bit mapping in host mode #define host_safe_hw_ptr(x) ((uintptr_t)(x)) +/** + * Panic (see panic()) with the message "Unsupported". + */ void __attribute__((noreturn)) panic_unsupported(void); +/** + * Panic with a message. An attempt is made to output the message to all registered STDOUT drivers + * after which this method executes a BKPT instruction. + * + * @param fmt format string (printf-like) + * @param ... printf-like arguments + */ void __attribute__((noreturn)) panic(const char *fmt, ...); // PICO_CONFIG: PICO_NO_FPGA_CHECK, Remove the FPGA platform check for small code size reduction, type=bool, default=0, advanced=true, group=pico_runtime @@ -74,36 +105,64 @@ static inline bool running_on_fpga(void) {return false;} bool running_on_fpga(void); #endif +/** + * @return the RP2040 chip revision number + */ uint8_t rp2040_chip_version(void); +/** + * @return the RP2040 rom version number + */ static inline uint8_t rp2040_rom_version(void) { return *(uint8_t*)0x13; } -// called by any tight hardware polling loop... nominally empty, but can be modified for debugging +/** + * Empty function intended to be called by any tight hardware polling loop. using this ubiquitously + * makes it much easier to find tight loops, but also in the future #ifdef-ed support for lockup + * debugging might be added + */ static inline void tight_loop_contents(void) {} -// return a 32 bit handle for a raw ptr; DMA chaining for example embeds pointers in 32 bit values -// which of course does not work if we're running the code natively on a 64 bit platform for testing. -// Therefore we provide this function which allows the host runtime to provide a mapping +/** + * Helper macro for making chain DMA code portable to PICO_PLATFORM=host. The problem here is + * that embedded pointers in the data are only 32 bit, which is a problem if the host + * system is 64 bit. This macro is zero cost on the actual device, but in host mode + * it provides a 64->32 bit mapping + */ #define native_safe_hw_ptr(x) ((uintptr_t)(x)) -// multiplies a by b using multiply instruction using the ARM mul instruction regardless of values +/** + * Multiplies a by b using multiply instruction using the ARM mul instruction regardless of values; + * i.e. this is a 1 cycle operation. + * + * \param a the first operand + * \param b the second operand + * \return a * b + */ inline static int32_t __mul_instruction(int32_t a, int32_t b) { asm ("mul %0, %1" : "+l" (a) : "l" (b) : ); return a; } -#define WRAPPER_FUNC(x) __wrap_ ## x -#define REAL_FUNC(x) __real_ ## x - -// macro to multiply value a by possibly constant value b -// if b is known to be constant and not zero or a power of 2, then a mul instruction is used rather than gcc's default +/** + * Efficiently Multiplies value a by possibly constant value b. + * If b is known to be constant and not zero or a power of 2, then a mul instruction is used rather than gcc's default + * which is often a slow combination of shifts and adds + * + * \param a the first operand + * \param b the second operand + * \return a * b + */ #define __fast_mul(a, b) __builtin_choose_expr(__builtin_constant_p(b) && !__builtin_constant_p(a), \ (__builtin_popcount(b) >= 2 ? __mul_instruction(a,b) : (a)*(b)), \ (a)*(b)) +#define WRAPPER_FUNC(x) __wrap_ ## x +#define REAL_FUNC(x) __real_ ## x + #define __check_type_compatible(type_a, type_b) static_assert(__builtin_types_compatible_p(type_a, type_b), __STRING(type_a) " is not compatible with " __STRING(type_b)); + #ifdef __cplusplus } #endif