Add more memory barriers to avoid code re-ordering issues with DMA (#155)

* Add more memory barriers to avoid code re-ordering issues with DMA

* Comment typos

* Fix Wstrict-prototype on __compiler_memory_barrier

* Remove now-redundant __compiler_barrier macro from hardware_flash

Co-authored-by: Luke Wren <wren6991@gmail.com>
Graham Sanderson 2021-02-19 07:11:56 -06:00 committed by graham sanderson
parent 4b7ffd71f0
commit 0732d0c2a3
5 changed files with 100 additions and 28 deletions

View File

@@ -126,11 +126,13 @@ void *decode_host_safe_hw_ptr(uint32_t ptr);
typedef unsigned int uint;
inline static int32_t __mul_instruction(int32_t a,int32_t b)
static inline int32_t __mul_instruction(int32_t a,int32_t b)
{
return a*b;
}
static inline void __compiler_memory_barrier(void) {
}
#ifdef __cplusplus
}
#endif

View File

@@ -540,6 +540,8 @@ inline static bool dma_channel_is_busy(uint channel) {
*/
inline static void dma_channel_wait_for_finish_blocking(uint channel) {
while (dma_channel_is_busy(channel)) tight_loop_contents();
// stop the compiler hoisting a non-volatile buffer access above the DMA completion.
__compiler_memory_barrier();
}
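For context, a hedged sketch of the pattern this barrier protects (the buffers and channel setup here are illustrative, not part of this diff): the destination buffer is not volatile, so without the barrier the compiler could legally hoist the read of dst above the busy-wait.

#include "hardware/dma.h"

static uint8_t src[64], dst[64];

static void copy_and_use(void) {
    int chan = dma_claim_unused_channel(true);
    dma_channel_config c = dma_channel_get_default_config(chan);
    channel_config_set_transfer_data_size(&c, DMA_SIZE_8);
    channel_config_set_read_increment(&c, true);
    channel_config_set_write_increment(&c, true);
    dma_channel_configure(chan, &c, dst, src, sizeof src, true); // start immediately
    dma_channel_wait_for_finish_blocking(chan);
    uint8_t first = dst[0]; // safe: ordered after the wait by the barrier above
    (void) first;
    dma_channel_unclaim(chan);
}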
/*! \brief Enable the DMA sniffing targeting the specified channel

View File

@@ -18,8 +18,6 @@
#define FLASH_RUID_DATA_BYTES 8
#define FLASH_RUID_TOTAL_BYTES (1 + FLASH_RUID_DUMMY_BYTES + FLASH_RUID_DATA_BYTES)
#define __compiler_barrier() asm volatile("" ::: "memory")
//-----------------------------------------------------------------------------
// Infrastructure for reentering XIP mode after exiting for programming (take
// a copy of boot2 before XIP exit). Calling boot2 as a function works because
@@ -38,7 +36,7 @@ static void __no_inline_not_in_flash_func(flash_init_boot2_copyout)(void) {
return;
for (int i = 0; i < BOOT2_SIZE_WORDS; ++i)
boot2_copyout[i] = ((uint32_t *)XIP_BASE)[i];
__compiler_barrier();
__compiler_memory_barrier();
boot2_copyout_valid = true;
}
@@ -77,7 +75,7 @@ void __no_inline_not_in_flash_func(flash_range_erase)(uint32_t flash_offs, size_
flash_init_boot2_copyout();
// No flash accesses after this point
__compiler_barrier();
__compiler_memory_barrier();
connect_internal_flash();
flash_exit_xip();
@@ -100,7 +98,7 @@ void __no_inline_not_in_flash_func(flash_range_program)(uint32_t flash_offs, con
assert(connect_internal_flash && flash_exit_xip && flash_range_program && flash_flush_cache);
flash_init_boot2_copyout();
__compiler_barrier();
__compiler_memory_barrier();
connect_internal_flash();
flash_exit_xip();
@@ -133,7 +131,7 @@ static void __no_inline_not_in_flash_func(flash_do_cmd)(const uint8_t *txbuf, ui
void (*flash_flush_cache)(void) = (void(*)(void))rom_func_lookup(rom_table_code('F', 'C'));
assert(connect_internal_flash && flash_exit_xip && flash_flush_cache);
flash_init_boot2_copyout();
__compiler_barrier();
__compiler_memory_barrier();
connect_internal_flash();
flash_exit_xip();
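
A hedged usage sketch for the functions patched above (the offset is illustrative): erase works on whole 4 kB sectors and programming on whole 256-byte pages, and interrupts are typically disabled around the calls because XIP is unavailable while they run.

#include "hardware/flash.h"
#include "hardware/sync.h"

#define MY_FLASH_OFFSET (512 * 1024) // hypothetical offset into flash

static void save_page(const uint8_t *data /* FLASH_PAGE_SIZE bytes */) {
    uint32_t ints = save_and_disable_interrupts();
    flash_range_erase(MY_FLASH_OFFSET, FLASH_SECTOR_SIZE);
    flash_range_program(MY_FLASH_OFFSET, data, FLASH_PAGE_SIZE);
    restore_interrupts(ints);
}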

View File

@@ -111,7 +111,18 @@ inline static void __wfi(void) {
* instruction will be observed before any explicit access after the instruction.
*/
inline static void __dmb(void) {
__asm volatile ("dmb");
__asm volatile ("dmb" : : : "memory");
}
/*! \brief Insert a DSB instruction into the code path.
* \ingroup hardware_sync
*
* The DSB (data synchronization barrier) acts as a special kind of data
* memory barrier (DMB). The DSB operation completes when all explicit memory
* accesses before this instruction complete.
*/
inline static void __dsb(void) {
__asm volatile ("dsb" : : : "memory");
}
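As an illustration of where the hardware barriers matter (this two-core flag pattern is a sketch, not from this commit), __dmb() orders the data store ahead of the flag store as observed by the other core:

#include "pico/platform.h"
#include "hardware/sync.h"

static uint32_t shared_data;
static volatile uint32_t shared_flag;

static void core0_publish(uint32_t value) {
    shared_data = value;
    __dmb();            // data store observable before the flag store
    shared_flag = 1;
}

static void core1_consume(void) {
    while (!shared_flag) tight_loop_contents();
    __dmb();            // flag observed before the data load below
    uint32_t v = shared_data;
    (void) v;
}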
/*! \brief Insert an ISB instruction into the code path.

View File

@@ -22,18 +22,31 @@ extern "C" {
#define __isr
// Section naming macros
#define __after_data(group) __attribute__((section(".after_data." group)))
#define __not_in_flash(group) __attribute__((section(".time_critical." group)))
#define __not_in_flash_func(x) __not_in_flash(__STRING(x)) x
#define __no_inline_not_in_flash_func(x) __attribute__((noinline)) __not_in_flash_func(x)
#define __scratch_x(group) __attribute__((section(".scratch_x." group)))
#define __scratch_y(group) __attribute__((section(".scratch_y." group)))
#define __uninitialized_ram(group) __attribute__((section(".uninitialized_ram." #group))) group
// For use with PICO_COPY_TO_RAM:
#define __in_flash(group) __attribute__((section(".flashdata" group)))
#define __scratch_x(group) __attribute__((section(".scratch_x." group)))
#define __scratch_y(group) __attribute__((section(".scratch_y." group)))
/**
* Decorates a function name, such that the function will execute from RAM (assuming it is not inlined
* into a flash function by the compiler)
*/
#define __not_in_flash_func(func_name) __not_in_flash(__STRING(func_name)) func_name
/**
* Historical synonym for __not_in_flash_func()
*/
#define __time_critical_func(func_name) __not_in_flash_func(func_name)
/**
* Decorates a function name, such that the function will execute from RAM, explicitly marking it as
* noinline to prevent it being inlined into a flash function by the compiler
*/
#define __no_inline_not_in_flash_func(func_name) __attribute__((noinline)) __not_in_flash_func(func_name)
#define __time_critical_func(x) __not_in_flash_func(x)
#define __after_data(group) __attribute__((section(".after_data." group)))
#define __packed_aligned __packed __aligned(4)
#ifndef count_of
@@ -48,19 +61,37 @@ extern "C" {
#define MIN(a, b) ((b)>(a)?(a):(b))
#endif
#define __uninitialized_ram(group) __attribute__((section(".uninitialized_ram." #group))) group
inline static void __breakpoint(void) {
/**
* Execute a breakpoint instruction
*/
static inline void __breakpoint(void) {
__asm__("bkpt #0");
}
/**
* Ensure that the compiler does not move memory accesses across this method call
*/
static inline void __compiler_memory_barrier(void) {
__asm__ volatile ("" : : : "memory");
}
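A minimal sketch of what this prevents (the buffer and flag are illustrative): the compiler only has to preserve the order of volatile accesses, so without the barrier it could sink the plain buffer writes past the volatile flag store.

#include <stdbool.h>
#include <stdint.h>

static uint8_t out_buf[64];
static volatile bool out_ready;

static void produce(void) {
    for (int i = 0; i < 64; ++i) out_buf[i] = (uint8_t) i;
    __compiler_memory_barrier(); // buffer writes stay before the flag store
    out_ready = true;            // e.g. consumed by an IRQ handler
}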
// return a 32 bit handle for a raw ptr; DMA chaining for example embeds pointers in 32 bit values
// which of course does not work if we're running the code natively on a 64-bit platform. Therefore
// we provide this macro which allows that code to provide a 64->32 bit mapping in host mode
#define host_safe_hw_ptr(x) ((uintptr_t)(x))
/**
* Panic (see panic()) with the message "Unsupported".
*/
void __attribute__((noreturn)) panic_unsupported(void);
/**
* Panic with a message. An attempt is made to output the message to all registered STDOUT drivers,
* after which this method executes a BKPT instruction.
*
* @param fmt format string (printf-like)
* @param ... printf-like arguments
*/
void __attribute__((noreturn)) panic(const char *fmt, ...);
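A brief usage sketch (the divider check itself is hypothetical):

#include "pico.h"

static void set_divider(uint32_t div) {
    if (div == 0)
        panic("invalid divider %u", (unsigned) div); // prints then hits BKPT
    // ...
}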
// PICO_CONFIG: PICO_NO_FPGA_CHECK, Remove the FPGA platform check for small code size reduction, type=bool, default=0, advanced=true, group=pico_runtime
@@ -74,36 +105,64 @@ static inline bool running_on_fpga(void) {return false;}
bool running_on_fpga(void);
#endif
/**
* @return the RP2040 chip revision number
*/
uint8_t rp2040_chip_version(void);
/**
* @return the RP2040 ROM version number
*/
static inline uint8_t rp2040_rom_version(void) {
return *(uint8_t*)0x13;
}
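For illustration, a short sketch printing both values over stdio (stdio setup assumed):

#include <stdio.h>
#include "pico/stdlib.h"

int main(void) {
    stdio_init_all();
    printf("chip version %u, ROM version %u\n",
           (unsigned) rp2040_chip_version(), (unsigned) rp2040_rom_version());
    return 0;
}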
// called by any tight hardware polling loop... nominally empty, but can be modified for debugging
/**
* Empty function intended to be called by any tight hardware polling loop. Using this ubiquitously
* makes it much easier to find tight loops, and it also leaves room for #ifdef-ed lockup-debugging
* support to be added in the future
*/
static inline void tight_loop_contents(void) {}
// return a 32 bit handle for a raw ptr; DMA chaining for example embeds pointers in 32 bit values
// which of course does not work if we're running the code natively on a 64 bit platform for testing.
// Therefore we provide this function which allows the host runtime to provide a mapping
/**
* Helper macro for making DMA chaining code portable to PICO_PLATFORM=host. The problem is that
* pointers embedded in the data are only 32 bits wide, which does not work when the host
* system is 64-bit. This macro is zero cost on the actual device, but in host mode
* it provides a 64->32 bit mapping
*/
#define native_safe_hw_ptr(x) ((uintptr_t)(x))
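A hedged sketch of the embedding this macro exists for (the control-block layout is illustrative of DMA chaining, not a specific driver): each word is written by one channel into another channel's read-address trigger, so it must fit in 32 bits on device and needs a mapping on a 64-bit host build.

#include "pico.h"

static const uint32_t part0[8], part1[8];

static uintptr_t control_blocks[] = {
    native_safe_hw_ptr(part0),
    native_safe_hw_ptr(part1),
    0 // null entry terminates the chain
};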
// multiplies a by b using multiply instruction using the ARM mul instruction regardless of values
/**
* Multiplies a by b using the ARM mul instruction regardless of values;
* i.e. this is a 1-cycle operation.
*
* \param a the first operand
* \param b the second operand
* \return a * b
*/
inline static int32_t __mul_instruction(int32_t a, int32_t b) {
asm ("mul %0, %1" : "+l" (a) : "l" (b) : );
return a;
}
#define WRAPPER_FUNC(x) __wrap_ ## x
#define REAL_FUNC(x) __real_ ## x
// macro to multiply value a by possibly constant value b
// if b is known to be constant and not zero or a power of 2, then a mul instruction is used rather than gcc's default
/**
* Efficiently multiplies value a by possibly constant value b.
* If b is known to be constant, and neither zero nor a power of 2, then a mul instruction is used
* rather than gcc's default, which is often a slow combination of shifts and adds
*
* \param a the first operand
* \param b the second operand
* \return a * b
*/
#define __fast_mul(a, b) __builtin_choose_expr(__builtin_constant_p(b) && !__builtin_constant_p(a), \
(__builtin_popcount(b) >= 2 ? __mul_instruction(a,b) : (a)*(b)), \
(a)*(b))
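A brief hedged usage note: with a non-trivial constant b the macro selects the single mul instruction, while powers of two still fold to a shift.

#include "pico.h"

// 80 = 0b1010000 has popcount 2, so this expands to __mul_instruction(x, 80);
// __fast_mul(x, 64) would instead stay as (x)*(64), i.e. a single shift.
static int32_t scale(int32_t x) {
    return __fast_mul(x, 80);
}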
#define WRAPPER_FUNC(x) __wrap_ ## x
#define REAL_FUNC(x) __real_ ## x
#define __check_type_compatible(type_a, type_b) static_assert(__builtin_types_compatible_p(type_a, type_b), __STRING(type_a) " is not compatible with " __STRING(type_b));
#ifdef __cplusplus
}
#endif