Add more memory barriers to avoid code re-ordering issues with DMA (#155)
* Add more memory barriers to avoid code re-ordering issues with DMA * Comment typos * Fix Wstrict-prototype on __compiler_memory_barrier * Remove now-redundant __compiler_barrier macro from hardware_flash Co-authored-by: Luke Wren <wren6991@gmail.com>
This commit is contained in:
parent
4b7ffd71f0
commit
0732d0c2a3
@ -126,11 +126,13 @@ void *decode_host_safe_hw_ptr(uint32_t ptr);
|
||||
|
||||
typedef unsigned int uint;
|
||||
|
||||
inline static int32_t __mul_instruction(int32_t a,int32_t b)
|
||||
static inline int32_t __mul_instruction(int32_t a,int32_t b)
|
||||
{
|
||||
return a*b;
|
||||
}
|
||||
|
||||
static inline void __compiler_memory_barrier(void) {
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -540,6 +540,8 @@ inline static bool dma_channel_is_busy(uint channel) {
|
||||
*/
|
||||
inline static void dma_channel_wait_for_finish_blocking(uint channel) {
|
||||
while (dma_channel_is_busy(channel)) tight_loop_contents();
|
||||
// stop the compiler hoisting a non volatile buffer access above the DMA completion.
|
||||
__compiler_memory_barrier();
|
||||
}
|
||||
|
||||
/*! \brief Enable the DMA sniffing targeting the specified channel
|
||||
|
@ -18,8 +18,6 @@
|
||||
#define FLASH_RUID_DATA_BYTES 8
|
||||
#define FLASH_RUID_TOTAL_BYTES (1 + FLASH_RUID_DUMMY_BYTES + FLASH_RUID_DATA_BYTES)
|
||||
|
||||
#define __compiler_barrier() asm volatile("" ::: "memory")
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Infrastructure for reentering XIP mode after exiting for programming (take
|
||||
// a copy of boot2 before XIP exit). Calling boot2 as a function works because
|
||||
@ -38,7 +36,7 @@ static void __no_inline_not_in_flash_func(flash_init_boot2_copyout)(void) {
|
||||
return;
|
||||
for (int i = 0; i < BOOT2_SIZE_WORDS; ++i)
|
||||
boot2_copyout[i] = ((uint32_t *)XIP_BASE)[i];
|
||||
__compiler_barrier();
|
||||
__compiler_memory_barrier();
|
||||
boot2_copyout_valid = true;
|
||||
}
|
||||
|
||||
@ -77,7 +75,7 @@ void __no_inline_not_in_flash_func(flash_range_erase)(uint32_t flash_offs, size_
|
||||
flash_init_boot2_copyout();
|
||||
|
||||
// No flash accesses after this point
|
||||
__compiler_barrier();
|
||||
__compiler_memory_barrier();
|
||||
|
||||
connect_internal_flash();
|
||||
flash_exit_xip();
|
||||
@ -100,7 +98,7 @@ void __no_inline_not_in_flash_func(flash_range_program)(uint32_t flash_offs, con
|
||||
assert(connect_internal_flash && flash_exit_xip && flash_range_program && flash_flush_cache);
|
||||
flash_init_boot2_copyout();
|
||||
|
||||
__compiler_barrier();
|
||||
__compiler_memory_barrier();
|
||||
|
||||
connect_internal_flash();
|
||||
flash_exit_xip();
|
||||
@ -133,7 +131,7 @@ static void __no_inline_not_in_flash_func(flash_do_cmd)(const uint8_t *txbuf, ui
|
||||
void (*flash_flush_cache)(void) = (void(*)(void))rom_func_lookup(rom_table_code('F', 'C'));
|
||||
assert(connect_internal_flash && flash_exit_xip && flash_flush_cache);
|
||||
flash_init_boot2_copyout();
|
||||
__compiler_barrier();
|
||||
__compiler_memory_barrier();
|
||||
connect_internal_flash();
|
||||
flash_exit_xip();
|
||||
|
||||
|
@ -111,7 +111,18 @@ inline static void __wfi(void) {
|
||||
* instruction will be observed before any explicit access after the instruction.
|
||||
*/
|
||||
inline static void __dmb(void) {
|
||||
__asm volatile ("dmb");
|
||||
__asm volatile ("dmb" : : : "memory");
|
||||
}
|
||||
|
||||
/*! \brief Insert a DSB instruction in to the code path.
|
||||
* \ingroup hardware_sync
|
||||
*
|
||||
* The DSB (data synchronization barrier) acts as a special kind of data
|
||||
* memory barrier (DMB). The DSB operation completes when all explicit memory
|
||||
* accesses before this instruction complete.
|
||||
*/
|
||||
inline static void __dsb(void) {
|
||||
__asm volatile ("dsb" : : : "memory");
|
||||
}
|
||||
|
||||
/*! \brief Insert a ISB instruction in to the code path.
|
||||
|
@ -22,18 +22,31 @@ extern "C" {
|
||||
|
||||
#define __isr
|
||||
|
||||
// Section naming macros
|
||||
#define __after_data(group) __attribute__((section(".after_data." group)))
|
||||
#define __not_in_flash(group) __attribute__((section(".time_critical." group)))
|
||||
#define __not_in_flash_func(x) __not_in_flash(__STRING(x)) x
|
||||
#define __no_inline_not_in_flash_func(x) __attribute__((noinline)) __not_in_flash_func(x)
|
||||
|
||||
#define __scratch_x(group) __attribute__((section(".scratch_x." group)))
|
||||
#define __scratch_y(group) __attribute__((section(".scratch_y." group)))
|
||||
#define __uninitialized_ram(group) __attribute__((section(".uninitialized_ram." #group))) group
|
||||
// For use with PICO_COPY_TO_RAM:
|
||||
#define __in_flash(group) __attribute__((section(".flashdata" group)))
|
||||
|
||||
#define __scratch_x(group) __attribute__((section(".scratch_x." group)))
|
||||
#define __scratch_y(group) __attribute__((section(".scratch_y." group)))
|
||||
/**
|
||||
* Decorates a function name, such that the function will execute from RAM (assuming it is not inlined
|
||||
* into a flash function by the compiler)
|
||||
*/
|
||||
#define __not_in_flash_func(func_name) __not_in_flash(__STRING(func_name)) func_name
|
||||
/**
|
||||
* Historical synonym for __not_in_flash_func()
|
||||
*/
|
||||
#define __time_critical_func(func_name) __not_in_flash_func(func_name)
|
||||
|
||||
/**
|
||||
* Decorates a function name, such that the function will execute from RAM, explicitly marking it as
|
||||
* noinline to prevent it being inlined into a flash function by the compiler
|
||||
*/
|
||||
#define __no_inline_not_in_flash_func(func_name) __attribute__((noinline)) __not_in_flash_func(func_name)
|
||||
|
||||
#define __time_critical_func(x) __not_in_flash_func(x)
|
||||
#define __after_data(group) __attribute__((section(".after_data." group)))
|
||||
#define __packed_aligned __packed __aligned(4)
|
||||
|
||||
#ifndef count_of
|
||||
@ -48,19 +61,37 @@ extern "C" {
|
||||
#define MIN(a, b) ((b)>(a)?(a):(b))
|
||||
#endif
|
||||
|
||||
#define __uninitialized_ram(group) __attribute__((section(".uninitialized_ram." #group))) group
|
||||
|
||||
inline static void __breakpoint(void) {
|
||||
/**
|
||||
* Execute a breakpoint instruction
|
||||
*/
|
||||
static inline void __breakpoint(void) {
|
||||
__asm__("bkpt #0");
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure that the compiler does not move memory access across this method call
|
||||
*/
|
||||
static inline void __compiler_memory_barrier(void) {
|
||||
__asm__ volatile ("" : : : "memory");
|
||||
}
|
||||
|
||||
// return a 32 bit handle for a raw ptr; DMA chaining for example embeds pointers in 32 bit values
|
||||
// which of course does not work if we're running the code natively on a 64 bit platforms. Therefore
|
||||
// we provide this macro which allows that code to provide a 64->32 bit mapping in host mode
|
||||
#define host_safe_hw_ptr(x) ((uintptr_t)(x))
|
||||
|
||||
/**
|
||||
* Panic (see panic()) with the message "Unsupported".
|
||||
*/
|
||||
void __attribute__((noreturn)) panic_unsupported(void);
|
||||
|
||||
/**
|
||||
* Panic with a message. An attempt is made to output the message to all registered STDOUT drivers
|
||||
* after which this method executes a BKPT instruction.
|
||||
*
|
||||
* @param fmt format string (printf-like)
|
||||
* @param ... printf-like arguments
|
||||
*/
|
||||
void __attribute__((noreturn)) panic(const char *fmt, ...);
|
||||
|
||||
// PICO_CONFIG: PICO_NO_FPGA_CHECK, Remove the FPGA platform check for small code size reduction, type=bool, default=0, advanced=true, group=pico_runtime
|
||||
@ -74,36 +105,64 @@ static inline bool running_on_fpga(void) {return false;}
|
||||
bool running_on_fpga(void);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @return the RP2040 chip revision number
|
||||
*/
|
||||
uint8_t rp2040_chip_version(void);
|
||||
|
||||
/**
|
||||
* @return the RP2040 rom version number
|
||||
*/
|
||||
static inline uint8_t rp2040_rom_version(void) {
|
||||
return *(uint8_t*)0x13;
|
||||
}
|
||||
|
||||
// called by any tight hardware polling loop... nominally empty, but can be modified for debugging
|
||||
/**
|
||||
* Empty function intended to be called by any tight hardware polling loop. using this ubiquitously
|
||||
* makes it much easier to find tight loops, but also in the future #ifdef-ed support for lockup
|
||||
* debugging might be added
|
||||
*/
|
||||
static inline void tight_loop_contents(void) {}
|
||||
|
||||
// return a 32 bit handle for a raw ptr; DMA chaining for example embeds pointers in 32 bit values
|
||||
// which of course does not work if we're running the code natively on a 64 bit platform for testing.
|
||||
// Therefore we provide this function which allows the host runtime to provide a mapping
|
||||
/**
|
||||
* Helper macro for making chain DMA code portable to PICO_PLATFORM=host. The problem here is
|
||||
* that embedded pointers in the data are only 32 bit, which is a problem if the host
|
||||
* system is 64 bit. This macro is zero cost on the actual device, but in host mode
|
||||
* it provides a 64->32 bit mapping
|
||||
*/
|
||||
#define native_safe_hw_ptr(x) ((uintptr_t)(x))
|
||||
|
||||
// multiplies a by b using multiply instruction using the ARM mul instruction regardless of values
|
||||
/**
|
||||
* Multiplies a by b using multiply instruction using the ARM mul instruction regardless of values;
|
||||
* i.e. this is a 1 cycle operation.
|
||||
*
|
||||
* \param a the first operand
|
||||
* \param b the second operand
|
||||
* \return a * b
|
||||
*/
|
||||
inline static int32_t __mul_instruction(int32_t a, int32_t b) {
|
||||
asm ("mul %0, %1" : "+l" (a) : "l" (b) : );
|
||||
return a;
|
||||
}
|
||||
|
||||
#define WRAPPER_FUNC(x) __wrap_ ## x
|
||||
#define REAL_FUNC(x) __real_ ## x
|
||||
|
||||
// macro to multiply value a by possibly constant value b
|
||||
// if b is known to be constant and not zero or a power of 2, then a mul instruction is used rather than gcc's default
|
||||
/**
|
||||
* Efficiently Multiplies value a by possibly constant value b.
|
||||
* If b is known to be constant and not zero or a power of 2, then a mul instruction is used rather than gcc's default
|
||||
* which is often a slow combination of shifts and adds
|
||||
*
|
||||
* \param a the first operand
|
||||
* \param b the second operand
|
||||
* \return a * b
|
||||
*/
|
||||
#define __fast_mul(a, b) __builtin_choose_expr(__builtin_constant_p(b) && !__builtin_constant_p(a), \
|
||||
(__builtin_popcount(b) >= 2 ? __mul_instruction(a,b) : (a)*(b)), \
|
||||
(a)*(b))
|
||||
|
||||
#define WRAPPER_FUNC(x) __wrap_ ## x
|
||||
#define REAL_FUNC(x) __real_ ## x
|
||||
|
||||
#define __check_type_compatible(type_a, type_b) static_assert(__builtin_types_compatible_p(type_a, type_b), __STRING(type_a) " is not compatible with " __STRING(type_b));
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user