From 6d87da4c59d2e72674e5095fbcf120e0bf9ef02d Mon Sep 17 00:00:00 2001 From: Graham Sanderson Date: Wed, 5 May 2021 11:46:25 -0500 Subject: [PATCH] Rework lock_core / timers (#378) - Add recursive_mutex - Make all locking primitives and sleep use common overridable wait/notify support to allow RTOS implementations to replace WFE/SEV with something more appropriate - Add busy_wait_ms --- pico_sdk_init.cmake | 2 +- src/common/pico_sync/critical_section.c | 15 +- .../pico_sync/include/pico/critical_section.h | 37 ++-- src/common/pico_sync/include/pico/lock_core.h | 165 +++++++++++++++++- src/common/pico_sync/include/pico/mutex.h | 41 ++++- src/common/pico_sync/mutex.c | 92 ++++++---- src/common/pico_sync/sem.c | 53 +++--- src/common/pico_time/time.c | 31 ++-- .../pico_util/include/pico/util/queue.h | 8 +- src/common/pico_util/queue.c | 107 +++++++----- src/rp2_common/hardware_sync/sync.c | 1 + .../hardware_timer/include/hardware/timer.h | 11 +- src/rp2_common/hardware_timer/timer.c | 9 + src/rp2_common/pico_runtime/runtime.c | 6 +- src/rp2_common/pico_stdio/stdio.c | 8 +- 15 files changed, 434 insertions(+), 152 deletions(-) diff --git a/pico_sdk_init.cmake b/pico_sdk_init.cmake index 879842c..e8405b2 100644 --- a/pico_sdk_init.cmake +++ b/pico_sdk_init.cmake @@ -35,8 +35,8 @@ if (NOT TARGET _pico_sdk_pre_init_marker) include(pico_pre_load_platform) - # todo perhaps this should be included by the platform instead? # We want to configure correct toolchain prior to project load + # todo perhaps this should be included by the platform instead? include(pico_pre_load_toolchain) macro(pico_sdk_init) diff --git a/src/common/pico_sync/critical_section.c b/src/common/pico_sync/critical_section.c index 893afb6..f28732b 100644 --- a/src/common/pico_sync/critical_section.c +++ b/src/common/pico_sync/critical_section.c @@ -10,15 +10,18 @@ static_assert(sizeof(critical_section_t) == 8, ""); #endif -void critical_section_init(critical_section_t *critsec) { - critical_section_init_with_lock_num(critsec, (uint)spin_lock_claim_unused(true)); +void critical_section_init(critical_section_t *crit_sec) { + critical_section_init_with_lock_num(crit_sec, (uint)spin_lock_claim_unused(true)); } -void critical_section_init_with_lock_num(critical_section_t *critsec, uint lock_num) { - lock_init(&critsec->core, lock_num); +void critical_section_init_with_lock_num(critical_section_t *crit_sec, uint lock_num) { + crit_sec->spin_lock = spin_lock_instance(lock_num); __mem_fence_release(); } -void critical_section_deinit(critical_section_t *critsec) { - spin_lock_unclaim(spin_lock_get_num(critsec->core.spin_lock)); +void critical_section_deinit(critical_section_t *crit_sec) { + spin_lock_unclaim(spin_lock_get_num(crit_sec->spin_lock)); +#ifndef NDEBUG + crit_sec->spin_lock = (spin_lock_t *)-1; +#endif } \ No newline at end of file diff --git a/src/common/pico_sync/include/pico/critical_section.h b/src/common/pico_sync/include/pico/critical_section.h index 17a8b3f..2f94494 100644 --- a/src/common/pico_sync/include/pico/critical_section.h +++ b/src/common/pico_sync/include/pico/critical_section.h @@ -22,11 +22,12 @@ extern "C" { * from the other core, and from (higher priority) interrupts on the same core. It does the former * using a spin lock and the latter by disabling interrupts on the calling core. * - * Because interrupts are disabled by this function, uses of the critical_section should be as short as possible. 
+ * Because interrupts are disabled when a critical_section is owned, uses of the critical_section + * should be as short as possible. */ typedef struct __packed_aligned critical_section { - lock_core_t core; + spin_lock_t *spin_lock; uint32_t save; } critical_section_t; @@ -38,16 +39,16 @@ typedef struct __packed_aligned critical_section { * critical sections, however if you do so you *must* use \ref critical_section_init_with_lock_num * to ensure that the spin lock's used are different. * - * \param critsec Pointer to critical_section structure + * \param crit_sec Pointer to critical_section structure */ -void critical_section_init(critical_section_t *critsec); +void critical_section_init(critical_section_t *crit_sec); /*! \brief Initialise a critical_section structure assigning a specific spin lock number * \ingroup critical_section - * \param critsec Pointer to critical_section structure + * \param crit_sec Pointer to critical_section structure * \param lock_num the specific spin lock number to use */ -void critical_section_init_with_lock_num(critical_section_t *critsec, uint lock_num); +void critical_section_init_with_lock_num(critical_section_t *crit_sec, uint lock_num); /*! \brief Enter a critical_section * \ingroup critical_section @@ -55,20 +56,32 @@ void critical_section_init_with_lock_num(critical_section_t *critsec, uint lock_ * If the spin lock associated with this critical section is in use, then this * method will block until it is released. * - * \param critsec Pointer to critical_section structure + * \param crit_sec Pointer to critical_section structure */ -static inline void critical_section_enter_blocking(critical_section_t *critsec) { - critsec->save = spin_lock_blocking(critsec->core.spin_lock); +static inline void critical_section_enter_blocking(critical_section_t *crit_sec) { + crit_sec->save = spin_lock_blocking(crit_sec->spin_lock); } /*! \brief Release a critical_section * \ingroup critical_section * - * \param critsec Pointer to critical_section structure + * \param crit_sec Pointer to critical_section structure */ -static inline void critical_section_exit(critical_section_t *critsec) { - spin_unlock(critsec->core.spin_lock, critsec->save); +static inline void critical_section_exit(critical_section_t *crit_sec) { + spin_unlock(crit_sec->spin_lock, crit_sec->save); } + +/*! \brief De-Initialise a critical_section created by the critical_section_init method + * \ingroup critical_section + * + * This method is only used to free the associated spin lock allocated via + * the critical_section_init method (it should not be used to de-initialize a spin lock + * created via critical_section_init_with_lock_num). After this call, the critical section is invalid + * + * \param crit_sec Pointer to critical_section structure + */ +void critical_section_deinit(critical_section_t *crit_sec); + #ifdef __cplusplus } #endif diff --git a/src/common/pico_sync/include/pico/lock_core.h b/src/common/pico_sync/include/pico/lock_core.h index d8f36f2..004ee45 100644 --- a/src/common/pico_sync/include/pico/lock_core.h +++ b/src/common/pico_sync/include/pico/lock_core.h @@ -8,26 +8,183 @@ #define _PICO_LOCK_CORE_H #include "pico.h" +#include "pico/time.h" #include "hardware/sync.h" +/** \file lock_core.h + * \defgroup lock_core lock_core + * \ingroup pico_sync + * \brief base synchronization/lock primitive support + * + * Most of the pico_sync locking primitives contain a lock_core_t structure member. 
This currently just holds a spin + lock which is used only to protect the contents of the rest of the structure as part of implementing the synchronization + * primitive. As such, the spin_lock member of lock core is never still held on return from any of the primitive's functions. + * + * \ref critical_section is an exceptional case in that it does not have a lock_core_t and simply wraps a spin lock, providing + * methods to lock and unlock said spin lock. + * + * lock_core based structures work by locking the spin lock, checking state, and then deciding whether they additionally need to block + * or notify when the spin lock is released. In the blocking case, they will wake up again in the future, and try the process again. + * + * By default the SDK just uses the processors' events via SEV and WFE for notification and blocking as these are sufficient for + * cross-core notification, and for notification from interrupt handlers. However, macros are defined in this file that abstract the wait + * and notify mechanisms to allow the SDK locking functions to effectively be used within an RTOS or other environment. + * + * When implementing an RTOS, it is desirable for the SDK synchronization primitives that wait to block the calling task (and immediately yield), + * and for those that notify to wake a blocked task which isn't on a processor. At least the wait macro implementation needs to be atomic with the protecting + * spin_lock unlock from the caller's point of view; i.e. the task should unlock the spin lock as it starts its wait. Such an implementation is + * up to the RTOS integration, however the macros are defined such that these operations are always combined into a single call + * (so they can be performed atomically) even though the default implementation does not need this, as a WFE which starts + * following the corresponding SEV is not missed. + */ + // PICO_CONFIG: PARAM_ASSERTIONS_ENABLED_LOCK_CORE, Enable/disable assertions in the lock core, type=bool, default=0, group=pico_sync #ifndef PARAM_ASSERTIONS_ENABLED_LOCK_CORE #define PARAM_ASSERTIONS_ENABLED_LOCK_CORE 0 #endif /** \file lock_core.h - * \ingroup pico_sync + * \ingroup lock_core * - * Base implementation for locking primitives protected by a spin lock + * Base implementation for locking primitives protected by a spin lock. The spin lock is only used to protect + * access to the remaining lock state (in primitives using lock_core); it is never left locked outside + * of the function implementations. */ -typedef struct lock_core { +struct lock_core { // spin lock protecting this lock's state spin_lock_t *spin_lock; // note any lock members in containing structures need not be volatile; // they are protected by memory/compiler barriers when gaining and release spin locks -} lock_core_t; +}; +typedef struct lock_core lock_core_t; + +/*! \brief Initialise a lock structure + * \ingroup lock_core + * + * Initialize a lock structure, providing the spin lock number to use for protecting internal state. + * + * \param core Pointer to the lock_core to initialize + * \param lock_num Spin lock number to use for the lock. As the spin lock is only used internally to the locking primitive + * method implementations, this does not need to be globally unique, however it could suffer contention + */ void lock_init(lock_core_t *core, uint lock_num); +#ifndef lock_owner_id_t +/*! \brief type to use to store the 'owner' of a lock.
+ * \ingroup lock_core + * By default this is int8_t as it only needs to store the core number or -1, however it may be + * overridden if a larger type is required (e.g. for an RTOS task id) + */ +#define lock_owner_id_t int8_t #endif + +#ifndef LOCK_INVALID_OWNER_ID +/*! \brief marker value to use for a lock_owner_id_t which does not refer to any valid owner + * \ingroup lock_core + */ +#define LOCK_INVALID_OWNER_ID ((lock_owner_id_t)-1) +#endif + +#ifndef lock_get_caller_owner_id +/*! \brief return the owner id for the caller + * \ingroup lock_core + * By default this returns the calling core number, but may be overridden (e.g. to return an RTOS task id) + */ +#define lock_get_caller_owner_id() ((lock_owner_id_t)get_core_num()) +#endif + +#ifndef lock_internal_spin_unlock_with_wait +/*! \brief Atomically unlock the lock's spin lock, and wait for a notification. + * \ingroup lock_core + * + * _Atomic_ here refers to the fact that it should not be possible for a concurrent lock_internal_spin_unlock_with_notify + * to insert itself between the spin unlock and this wait in a way that the wait does not see the notification (i.e. causing + * a missed notification). In other words this method should always wake up in response to a lock_internal_spin_unlock_with_notify + * for the same lock, which completes after this call starts. + * + * In an ideal implementation, this method would return exactly after the corresponding lock_internal_spin_unlock_with_notify + * has subsequently been called on the same lock instance, however this method is free to return at _any_ point before that; + * this macro is _always_ used in a loop which locks the spin lock, checks the internal locking primitive state and then + * waits again if the calling thread should not proceed. + * + * By default this macro simply unlocks the spin lock, and then performs a WFE, but may be overridden + * (e.g. to actually block the RTOS task). + * + * \param lock the lock_core for the primitive which needs to block + * \param save the uint32_t value that should be passed to spin_unlock when the spin lock is unlocked. (i.e. the `PRIMASK` + * state when the spin lock was acquired) + */ +#define lock_internal_spin_unlock_with_wait(lock, save) spin_unlock((lock)->spin_lock, save), __wfe() +#endif + +#ifndef lock_internal_spin_unlock_with_notify +/*! \brief Atomically unlock the lock's spin lock, and send a notification + * \ingroup lock_core + * + * _Atomic_ here refers to the fact that it should not be possible for this notification to happen during a + * lock_internal_spin_unlock_with_wait in a way that that wait does not see the notification (i.e. causing + * a missed notification). In other words this method should always wake up any lock_internal_spin_unlock_with_wait + * which started before this call completes. + * + * In an ideal implementation, this method would wake up only the corresponding lock_internal_spin_unlock_with_wait + * that has been called on the same lock instance, however it is free to wake up any of them, as they will check + * their condition and then re-wait if necessary. + * + * By default this macro simply unlocks the spin lock, and then performs a SEV, but may be overridden + * (e.g. to actually un-block RTOS task(s)). + * + * \param lock the lock_core for the primitive which needs to block + * \param save the uint32_t value that should be passed to spin_unlock when the spin lock is unlocked. (i.e. 
the PRIMASK + * state when the spin lock was acquire) + */ +#define lock_internal_spin_unlock_with_notify(lock, save) spin_unlock((lock)->spin_lock, save), __sev() +#endif + +#ifndef lock_internal_spin_unlock_with_best_effort_wait_or_timeout +/*! \brief Atomically unlock the lock's spin lock, and wait for a notification or a timeout + * \ingroup lock_core + * + * _Atomic_ here refers to the fact that it should not be possible for a concurrent lock_internal_spin_unlock_with_notify + * to insert itself between the spin unlock and this wait in a way that the wait does not see the notification (i.e. causing + * a missed notification). In other words this method should always wake up in response to a lock_internal_spin_unlock_with_notify + * for the same lock, which completes after this call starts. + * + * In an ideal implementation, this method would return exactly after the corresponding lock_internal_spin_unlock_with_notify + * has subsequently been called on the same lock instance or the timeout has been reached, however this method is free to return + * at _any_ point before that; this macro is _always_ used in a loop which locks the spin lock, checks the internal locking + * primitive state and then waits again if the calling thread should not proceed. + * + * By default this simply unlocks the spin lock, and then calls \ref best_effort_wfe_or_timeout + * but may be overridden (e.g. to actually block the RTOS task with a timeout). + * + * \param lock the lock_core for the primitive which needs to block + * \param save the uint32_t value that should be passed to spin_unlock when the spin lock is unlocked. (i.e. the PRIMASK + * state when the spin lock was acquire) + * \param until the \ref absolute_time_t value + * \return true if the timeout has been reached + */ +#define lock_internal_spin_unlock_with_best_effort_wait_or_timeout(lock, save, until) ({ \ + spin_unlock((lock)->spin_lock, save); \ + best_effort_wfe_or_timeout(until); \ +}) +#endif + +#ifndef sync_internal_yield_until_before +/*! \brief yield to other processing until some time before the requested time + * \ingroup lock_core + * + * This method is provided for cases where the caller has no useful work to do + * until the specified time. + * + * By default this method does nothing, however if can be overridden (for example by an + * RTOS which is able to block the current task until the scheduler tick before + * the given time) + * + * \param until the \ref absolute_time_t value + */ +#define sync_internal_yield_until_before(until) ((void)0) +#endif + +#endif \ No newline at end of file diff --git a/src/common/pico_sync/include/pico/mutex.h b/src/common/pico_sync/include/pico/mutex.h index 4b5d175..6ed9785 100644 --- a/src/common/pico_sync/include/pico/mutex.h +++ b/src/common/pico_sync/include/pico/mutex.h @@ -28,12 +28,17 @@ extern "C" { * * See \ref critical_section.h for protecting access between multiple cores AND IRQ handlers */ - typedef struct __packed_aligned mutex { lock_core_t core; - int8_t owner; //! core number or -1 for unowned + lock_owner_id_t owner; //! owner id LOCK_INVALID_OWNER_ID for unowned + uint8_t recursion_state; //! 0 means non recursive (owner or unowned) + //! 1 is a maxed out recursive lock + //! 2-254 is an owned lock + //! 255 is an un-owned lock } mutex_t; +#define MAX_RECURSION_STATE ((uint8_t)255) + /*! \brief Initialise a mutex structure * \ingroup mutex * @@ -41,6 +46,15 @@ typedef struct __packed_aligned mutex { */ void mutex_init(mutex_t *mtx); +/*! 
\brief Initialise a recursive mutex structure + * \ingroup mutex + * + * A recursive mutex may be entered in a nested fashion by the same owner + * + * \param mtx Pointer to mutex structure + */ +void recursive_mutex_init(mutex_t *mtx); + /*! \brief Take ownership of a mutex * \ingroup mutex * @@ -129,6 +143,29 @@ static inline bool mutex_is_initialzed(mutex_t *mtx) { */ #define auto_init_mutex(name) static __attribute__((section(".mutex_array"))) mutex_t name +/*! \brief Helper macro for static definition of recursive mutexes + * \ingroup mutex + * + * A recursive mutex defined as follows: + * + * ```c + * auto_init_recursive_mutex(my_mutex); + * ``` + * + * Is equivalent to doing + * + * ```c + * static mutex_t my_mutex; + * + * void my_init_function() { + * recursive_mutex_init(&my_mutex); + * } + * ``` + * + * But the initialization of the mutex is performed automatically during runtime initialization + */ +#define auto_init_recursive_mutex(name) static __attribute__((section(".mutex_array"))) mutex_t name = { .recursion_state = MAX_RECURSION_STATE } + #ifdef __cplusplus } #endif diff --git a/src/common/pico_sync/mutex.c b/src/common/pico_sync/mutex.c index 4b4bb27..a758abf 100644 --- a/src/common/pico_sync/mutex.c +++ b/src/common/pico_sync/mutex.c @@ -7,37 +7,53 @@ #include "pico/mutex.h" #include "pico/time.h" -#if !PICO_NO_HARDWARE -static_assert(sizeof(mutex_t) == 8, ""); -#endif +static void mutex_init_internal(mutex_t *mtx, uint8_t recursion_state) { + lock_init(&mtx->core, next_striped_spin_lock_num()); + mtx->owner = LOCK_INVALID_OWNER_ID; + mtx->recursion_state = recursion_state; + __mem_fence_release(); +} void mutex_init(mutex_t *mtx) { - lock_init(&mtx->core, next_striped_spin_lock_num()); - mtx->owner = -1; - __mem_fence_release(); + mutex_init_internal(mtx, 0); +} + +void recursive_mutex_init(mutex_t *mtx) { + mutex_init_internal(mtx, MAX_RECURSION_STATE); } void __time_critical_func(mutex_enter_blocking)(mutex_t *mtx) { assert(mtx->core.spin_lock); - bool block = true; do { uint32_t save = spin_lock_blocking(mtx->core.spin_lock); - if (mtx->owner < 0) { - mtx->owner = (int8_t)get_core_num(); - block = false; + lock_owner_id_t caller = lock_get_caller_owner_id(); + if (mtx->owner == LOCK_INVALID_OWNER_ID) { + mtx->owner = caller; + if (mtx->recursion_state) { + assert(mtx->recursion_state == MAX_RECURSION_STATE); + mtx->recursion_state--; + } + } else if (mtx->owner == caller && mtx->recursion_state > 1) { + mtx->recursion_state--; + } else { + lock_internal_spin_unlock_with_wait(&mtx->core, save); + // spin lock already unlocked, so loop again + continue; } spin_unlock(mtx->core.spin_lock, save); - if (block) { - __wfe(); - } - } while (block); + break; + } while (true); } bool __time_critical_func(mutex_try_enter)(mutex_t *mtx, uint32_t *owner_out) { bool entered; uint32_t save = spin_lock_blocking(mtx->core.spin_lock); - if (mtx->owner < 0) { - mtx->owner = (int8_t)get_core_num(); + lock_owner_id_t caller = lock_get_caller_owner_id(); + if (mtx->owner == LOCK_INVALID_OWNER_ID) { + mtx->owner = lock_get_caller_owner_id(); + entered = true; + } else if (mtx->owner == caller && mtx->recursion_state > 1) { + mtx->recursion_state--; entered = true; } else { if (owner_out) *owner_out = (uint32_t) mtx->owner; @@ -53,27 +69,41 @@ bool __time_critical_func(mutex_enter_timeout_ms)(mutex_t *mtx, uint32_t timeout bool __time_critical_func(mutex_enter_block_until)(mutex_t *mtx, absolute_time_t until) { assert(mtx->core.spin_lock); - bool block = true; do { uint32_t save = 
spin_lock_blocking(mtx->core.spin_lock); - if (mtx->owner < 0) { - mtx->owner = (int8_t)get_core_num(); - block = false; - } - spin_unlock(mtx->core.spin_lock, save); - if (block) { - if (best_effort_wfe_or_timeout(until)) { + lock_owner_id_t caller = lock_get_caller_owner_id(); + if (mtx->owner == LOCK_INVALID_OWNER_ID) { + mtx->owner = caller; + } else if (mtx->owner == caller && mtx->recursion_state > 1) { + mtx->recursion_state--; + } else { + if (lock_internal_spin_unlock_with_best_effort_wait_or_timeout(&mtx->core, save, until)) { + // timed out return false; + } else { + // not timed out; spin lock already unlocked, so loop again + continue; } } - } while (block); - return true; + spin_unlock(mtx->core.spin_lock, save); + return true; + } while (true); } void __time_critical_func(mutex_exit)(mutex_t *mtx) { uint32_t save = spin_lock_blocking(mtx->core.spin_lock); - assert(mtx->owner >= 0); - mtx->owner = -1; - __sev(); - spin_unlock(mtx->core.spin_lock, save); -} + assert(mtx->owner != LOCK_INVALID_OWNER_ID); + if (!mtx->recursion_state) { + mtx->owner = LOCK_INVALID_OWNER_ID; + lock_internal_spin_unlock_with_notify(&mtx->core, save); + } else { + mtx->recursion_state++; + assert(mtx->recursion_state); + if (mtx->recursion_state == MAX_RECURSION_STATE) { + mtx->owner = LOCK_INVALID_OWNER_ID; + lock_internal_spin_unlock_with_notify(&mtx->core, save); + } else { + spin_unlock(mtx->core.spin_lock, save); + } + } +} \ No newline at end of file diff --git a/src/common/pico_sync/sem.c b/src/common/pico_sync/sem.c index 3ad9816..464e964 100644 --- a/src/common/pico_sync/sem.c +++ b/src/common/pico_sync/sem.c @@ -20,64 +20,57 @@ int __time_critical_func(sem_available)(semaphore_t *sem) { } void __time_critical_func(sem_acquire_blocking)(semaphore_t *sem) { - bool block = true; do { uint32_t save = spin_lock_blocking(sem->core.spin_lock); if (sem->permits > 0) { sem->permits--; - __sev(); - block = false; + lock_internal_spin_unlock_with_notify(&sem->core, save); + break; } - spin_unlock(sem->core.spin_lock, save); - if (block) { - __wfe(); - } - } while (block); + lock_internal_spin_unlock_with_wait(&sem->core, save); + } while (true); } bool __time_critical_func(sem_acquire_timeout_ms)(semaphore_t *sem, uint32_t timeout_ms) { - bool block = true; absolute_time_t target = nil_time; do { uint32_t save = spin_lock_blocking(sem->core.spin_lock); if (sem->permits > 0) { sem->permits--; - __sev(); - block = false; + lock_internal_spin_unlock_with_notify(&sem->core, save); + return true; } - spin_unlock(sem->core.spin_lock, save); - if (block) { - if (is_nil_time(target)) { - target = make_timeout_time_ms(timeout_ms); - } - if (best_effort_wfe_or_timeout(target)) { - return false; - } + if (is_nil_time(target)) { + target = make_timeout_time_ms(timeout_ms); } - } while (block); - return true; + if (lock_internal_spin_unlock_with_best_effort_wait_or_timeout(&sem->core, save, target)) { + return false; + } + } while (true); } // todo this should really have a blocking variant for when permits are maxed out bool __time_critical_func(sem_release)(semaphore_t *sem) { - bool rc; uint32_t save = spin_lock_blocking(sem->core.spin_lock); int32_t count = sem->permits; if (count < sem->max_permits) { sem->permits = (int16_t)(count + 1); - __sev(); - rc = true; + lock_internal_spin_unlock_with_notify(&sem->core, save); + return true; } else { - rc = false; + spin_unlock(sem->core.spin_lock, save); + return false; } - spin_unlock(sem->core.spin_lock, save); - return rc; } void 
__time_critical_func(sem_reset)(semaphore_t *sem, int16_t permits) { assert(permits >= 0 && permits <= sem->max_permits); uint32_t save = spin_lock_blocking(sem->core.spin_lock); - if (permits > sem->permits) __sev(); - sem->permits = permits; - spin_unlock(sem->core.spin_lock, save); + if (permits > sem->permits) { + sem->permits = permits; + lock_internal_spin_unlock_with_notify(&sem->core, save); + } else { + sem->permits = permits; + spin_unlock(sem->core.spin_lock, save); + } } diff --git a/src/common/pico_time/time.c b/src/common/pico_time/time.c index 2105214..ec925fa 100644 --- a/src/common/pico_time/time.c +++ b/src/common/pico_time/time.c @@ -10,7 +10,7 @@ #include "pico.h" #include "pico/time.h" #include "pico/util/pheap.h" -#include "hardware/sync.h" +#include "pico/sync.h" const absolute_time_t ABSOLUTE_TIME_INITIALIZED_VAR(nil_time, 0); const absolute_time_t ABSOLUTE_TIME_INITIALIZED_VAR(at_the_end_of_time, INT64_MAX); @@ -37,6 +37,7 @@ typedef struct alarm_pool { PHEAP_DEFINE_STATIC(default_alarm_pool_heap, PICO_TIME_DEFAULT_ALARM_POOL_MAX_TIMERS); static alarm_pool_entry_t default_alarm_pool_entries[PICO_TIME_DEFAULT_ALARM_POOL_MAX_TIMERS]; static uint8_t default_alarm_pool_entry_ids_high[PICO_TIME_DEFAULT_ALARM_POOL_MAX_TIMERS]; +static lock_core_t sleep_notifier; static alarm_pool_t default_alarm_pool = { .heap = &default_alarm_pool_heap, @@ -81,6 +82,7 @@ void alarm_pool_init_default() { alarm_pool_post_alloc_init(&default_alarm_pool, PICO_TIME_DEFAULT_ALARM_POOL_HARDWARE_ALARM_NUM); } + lock_init(&sleep_notifier, PICO_SPINLOCK_ID_TIMER); #endif } @@ -318,8 +320,9 @@ void alarm_pool_dump(alarm_pool_t *pool) { } #if !PICO_TIME_DEFAULT_ALARM_POOL_DISABLED -static int64_t sev_callback(__unused alarm_id_t id, __unused void *user_data) { - __sev(); +static int64_t sleep_until_callback(__unused alarm_id_t id, __unused void *user_data) { + uint32_t save = spin_lock_blocking(sleep_notifier.spin_lock); + lock_internal_spin_unlock_with_notify(&sleep_notifier, save); return 0; } #endif @@ -338,13 +341,17 @@ void sleep_until(absolute_time_t t) { absolute_time_t t_before; update_us_since_boot(&t_before, t_before_us); if (absolute_time_diff_us(get_absolute_time(), t_before) > 0) { - if (add_alarm_at(t_before, sev_callback, NULL, false) >= 0) { + if (add_alarm_at(t_before, sleep_until_callback, NULL, false) >= 0) { // able to add alarm for just before the time while (!time_reached(t_before)) { - __wfe(); + uint32_t save = spin_lock_blocking(sleep_notifier.spin_lock); + lock_internal_spin_unlock_with_wait(&sleep_notifier, save); } } } +#else + // hook in case we're in RTOS; note we assume using the alarm pool is better always if available. + sync_internal_yield_until_before(t); #endif // now wait until the exact time busy_wait_until(t); @@ -354,13 +361,17 @@ void sleep_us(uint64_t us) { #if !PICO_TIME_DEFAULT_ALARM_POOL_DISABLED sleep_until(make_timeout_time_us(us)); #else - if (us >> 32u) { - busy_wait_until(make_timeout_time_us(us)); + if (us < PICO_TIME_SLEEP_OVERHEAD_ADJUST_US) { + busy_wait_us(us); } else { - busy_wait_us_32(us); + // hook in case we're in RTOS; note we assume using the alarm pool is better always if available. 
+ absolute_time_t t = make_timeout_time_us(us - PICO_TIME_SLEEP_OVERHEAD_ADJUST_US); + sync_internal_yield_until_before(t); + + // then wait the rest of the way + busy_wait_until(t); } #endif - } void sleep_ms(uint32_t ms) { @@ -370,7 +381,7 @@ bool best_effort_wfe_or_timeout(absolute_time_t timeout_timestamp) { #if !PICO_TIME_DEFAULT_ALARM_POOL_DISABLED alarm_id_t id; - id = add_alarm_at(timeout_timestamp, sev_callback, NULL, false); + id = add_alarm_at(timeout_timestamp, sleep_until_callback, NULL, false); if (id <= 0) { tight_loop_contents(); return time_reached(timeout_timestamp); diff --git a/src/common/pico_util/include/pico/util/queue.h b/src/common/pico_util/include/pico/util/queue.h index 60a450a..e83ecd4 100644 --- a/src/common/pico_util/include/pico/util/queue.h +++ b/src/common/pico_util/include/pico/util/queue.h @@ -22,8 +22,10 @@ extern "C" { #endif +#include "pico/lock_core.h" + typedef struct { - spin_lock_t *lock; + lock_core_t core; uint8_t *data; uint16_t wptr; uint16_t rptr; @@ -85,9 +87,9 @@ static inline uint queue_get_level_unsafe(queue_t *q) { * \return Number of entries in the queue */ static inline uint queue_get_level(queue_t *q) { - uint32_t save = spin_lock_blocking(q->lock); + uint32_t save = spin_lock_blocking(q->core.spin_lock); uint level = queue_get_level_unsafe(q); - spin_unlock(q->lock, save); + spin_unlock(q->core.spin_lock, save); return level; } diff --git a/src/common/pico_util/queue.c b/src/common/pico_util/queue.c index 4b1c27e..bc667ac 100644 --- a/src/common/pico_util/queue.c +++ b/src/common/pico_util/queue.c @@ -9,7 +9,7 @@ #include "pico/util/queue.h" void queue_init_with_spinlock(queue_t *q, uint element_size, uint element_count, uint spinlock_num) { - q->lock = spin_lock_instance(spinlock_num); + lock_init(&q->core, spinlock_num); q->data = (uint8_t *)calloc(element_count + 1, element_size); q->element_count = (uint16_t)element_count; q->element_size = (uint16_t)element_size; @@ -33,66 +33,79 @@ static inline uint16_t inc_index(queue_t *q, uint16_t index) { return index; } +static bool queue_add_internal(queue_t *q, void *data, bool block) { + do { + uint32_t save = spin_lock_blocking(q->core.spin_lock); + if (queue_get_level_unsafe(q) != q->element_count) { + memcpy(element_ptr(q, q->wptr), data, q->element_size); + q->wptr = inc_index(q, q->wptr); + lock_internal_spin_unlock_with_notify(&q->core, save); + return true; + } + if (block) { + lock_internal_spin_unlock_with_wait(&q->core, save); + } else { + spin_unlock(q->core.spin_lock, save); + return false; + } + } while (true); +} + +static bool queue_remove_internal(queue_t *q, void *data, bool block) { + do { + uint32_t save = spin_lock_blocking(q->core.spin_lock); + if (queue_get_level_unsafe(q) != 0) { + memcpy(data, element_ptr(q, q->rptr), q->element_size); + q->rptr = inc_index(q, q->rptr); + lock_internal_spin_unlock_with_notify(&q->core, save); + return true; + } + if (block) { + lock_internal_spin_unlock_with_wait(&q->core, save); + } else { + spin_unlock(q->core.spin_lock, save); + return false; + } + } while (true); +} + +static bool queue_peek_internal(queue_t *q, void *data, bool block) { + do { + uint32_t save = spin_lock_blocking(q->core.spin_lock); + if (queue_get_level_unsafe(q) != 0) { + memcpy(data, element_ptr(q, q->rptr), q->element_size); + lock_internal_spin_unlock_with_notify(&q->core, save); + return true; + } + if (block) { + lock_internal_spin_unlock_with_wait(&q->core, save); + } else { + spin_unlock(q->core.spin_lock, save); + 
return false; + } + } while (true); +} + bool queue_try_add(queue_t *q, void *data) { - bool success = false; - uint32_t flags = spin_lock_blocking(q->lock); - if (queue_get_level_unsafe(q) != q->element_count) { - memcpy(element_ptr(q, q->wptr), data, q->element_size); - q->wptr = inc_index(q, q->wptr); - success = true; - } - spin_unlock(q->lock, flags); - if (success) __sev(); - return success; + return queue_add_internal(q, data, false); } bool queue_try_remove(queue_t *q, void *data) { - bool success = false; - uint32_t flags = spin_lock_blocking(q->lock); - if (queue_get_level_unsafe(q) != 0) { - memcpy(data, element_ptr(q, q->rptr), q->element_size); - q->rptr = inc_index(q, q->rptr); - success = true; - } - spin_unlock(q->lock, flags); - if (success) __sev(); - return success; + return queue_remove_internal(q, data, false); } bool queue_try_peek(queue_t *q, void *data) { - bool success = false; - uint32_t flags = spin_lock_blocking(q->lock); - if (queue_get_level_unsafe(q) != 0) { - memcpy(data, element_ptr(q, q->rptr), q->element_size); - success = true; - } - spin_unlock(q->lock, flags); - return success; + return queue_peek_internal(q, data, false); } void queue_add_blocking(queue_t *q, void *data) { - bool done; - do { - done = queue_try_add(q, data); - if (done) break; - __wfe(); - } while (true); + queue_add_internal(q, data, true); } void queue_remove_blocking(queue_t *q, void *data) { - bool done; - do { - done = queue_try_remove(q, data); - if (done) break; - __wfe(); - } while (true); + queue_remove_internal(q, data, true); } void queue_peek_blocking(queue_t *q, void *data) { - bool done; - do { - done = queue_try_peek(q, data); - if (done) break; - __wfe(); - } while (true); + queue_peek_internal(q, data, true); } diff --git a/src/rp2_common/hardware_sync/sync.c b/src/rp2_common/hardware_sync/sync.c index c032a27..986f311 100644 --- a/src/rp2_common/hardware_sync/sync.c +++ b/src/rp2_common/hardware_sync/sync.c @@ -49,6 +49,7 @@ void spin_lock_claim_mask(uint32_t mask) { void spin_lock_unclaim(uint lock_num) { check_lock_num(lock_num); + spin_unlock_unsafe(spin_lock_instance(lock_num)); hw_claim_clear((uint8_t *) &claimed, lock_num); } diff --git a/src/rp2_common/hardware_timer/include/hardware/timer.h b/src/rp2_common/hardware_timer/include/hardware/timer.h index 4665ae5..ff1f7c1 100644 --- a/src/rp2_common/hardware_timer/include/hardware/timer.h +++ b/src/rp2_common/hardware_timer/include/hardware/timer.h @@ -80,17 +80,24 @@ uint64_t time_us_64(void); /*! \brief Busy wait wasting cycles for the given (32 bit) number of microseconds * \ingroup hardware_timer * - * \param delay_us delay amount + * \param delay_us delay amount in microseconds */ void busy_wait_us_32(uint32_t delay_us); /*! \brief Busy wait wasting cycles for the given (64 bit) number of microseconds * \ingroup hardware_timer * - * \param delay_us delay amount + * \param delay_us delay amount in microseconds */ void busy_wait_us(uint64_t delay_us); +/*! \brief Busy wait wasting cycles for the given number of milliseconds + * \ingroup hardware_timer + * + * \param delay_ms delay amount in milliseconds + */ +void busy_wait_ms(uint32_t delay_ms); + /*! 
\brief Busy wait wasting cycles until after the specified timestamp * \ingroup hardware_timer * diff --git a/src/rp2_common/hardware_timer/timer.c b/src/rp2_common/hardware_timer/timer.c index 23bea05..79d040f 100644 --- a/src/rp2_common/hardware_timer/timer.c +++ b/src/rp2_common/hardware_timer/timer.c @@ -73,6 +73,15 @@ void busy_wait_us(uint64_t delay_us) { busy_wait_until(t); } +void busy_wait_ms(uint32_t delay_ms) +{ + if (delay_ms <= 0x7fffffffu / 1000) { + busy_wait_us_32(delay_ms * 1000); + } else { + busy_wait_us(delay_ms * 1000ull); + } +} + void busy_wait_until(absolute_time_t t) { uint64_t target = to_us_since_boot(t); uint32_t hi_target = (uint32_t)(target >> 32u); diff --git a/src/rp2_common/pico_runtime/runtime.c b/src/rp2_common/pico_runtime/runtime.c index 8313086..261cca6 100644 --- a/src/rp2_common/pico_runtime/runtime.c +++ b/src/rp2_common/pico_runtime/runtime.c @@ -118,7 +118,11 @@ void runtime_init(void) { // the first function pointer, not the address of it. for (mutex_t *m = &__mutex_array_start; m < &__mutex_array_end; m++) { - mutex_init(m); + if (m->recursion_state) { + recursive_mutex_init(m); + } else { + mutex_init(m); + } } #if !(PICO_NO_RAM_VECTOR_TABLE || PICO_NO_FLASH) diff --git a/src/rp2_common/pico_stdio/stdio.c b/src/rp2_common/pico_stdio/stdio.c index 5aaf1a8..2ccca71 100644 --- a/src/rp2_common/pico_stdio/stdio.c +++ b/src/rp2_common/pico_stdio/stdio.c @@ -34,13 +34,15 @@ static stdio_driver_t *filter; auto_init_mutex(print_mutex); bool stdout_serialize_begin(void) { - uint core_num = get_core_num(); + lock_owner_id_t caller = lock_get_caller_owner_id(); + // not using lock_owner_id_t to avoid backwards incompatibility change to mutex_try_enter API + static_assert(sizeof(lock_owner_id_t) <= 4, ""); uint32_t owner; if (!mutex_try_enter(&print_mutex, &owner)) { - if (owner == core_num) { + if (owner == (uint32_t)caller) { return false; } - // other core owns the mutex, so lets wait + // we are not a nested call, so lets wait mutex_enter_blocking(&print_mutex); } return true;
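
A short sketch of the reworked critical_section, which now wraps a spin lock directly and gains critical_section_deinit(); the shared_state_* wrappers and counter are illustrative only:

```c
#include "pico/critical_section.h"

static critical_section_t shared_state_cs;
static volatile uint32_t shared_counter;

void shared_state_init(void) {
    critical_section_init(&shared_state_cs);           // claims an unused spin lock
}

void shared_state_bump(void) {
    critical_section_enter_blocking(&shared_state_cs); // disables IRQs on this core and takes the spin lock
    shared_counter++;
    critical_section_exit(&shared_state_cs);
}

void shared_state_teardown(void) {
    // new in this patch: returns the claimed spin lock; only valid for sections
    // created with critical_section_init(), not critical_section_init_with_lock_num()
    critical_section_deinit(&shared_state_cs);
}
```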
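The central design point is that every blocking primitive (mutex, semaphore, queue, sleep) now funnels its block/wake path through the three lock_internal_* hooks. A minimal sketch of what an RTOS port might supply for them, assuming hypothetical rtos_lock_core_* functions (placeholders, not SDK or RTOS APIs); the definitions must be visible before pico/lock_core.h is included so the #ifndef defaults above are skipped:

```c
#include <stdint.h>
#include <stdbool.h>
#include "pico/types.h"        // absolute_time_t

struct lock_core;              // defined by pico/lock_core.h

// Atomically release the primitive's spin lock (restoring 'save') and block the
// calling task until a matching notify; spurious wakeups are fine because every
// caller re-checks the primitive's state in a loop.
void rtos_lock_core_wait(struct lock_core *lock, uint32_t save);

// As above, but give up at 'until'; returns true if the timeout was reached,
// matching the convention of best_effort_wfe_or_timeout().
bool rtos_lock_core_wait_until(struct lock_core *lock, uint32_t save, absolute_time_t until);

// Release the spin lock and wake any tasks blocked in the two calls above.
void rtos_lock_core_notify(struct lock_core *lock, uint32_t save);

#define lock_internal_spin_unlock_with_wait(lock, save) \
        rtos_lock_core_wait(lock, save)
#define lock_internal_spin_unlock_with_notify(lock, save) \
        rtos_lock_core_notify(lock, save)
#define lock_internal_spin_unlock_with_best_effort_wait_or_timeout(lock, save, until) \
        rtos_lock_core_wait_until(lock, save, until)
```

Because every caller sits in a lock/check/wait loop, the wait hook only has to guarantee it cannot miss a notify that completes after the wait starts; waking early or spuriously is harmless.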
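A usage sketch of the recursive mutex added here; log_write_raw() is an illustrative placeholder, not an SDK function. Only initialization differs from a regular mutex: the same mutex_enter_blocking/mutex_exit calls are used, and the lock is released (and waiters notified) only at the outermost exit:

```c
#include "pico/mutex.h"

extern void log_write_raw(const char *text);    // placeholder output routine

auto_init_recursive_mutex(log_mutex);           // initialised by runtime_init() via the .mutex_array section

static void log_line(const char *text) {
    mutex_enter_blocking(&log_mutex);           // nested acquisition by the same owner is allowed
    log_write_raw(text);
    mutex_exit(&log_mutex);
}

static void log_two_lines(const char *a, const char *b) {
    mutex_enter_blocking(&log_mutex);           // outer acquisition
    log_line(a);                                // re-enters the mutex without deadlocking
    log_line(b);
    mutex_exit(&log_mutex);                     // lock actually released here
}

int main(void) {
    log_two_lines("hello", "world");
}
```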
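A cross-core producer/consumer sketch of the queue API, whose blocking calls now block and wake through lock_core rather than polling with __wfe(); sample_adc() is an illustrative placeholder:

```c
#include "pico/multicore.h"
#include "pico/util/queue.h"

extern uint32_t sample_adc(void);                   // placeholder data source

static queue_t sample_queue;

static void core1_entry(void) {
    while (true) {
        uint32_t sample = sample_adc();
        queue_add_blocking(&sample_queue, &sample);     // waits while the queue is full
    }
}

int main(void) {
    queue_init(&sample_queue, sizeof(uint32_t), 8);
    multicore_launch_core1(core1_entry);
    while (true) {
        uint32_t sample;
        queue_remove_blocking(&sample_queue, &sample);  // waits while the queue is empty
        // ... consume 'sample' ...
    }
}
```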