SMP: Add logic to avoid a deadlock condition when CPU1 is hung waiting for g_cpu_irqlock and CPU0 is waiting for g_cpu_paused

This commit is contained in:
Gregory Nutt 2016-11-22 11:34:16 -06:00
parent b39556f625
commit bac7153609
10 changed files with 348 additions and 115 deletions

39
TODO
View File

@ -10,7 +10,7 @@ issues related to each board port.
nuttx/:
(13) Task/Scheduler (sched/)
(2) SMP
(1) SMP
(1) Memory Management (mm/)
(1) Power Management (drivers/pm)
(3) Signals (sched/signal, arch/)
@ -336,43 +336,6 @@ o SMP
Priority: High. spinlocks, and hence SMP, will not work on such systems
without this change.
Title: DEADLOCK SCENARIO WITH up_cpu_pause().
Description: I think there is a possibility for a hang in up_cpu_pause().
Suppose this situation:
- CPU1 is in a critical section and has the g_cpu_irqlock
spinlock.
- CPU0 takes an interrupt and attempts to enter the critical
section. It spins waiting on g_cpu_irqlock with interrupt
disabled.
- CPU1 calls up_cpu_pause() to pause operation on CPU1. This
will issue an inter-CPU interrupt to CPU0
- But interrupts are disabled. What will happen? I think
that this is a deadlock: Interrupts will stay disabled on
CPU0 because it is spinning in the interrupt handler;
up_cpu_pause() will hang because the inter-CPU interrupt
is pending.
Are inter-CPU interrupts maskable in the same way as other
interrupts? If they are not maskable, then we must also handle
them as nested interrupts in some fashion.
A work-around might be to check the state of other-CPU
interrupt handler inside the spin loop of up_cpu_pause().
Having the other CPU spinning and waiting for up_cpu_pause()
provided that (1) the pending interrupt can be cleared, and
(2) leave_critical_section() is not called prior to the point
where up_cpu_resume() is called, and (3) up_cpu_resume() is
smart enough to know that it should not attempt to resume a
non-paused CPU.
This would require some kind of information about each
interrupt handler: In an interrupt, waiting for spinlock,
have spinlock, etc.
Status: Open
Priority: Medium-High. I don't know for certain that this is a problem but it seems like it could
o Memory Management (mm/)
^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -55,6 +55,20 @@
* Private Data
****************************************************************************/
/* These spinlocks are used in the SMP configuration in order to implement
* up_cpu_pause(). The protocol for CPUn to pause CPUm is as follows
*
* 1. The up_cpu_pause() implementation on CPUn locks both g_cpu_wait[m]
* and g_cpu_paused[m]. CPUn then waits spinning on g_cpu_wait[m].
* 2. When CPUm receives the interrupt it (1) unlocks g_cpu_paused[m]
* (2) locks g_cpu_wait[m]. The first unblocks CPUn and the second
* blocks CPUm in the interrupt handler.
*
* When CPUm resumes, CPUn unlocks g_cpu_wait[m] and the interrupt handler
* on CPUm continues. CPUm must, of course, also then unlock g_cpu_wait[m]
* so that it will be ready for the next pause operation.
*/
static volatile spinlock_t g_cpu_wait[CONFIG_SMP_NCPUS];
static volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
@ -62,6 +76,92 @@ static volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
* Public Functions
****************************************************************************/
/****************************************************************************
* Name: up_cpu_pausereq
*
* Description:
* Return true if a pause request is pending for this CPU.
*
* Input Parameters:
* cpu - The index of the CPU to be queried
*
* Returned Value:
* true = a pause request is pending.
* false = no pause request is pending.
*
****************************************************************************/
bool up_cpu_pausereq(int cpu)
{
  /* The CPU that initiated the pause holds g_cpu_paused[cpu] locked until
   * the pause is acknowledged, so a locked spinlock means that a pause
   * request is pending for this CPU.
   */

  bool pending = spin_islocked(&g_cpu_paused[cpu]);
  return pending;
}
/****************************************************************************
* Name: up_cpu_paused
*
* Description:
* Handle a pause request from another CPU. Normally, this logic is
* executed from interrupt handling logic within the architecture-specific
* implementation.  However, it is sometimes necessary to perform the pending
* pause operation in other contexts where the interrupt cannot be taken
* in order to avoid deadlocks.
*
* This function performs the following operations:
*
* 1. It saves the current task state at the head of the current assigned
* task list.
* 2. It waits on a spinlock, then
* 3. Returns from interrupt, restoring the state of the new task at the
* head of the ready to run list.
*
* Input Parameters:
* cpu - The index of the CPU to be paused
*
* Returned Value:
* On success, OK is returned. Otherwise, a negated errno value indicating
* the nature of the failure is returned.
*
****************************************************************************/
int up_cpu_paused(int cpu)
{
FAR struct tcb_s *tcb = this_task();
/* Update scheduler parameters */
sched_suspend_scheduler(tcb);
/* Save the current context at CURRENT_REGS into the TCB at the head
* of the assigned task list for this CPU.
*/
up_savestate(tcb->xcp.regs);
/* Wait for the spinlock to be released.  Unlocking g_cpu_paused[cpu]
* informs the CPU that requested the pause (spinning in up_cpu_pause())
* that this CPU has now paused.  g_cpu_wait[cpu] was locked by that CPU
* and is not released until up_cpu_resume() is called, so the spin_lock()
* below blocks this CPU here for the full duration of the pause.  The
* order matters:  g_cpu_paused[cpu] must be unlocked first or the
* requesting CPU would spin forever.
*/
spin_unlock(&g_cpu_paused[cpu]);
spin_lock(&g_cpu_wait[cpu]);
/* Restore the exception context of the tcb at the (new) head of the
* assigned task list.  While this CPU was paused, logic on another CPU
* may have changed which task is at the head of this CPU's list, so the
* head task must be re-fetched here.
*/
tcb = this_task();
/* Reset scheduler parameters */
sched_resume_scheduler(tcb);
/* Then switch contexts. Any necessary address environment changes
* will be made when the interrupt returns.
*/
up_restorestate(tcb->xcp.regs);
/* Release g_cpu_wait[cpu] so that it is ready for the next pause
* operation (see the pause protocol description in the Private Data
* section at the top of this file).
*/
spin_unlock(&g_cpu_wait[cpu]);
return OK;
}
/****************************************************************************
* Name: arm_pause_handler
*
@ -84,40 +184,18 @@ static volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
int arm_pause_handler(int irq, FAR void *context)
{
FAR struct tcb_s *tcb = this_task();
int cpu = up_cpu_index();
int cpu = this_cpu();
/* Update scheduler parameters */
sched_suspend_scheduler(tcb);
/* Save the current context at CURRENT_REGS into the TCB at the head of the
* assigned task list for this CPU.
/* Check for false alarms.  Such a false alarm could occur as a consequence of
* some deadlock breaking logic that might have already serviced the SG2
* interrupt by calling up_cpu_paused.
*/
up_savestate(tcb->xcp.regs);
if (spin_islocked(&g_cpu_paused[cpu]))
{
return up_cpu_paused(cpu);
}
/* Wait for the spinlock to be released */
spin_unlock(&g_cpu_paused[cpu]);
spin_lock(&g_cpu_wait[cpu]);
/* Restore the exception context of the tcb at the (new) head of the
* assigned task list.
*/
tcb = this_task();
/* Reset scheduler parameters */
sched_resume_scheduler(tcb);
/* Then switch contexts. Any necessary address environment changes will
* be made when the interrupt returns.
*/
up_restorestate(tcb->xcp.regs);
spin_unlock(&g_cpu_wait[cpu]);
return OK;
}
@ -134,7 +212,7 @@ int arm_pause_handler(int irq, FAR void *context)
* CPU.
*
* Input Parameters:
* cpu - The index of the CPU to be stopped/
* cpu - The index of the CPU to be stopped
*
* Returned Value:
* Zero on success; a negated errno value on failure.

View File

@ -199,6 +199,25 @@ extern volatile int g_eventloop;
extern volatile int g_uart_data_available;
#endif
#ifdef CONFIG_SMP
/* These spinlocks are used in the SMP configuration in order to implement
* up_cpu_pause(). The protocol for CPUn to pause CPUm is as follows
*
* 1. The up_cpu_pause() implementation on CPUn locks both g_cpu_wait[m]
* and g_cpu_paused[m]. CPUn then waits spinning on g_cpu_wait[m].
* 2. When CPUm receives the interrupt it (1) unlocks g_cpu_paused[m]
* (2) locks g_cpu_wait[m]. The first unblocks CPUn and the second
* blocks CPUm in the interrupt handler.
*
* When CPUm resumes, CPUn unlocks g_cpu_wait[m] and the interrupt handler
* on CPUm continues. CPUm must, of course, also then unlock g_cpu_wait[m]
* so that it will be ready for the next pause operation.
*/
volatile spinlock_t g_cpu_wait[CONFIG_SMP_NCPUS];
volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
#endif
/****************************************************************************
* Public Function Prototypes
****************************************************************************/

View File

@ -78,16 +78,30 @@ struct sim_cpuinfo_s
static pthread_key_t g_cpukey;
static pthread_t g_sim_cputhread[CONFIG_SMP_NCPUS];
static volatile unsigned char g_sim_cpupaused[CONFIG_SMP_NCPUS];
static volatile spinlock_t g_sim_cpuwait[CONFIG_SMP_NCPUS];
/* These spinlocks are used in the SMP configuration in order to implement
* up_cpu_pause(). The protocol for CPUn to pause CPUm is as follows
*
* 1. The up_cpu_pause() implementation on CPUn locks both g_cpu_wait[m]
* and g_cpu_paused[m]. CPUn then waits spinning on g_cpu_wait[m].
* 2. When CPUm receives the interrupt it (1) unlocks g_cpu_paused[m]
* (2) locks g_cpu_wait[m]. The first unblocks CPUn and the second
* blocks CPUm in the interrupt handler.
*
* When CPUm resumes, CPUn unlocks g_cpu_wait[m] and the interrupt handler
* on CPUm continues. CPUm must, of course, also then unlock g_cpu_wait[m]
* so that it will be ready for the next pause operation.
*/
volatile spinlock_t g_cpu_wait[CONFIG_SMP_NCPUS];
volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
/****************************************************************************
* NuttX domain function prototypes
****************************************************************************/
void os_start(void) __attribute__ ((noreturn));
void sim_cpu_pause(int cpu, volatile spinlock_t *wait,
volatile unsigned char *paused);
void up_cpu_paused(int cpu);
void sim_smp_hook(void);
/****************************************************************************
@ -222,9 +236,7 @@ static void sim_handle_signal(int signo, siginfo_t *info, void *context)
{
int cpu = (int)((uintptr_t)pthread_getspecific(g_cpukey));
/* We need to perform the actual tasking operations in the NuttX domain */
sim_cpu_pause(cpu, &g_sim_cpuwait[cpu], &g_sim_cpupaused[cpu]);
(void)up_cpu_paused(cpu);
}
/****************************************************************************
@ -446,7 +458,8 @@ int up_cpu_pause(int cpu)
{
/* Take the spinlock that will prevent the CPU thread from running */
g_sim_cpuwait[cpu] = SP_LOCKED;
g_cpu_wait[cpu] = SP_LOCKED;
g_cpu_paused[cpu] = SP_LOCKED;
/* Signal the CPU thread */
@ -454,7 +467,7 @@ int up_cpu_pause(int cpu)
/* Spin, waiting for the thread to pause */
while (!g_sim_cpupaused[cpu])
while (g_cpu_paused[cpu] != 0)
{
pthread_yield();
}
@ -485,6 +498,6 @@ int up_cpu_resume(int cpu)
{
/* Release the spinlock that will allow the CPU thread to continue */
g_sim_cpuwait[cpu] = SP_UNLOCKED;
g_cpu_wait[cpu] = SP_UNLOCKED;
return 0;
}

View File

@ -52,26 +52,53 @@
****************************************************************************/
/****************************************************************************
* Name: sim_cpu_pause
* Name: up_cpu_pausereq
*
* Description:
* This is the SIGUSR1 signal handling logic. It implements the core
* logic of up_cpu_pause() on the thread of execution the simulated CPU.
* This is the part of the implementation that must be performed in the
* NuttX vs. the host domain.
* Return true if a pause request is pending for this CPU.
*
* Input Parameters:
* cpu - The CPU being paused.
* wait - Spinlock to wait on to be un-paused
* paused - A boolean to set when we are in the paused state.
* cpu - The index of the CPU to be queried
*
* Returned Value:
* None
* true = a pause request is pending.
* false = no pause request is pending.
*
****************************************************************************/
void sim_cpu_pause(int cpu, volatile spinlock_t *wait,
volatile unsigned char *paused)
bool up_cpu_pausereq(int cpu)
{
return spin_islocked(&g_cpu_paused[cpu]);
}
/****************************************************************************
* Name: up_cpu_paused
*
* Description:
* Handle a pause request from another CPU. Normally, this logic is
* executed from interrupt handling logic within the architecture-specific
* implementation.  However, it is sometimes necessary to perform the pending
* pause operation in other contexts where the interrupt cannot be taken
* in order to avoid deadlocks.
*
* This function performs the following operations:
*
* 1. It saves the current task state at the head of the current assigned
* task list.
* 2. It waits on a spinlock, then
* 3. Returns from interrupt, restoring the state of the new task at the
* head of the ready to run list.
*
* Input Parameters:
* cpu - The index of the CPU to be paused
*
* Returned Value:
* On success, OK is returned. Otherwise, a negated errno value indicating
* the nature of the failure is returned.
*
****************************************************************************/
int up_cpu_paused(int cpu)
{
struct tcb_s *rtcb = current_task(cpu);
@ -86,16 +113,18 @@ void sim_cpu_pause(int cpu, volatile spinlock_t *wait,
if (up_setjmp(rtcb->xcp.regs) == 0)
{
/* Indicate that we are in the paused state */
/* Unlock the g_cpu_paused spinlock to indicate that we are in the
* paused state
*/
*paused = 1;
spin_unlock(&g_cpu_paused[cpu]);
/* Spin until we are asked to resume. When we resume, we need to
* indicate that we are no longer paused.
*/
spin_lock(wait);
*paused = 0;
spin_lock(&g_cpu_wait[cpu]);
spin_unlock(&g_cpu_wait[cpu]);
/* While we were paused, logic on a different CPU probably changed
* the task at the head of the assigned task list. So now we need
@ -125,7 +154,8 @@ void sim_cpu_pause(int cpu, volatile spinlock_t *wait,
up_longjmp(rtcb->xcp.regs, 1);
}
return OK;
}
#endif /* CONFIG_SMP */

View File

@ -1827,6 +1827,56 @@ int up_cpu_start(int cpu);
int up_cpu_pause(int cpu);
#endif
/****************************************************************************
* Name: up_cpu_pausereq
*
* Description:
* Return true if a pause request is pending for this CPU.
*
* Input Parameters:
* cpu - The index of the CPU to be queried
*
* Returned Value:
* true = a pause request is pending.
* false = no pause request is pending.
*
****************************************************************************/
#ifdef CONFIG_SMP
bool up_cpu_pausereq(int cpu);
#endif
/****************************************************************************
* Name: up_cpu_paused
*
* Description:
* Handle a pause request from another CPU. Normally, this logic is
* executed from interrupt handling logic within the architecture-specific
* implementation.  However, it is sometimes necessary to perform the pending
* pause operation in other contexts where the interrupt cannot be taken
* in order to avoid deadlocks.
*
* This function performs the following operations:
*
* 1. It saves the current task state at the head of the current assigned
* task list.
* 2. It waits on a spinlock, then
* 3. Returns from interrupt, restoring the state of the new task at the
* head of the ready to run list.
*
* Input Parameters:
* cpu - The index of the CPU to be paused
*
* Returned Value:
* On success, OK is returned. Otherwise, a negated errno value indicating
* the nature of the failure is returned.
*
****************************************************************************/
#ifdef CONFIG_SMP
int up_cpu_paused(int cpu);
#endif
/****************************************************************************
* Name: up_cpu_resume
*

View File

@ -59,6 +59,26 @@
#include <arch/spinlock.h>
/****************************************************************************
* Pre-processor Definitions
****************************************************************************/
/* Memory barriers may be provided in arch/spinlock.h
*
* DMB - Data memory barrier. Assures writes are completed to memory.
* DSB - Data synchronization barrier.
*/
#define HAVE_DMB 1
#ifndef SP_DMB
# define SP_DMB()
# undef HAVE_DMB
#endif
#ifndef SP_DSB
# define SP_DSB()
#endif
/****************************************************************************
* Public Types
****************************************************************************/
@ -203,7 +223,7 @@ void spin_lockr(FAR struct spinlock_s *lock);
*
****************************************************************************/
#ifdef SP_DMB
#ifdef HAVE_DMB
void spin_unlock(FAR volatile spinlock_t *lock);
#else
# define spin_unlock(l) do { *(l) = SP_UNLOCKED; } while (0)

View File

@ -78,6 +78,81 @@ volatile cpu_set_t g_cpu_irqset;
static uint8_t g_cpu_nestcount[CONFIG_SMP_NCPUS];
#endif
/****************************************************************************
* Private Functions
****************************************************************************/
/****************************************************************************
* Name: irq_waitlock
*
* Description:
* Spin to get g_irq_waitlock, handling a known deadlock condition:
*
* Suppose this situation:
*
* - CPUn is in a critical section and has the g_cpu_irqlock spinlock.
* - CPUm takes an interrupt and attempts to enter the critical section.
* - It spins waiting on g_cpu_irqlock with interrupts disabled.
* - CPUn calls up_cpu_pause() to pause operation on CPUm. This will
* issue an inter-CPU interrupt to CPUm
* - But interrupts are disabled on CPUm so the up_cpu_pause() is never
* handled, causing the deadlock.
*
* This function detects this deadlock condition while spinning in an
* interrupt and calls up_cpu_pause() handler, breaking the deadlock.
*
****************************************************************************/
#ifdef CONFIG_SMP
static void irq_waitlock(void)
{
int cpu = this_cpu();
/* Duplicate the spin_lock() logic from spinlock.c, but adding the check
* for the deadlock condition.  NOTE(review): like spin_lock(), this is
* presumably called with interrupts disabled on this CPU -- confirm
* against enter_critical_section().
*/
while (up_testset(&g_cpu_irqlock) == SP_LOCKED)
{
/* The deadlock condition would occur if CPUn:
*
* 1. Holds the g_cpu_irqlock, and
* 2. Is trying to interrupt CPUm
*
* The protocol for CPUn to pause CPUm is as follows
*
* 1. The up_cpu_pause() implementation on CPUn locks both
* g_cpu_wait[m] and g_cpu_paused[m]. CPUn then waits
* spinning on g_cpu_wait[m].
* 2. When CPUm receives the interrupt it (1) unlocks
* g_cpu_paused[m] and (2) locks g_cpu_wait[m]. The
* first unblocks CPUn and the second blocks CPUm in the
* interrupt handler.
*
* The problem in the deadlock case is that CPUm cannot receive
* the interrupt because it is locked up spinning. Here we break
* the deadlock by handling the pause interrupt request inline
* while waiting for g_cpu_irqlock.
*/
if (up_cpu_pausereq(cpu))
{
/* Yes.. some CPU is requesting to pause us! Handle the
* pause interrupt now, exactly as the inter-CPU interrupt
* handler would have.
*/
DEBUGVERIFY(up_cpu_paused(cpu));
}
SP_DSB();
}
/* We have g_cpu_irqlock!  Issue a data memory barrier so that no memory
* accesses are re-ordered ahead of the lock acquisition (matches the
* behavior of spin_lock()).
*/
SP_DMB();
}
#endif
/****************************************************************************
* Public Functions
****************************************************************************/
@ -190,7 +265,7 @@ irqstate_t enter_critical_section(void)
* no longer in the critical section).
*/
spin_lock(&g_cpu_irqlock);
irq_waitlock();
}
/* In any event, the nesting count is now one */

View File

@ -65,22 +65,6 @@
#undef CONFIG_SPINLOCK_LOCKDOWN /* Feature not yet available */
/* Memory barriers may be provided in arch/spinlock.h
*
* DMB - Data memory barrier. Assures writes are completed to memory.
* DSB - Data synchronization barrier.
*/
#define HAVE_DMB 1
#ifndef SP_DMB
# define SP_DMB()
# undef HAVE_DMB
#endif
#ifndef SP_DSB
# define SP_DSB()
#endif
/****************************************************************************
* Public Functions
****************************************************************************/

View File

@ -97,7 +97,7 @@ int task_restart(pid_t pid)
/* Not implemented */
errcode = ENOSYS;
goto errout_with_lock;
goto errout;
}
/* We are restarting some other task than ourselves. Make sure that the
@ -222,7 +222,8 @@ int task_restart(pid_t pid)
return OK;
errout_with_lock:
set_errno(errcode);
leave_critical_section(flags);
errout:
set_errno(errcode);
return ERROR;
}