From d295f11a3a2ce9e1299ff454c1f630598a9f473c Mon Sep 17 00:00:00 2001
From: Masayuki Ishikawa
Date: Wed, 31 Jan 2018 11:23:22 +0900
Subject: [PATCH] SMP: Introduce a new global IRQ clearing logic and tasklist
 protection.

The previous implementation cleared the global IRQ control in
sched_addreadytorun() and sched_removereadytorun(), which was too early.
As a result, nxsem_post() could enter the critical section even though
nxsem_wait() had not yet reached the blocked state.

This patch moves the clearing of the global IRQ control from
sched_addreadytorun() and sched_removereadytorun() into
sched_resume_scheduler() to ensure that nxsem_post() enters the critical
section at the correct point.  As a consequence, sched_resumescheduler.c
is now always required in SMP configurations.

In addition, task_exit() had to be modified to call
sched_resume_scheduler(), because it calls sched_removereadytorun()
internally; without this call a deadlock would occur.

However, I encountered another DEBUGASSERT() in sched_cpu_select()
during an HTTP streaming aging test on lc823450-xgevk.
sched_cpu_select() accesses g_assignedtasks, which may be modified by
another CPU.  Similarly, the other tasklists may be modified
concurrently if both CPUs are executing scheduling logic.  To avoid
this, I introduced tasklist protection APIs.

With these changes, SMP kernel stability is much improved.

Signed-off-by: Masayuki Ishikawa
---
 include/nuttx/sched.h                |   2 +-
 sched/sched/Make.defs                |   6 ++
 sched/sched/sched.h                  |   8 ++
 sched/sched/sched_addblocked.c       |  12 +++
 sched/sched/sched_addreadytorun.c    |  14 ++-
 sched/sched/sched_mergepending.c     |  15 ++-
 sched/sched/sched_mergeprioritized.c |  19 +++-
 sched/sched/sched_removereadytorun.c |  16 +++-
 sched/sched/sched_resumescheduler.c  |  42 ++++++++-
 sched/sched/sched_tasklistlock.c     | 133 +++++++++++++++++++++++++++
 sched/task/task_exit.c               |   8 ++
 11 files changed, 259 insertions(+), 16 deletions(-)
 create mode 100644 sched/sched/sched_tasklistlock.c

diff --git a/include/nuttx/sched.h b/include/nuttx/sched.h
index 10cbcbefc88..3810962c8b3 100644
--- a/include/nuttx/sched.h
+++ b/include/nuttx/sched.h
@@ -883,7 +883,7 @@ void task_vforkabort(FAR struct task_tcb_s *child, int errcode);
 ********************************************************************************/

 #if CONFIG_RR_INTERVAL > 0 || defined(CONFIG_SCHED_SPORADIC) || \
-    defined(CONFIG_SCHED_INSTRUMENTATION)
+    defined(CONFIG_SCHED_INSTRUMENTATION) || defined(CONFIG_SMP)
 void sched_resume_scheduler(FAR struct tcb_s *tcb);
 #else
 #  define sched_resume_scheduler(tcb)
diff --git a/sched/sched/Make.defs b/sched/sched/Make.defs
index 0626d9e39b0..c9088453c07 100644
--- a/sched/sched/Make.defs
+++ b/sched/sched/Make.defs
@@ -77,6 +77,8 @@ else ifeq ($(CONFIG_SCHED_SPORADIC),y)
 CSRCS += sched_resumescheduler.c
 else ifeq ($(CONFIG_SCHED_INSTRUMENTATION),y)
 CSRCS += sched_resumescheduler.c
+else ifeq ($(CONFIG_SMP),y)
+CSRCS += sched_resumescheduler.c
 endif

 ifeq ($(CONFIG_SCHED_CPULOAD),y)
@@ -96,6 +98,10 @@ ifeq ($(CONFIG_SCHED_INSTRUMENTATION_BUFFER),y)
 CSRCS += sched_note.c
 endif

+ifeq ($(CONFIG_SMP),y)
+CSRCS += sched_tasklistlock.c
+endif
+
 # Include sched build support

 DEPPATH += --dep-path sched
diff --git a/sched/sched/sched.h b/sched/sched/sched.h
index e05b88692c6..63c081de856 100644
--- a/sched/sched/sched.h
+++ b/sched/sched/sched.h
@@ -364,6 +364,10 @@ extern volatile spinlock_t g_cpu_schedlock SP_SECTION;
 extern volatile spinlock_t g_cpu_locksetlock SP_SECTION;
 extern volatile cpu_set_t g_cpu_lockset SP_SECTION;

+/* Used to
lock tasklist to prevent from concurrent access */ + +extern volatile spinlock_t g_cpu_tasklistlock SP_SECTION; + #endif /* CONFIG_SMP */ /**************************************************************************** @@ -425,6 +429,10 @@ void sched_sporadic_lowpriority(FAR struct tcb_s *tcb); #ifdef CONFIG_SMP int sched_cpu_select(cpu_set_t affinity); int sched_cpu_pause(FAR struct tcb_s *tcb); + +irqstate_t sched_tasklist_lock(void); +void sched_tasklist_unlock(irqstate_t lock); + # define sched_islocked(tcb) spin_islocked(&g_cpu_schedlock) #else # define sched_cpu_select(a) (0) diff --git a/sched/sched/sched_addblocked.c b/sched/sched/sched_addblocked.c index 34303681c5c..151faedb692 100644 --- a/sched/sched/sched_addblocked.c +++ b/sched/sched/sched_addblocked.c @@ -77,6 +77,12 @@ void sched_addblocked(FAR struct tcb_s *btcb, tstate_t task_state) DEBUGASSERT(task_state >= FIRST_BLOCKED_STATE && task_state <= LAST_BLOCKED_STATE); +#ifdef CONFIG_SMP + /* Lock the tasklists before accessing */ + + irqstate_t lock = sched_tasklist_lock(); +#endif + /* Add the TCB to the blocked task list associated with this state. */ tasklist = TLIST_BLOCKED(task_state); @@ -96,6 +102,12 @@ void sched_addblocked(FAR struct tcb_s *btcb, tstate_t task_state) dq_addlast((FAR dq_entry_t *)btcb, tasklist); } +#ifdef CONFIG_SMP + /* Unlock the tasklists */ + + sched_tasklist_unlock(lock); +#endif + /* Make sure the TCB's state corresponds to the list */ btcb->task_state = task_state; diff --git a/sched/sched/sched_addreadytorun.c b/sched/sched/sched_addreadytorun.c index dcd07ac9a6b..880f4b85bb0 100644 --- a/sched/sched/sched_addreadytorun.c +++ b/sched/sched/sched_addreadytorun.c @@ -173,6 +173,10 @@ bool sched_addreadytorun(FAR struct tcb_s *btcb) int cpu; int me; + /* Lock the tasklists before accessing */ + + irqstate_t lock = sched_tasklist_lock(); + /* Check if the blocked TCB is locked to this CPU */ if ((btcb->flags & TCB_FLAG_CPU_LOCKED) != 0) @@ -339,10 +343,9 @@ bool sched_addreadytorun(FAR struct tcb_s *btcb) else if (g_cpu_nestcount[me] <= 0) { - /* Release our hold on the IRQ lock. */ - - spin_clrbit(&g_cpu_irqset, cpu, &g_cpu_irqsetlock, - &g_cpu_irqlock); + /* Do nothing here + * NOTE: spin_clrbit() will be done in sched_resumescheduler() + */ } /* Sanity check. g_cpu_netcount should be greater than zero @@ -426,6 +429,9 @@ bool sched_addreadytorun(FAR struct tcb_s *btcb) } } + /* Unlock the tasklists */ + + sched_tasklist_unlock(lock); return doswitch; } diff --git a/sched/sched/sched_mergepending.c b/sched/sched/sched_mergepending.c index 39968e6e116..6e258c7554a 100644 --- a/sched/sched/sched_mergepending.c +++ b/sched/sched/sched_mergepending.c @@ -199,6 +199,10 @@ bool sched_mergepending(void) int cpu; int me; + /* Lock the tasklist before accessing */ + + irqstate_t lock = sched_tasklist_lock(); + /* Remove and process every TCB in the g_pendingtasks list. * * Do nothing if (1) pre-emption is still disabled (by any CPU), or (2) if @@ -215,7 +219,7 @@ bool sched_mergepending(void) { /* The pending task list is empty */ - return ret; + goto errout_with_lock; } cpu = sched_cpu_select(ALL_CPUS /* ptcb->affinity */); @@ -259,7 +263,7 @@ bool sched_mergepending(void) * pending task list. 
*/ - return ret; + goto errout_with_lock; } /* Set up for the next time through the loop */ @@ -269,7 +273,7 @@ bool sched_mergepending(void) { /* The pending task list is empty */ - return ret; + goto errout_with_lock; } cpu = sched_cpu_select(ALL_CPUS /* ptcb->affinity */); @@ -285,6 +289,11 @@ bool sched_mergepending(void) TSTATE_TASK_READYTORUN); } +errout_with_lock: + + /* Unlock the tasklist */ + + sched_tasklist_unlock(lock); return ret; } #endif /* CONFIG_SMP */ diff --git a/sched/sched/sched_mergeprioritized.c b/sched/sched/sched_mergeprioritized.c index 90ef9dfbad9..7ff70b14df8 100644 --- a/sched/sched/sched_mergeprioritized.c +++ b/sched/sched/sched_mergeprioritized.c @@ -83,6 +83,12 @@ void sched_mergeprioritized(FAR dq_queue_t *list1, FAR dq_queue_t *list2, FAR struct tcb_s *tcb2; FAR struct tcb_s *tmp; +#ifdef CONFIG_SMP + /* Lock the tasklists before accessing */ + + irqstate_t lock = sched_tasklist_lock(); +#endif + DEBUGASSERT(list1 != NULL && list2 != NULL); /* Get a private copy of list1, clearing list1. We do this early so that @@ -99,7 +105,7 @@ void sched_mergeprioritized(FAR dq_queue_t *list1, FAR dq_queue_t *list2, { /* Special case.. list1 is empty. There is nothing to be done. */ - return; + goto ret_with_lock; } /* Now the TCBs are no longer accessible and we can change the state on @@ -122,7 +128,7 @@ void sched_mergeprioritized(FAR dq_queue_t *list1, FAR dq_queue_t *list2, /* Special case.. list2 is empty. Move list1 to list2. */ dq_move(&clone, list2); - return; + goto ret_with_lock; } /* Now loop until all entries from list1 have been merged into list2. tcb1 @@ -171,4 +177,13 @@ void sched_mergeprioritized(FAR dq_queue_t *list1, FAR dq_queue_t *list2, } } while (tcb1 != NULL); + +ret_with_lock: + +#ifdef CONFIG_SMP + /* Unlock the tasklists */ + + sched_tasklist_unlock(lock); +#endif + return; } diff --git a/sched/sched/sched_removereadytorun.c b/sched/sched/sched_removereadytorun.c index 231310f62db..f5e93b881c8 100644 --- a/sched/sched/sched_removereadytorun.c +++ b/sched/sched/sched_removereadytorun.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "irq/irq.h" #include "sched/sched.h" @@ -138,6 +139,10 @@ bool sched_removereadytorun(FAR struct tcb_s *rtcb) bool doswitch = false; int cpu; + /* Lock the tasklists before accessing */ + + irqstate_t lock = sched_tasklist_lock(); + /* Which CPU (if any) is the task running on? Which task list holds the * TCB? */ @@ -283,10 +288,9 @@ bool sched_removereadytorun(FAR struct tcb_s *rtcb) else if (g_cpu_nestcount[me] <= 0) { - /* Release our hold on the IRQ lock. */ - - spin_clrbit(&g_cpu_irqset, cpu, &g_cpu_irqsetlock, - &g_cpu_irqlock); + /* Do nothing here + * NOTE: spin_clrbit() will be done in sched_resumescheduler() + */ } /* Sanity check. 
g_cpu_netcount should be greater than zero
@@ -330,6 +334,10 @@ bool sched_removereadytorun(FAR struct tcb_s *rtcb)
   /* Since the TCB is no longer in any list, it is now invalid */

   rtcb->task_state = TSTATE_TASK_INVALID;
+
+  /* Unlock the tasklists */
+
+  sched_tasklist_unlock(lock);
   return doswitch;
 }
 #endif /* CONFIG_SMP */
diff --git a/sched/sched/sched_resumescheduler.c b/sched/sched/sched_resumescheduler.c
index 96097d4fcd6..08f104a5d83 100644
--- a/sched/sched/sched_resumescheduler.c
+++ b/sched/sched/sched_resumescheduler.c
@@ -45,10 +45,11 @@
 #include
 #include

+#include "irq/irq.h"
 #include "sched/sched.h"

 #if CONFIG_RR_INTERVAL > 0 || defined(CONFIG_SCHED_SPORADIC) || \
-    defined(CONFIG_SCHED_INSTRUMENTATION)
+    defined(CONFIG_SCHED_INSTRUMENTATION) || defined(CONFIG_SMP)

 /****************************************************************************
  * Public Functions
 ****************************************************************************/
@@ -101,7 +102,44 @@ void sched_resume_scheduler(FAR struct tcb_s *tcb)
   sched_note_resume(tcb);
 #endif

+#ifdef CONFIG_SMP
+  /* NOTE: The following logic for adjusting the global IRQ controls was
+   * derived from sched_addreadytorun() and sched_removereadytorun().
+   * Here we handle only the clearing logic in order to defer releasing
+   * the IRQ lock until after the context switch.
+   */
+
+  int me = this_cpu();
+
+  /* Adjust global IRQ controls.  If irqcount is greater than zero,
+   * then this task/this CPU holds the IRQ lock
+   */
+
+  if (tcb->irqcount > 0)
+    {
+      /* Do nothing here
+       * NOTE: spin_setbit() is done in sched_addreadytorun()
+       * and sched_removereadytorun()
+       */
+    }
+
+  /* No.. This CPU will be relinquishing the lock.  But this works
+   * differently if we are performing a context switch from an
+   * interrupt handler and the interrupt handler has established
+   * a critical section.  We can detect this case when
+   * g_cpu_nestcount[me] > 0.
+   */
+
+  else if (g_cpu_nestcount[me] <= 0)
+    {
+      /* Release our hold on the IRQ lock. */
+
+      spin_clrbit(&g_cpu_irqset, me, &g_cpu_irqsetlock,
+                  &g_cpu_irqlock);
+    }
+#endif /* CONFIG_SMP */
+
 }

 #endif /* CONFIG_RR_INTERVAL > 0 || CONFIG_SCHED_SPORADIC || \
- *        CONFIG_SCHED_INSTRUMENTATION */
+ *        CONFIG_SCHED_INSTRUMENTATION || CONFIG_SMP */
diff --git a/sched/sched/sched_tasklistlock.c b/sched/sched/sched_tasklistlock.c
new file mode 100644
index 00000000000..d8946d4b740
--- /dev/null
+++ b/sched/sched/sched_tasklistlock.c
@@ -0,0 +1,133 @@
+/****************************************************************************
+ * sched/sched/sched_tasklistlock.c
+ *
+ * Copyright (C) 2018 Sony Corporation. All rights reserved.
+ * Author: Masayuki Ishikawa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name NuttX nor the names of its contributors may be
+ *    used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include
+#include
+
+#include
+#include
+
+#include "sched/sched.h"
+
+/****************************************************************************
+ * Public Data
+ ****************************************************************************/
+
+/* Spinlock to protect the tasklists */
+
+static volatile spinlock_t g_tasklist_lock SP_SECTION = SP_UNLOCKED;
+
+/* Handles nested calls */
+
+static volatile uint8_t g_tasklist_lock_count[CONFIG_SMP_NCPUS];
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/****************************************************************************
+ * Name: sched_tasklist_lock()
+ *
+ * Description:
+ *   Disable local interrupts and take the global spinlock (g_tasklist_lock)
+ *   if the call counter (g_tasklist_lock_count[cpu]) equals 0.  Then the
+ *   counter on the CPU is incremented to allow nested calls.
+ *
+ *   NOTE: This API is used to protect tasklists in the scheduler, so do
+ *   not use it for other purposes.
+ *
+ * Returned Value:
+ *   An opaque, architecture-specific value that represents the state of
+ *   the interrupts prior to the call to sched_tasklist_lock().
+ ****************************************************************************/
+
+irqstate_t sched_tasklist_lock(void)
+{
+  int me;
+  irqstate_t ret;
+
+  ret = up_irq_save();
+  me = this_cpu();
+
+  if (0 == g_tasklist_lock_count[me])
+    {
+      spin_lock(&g_tasklist_lock);
+    }
+
+  g_tasklist_lock_count[me]++;
+  ASSERT(0 != g_tasklist_lock_count[me]);
+  return ret;
+}
+
+/****************************************************************************
+ * Name: sched_tasklist_unlock()
+ *
+ * Description:
+ *   Decrement the call counter (g_tasklist_lock_count[cpu]) and, if it
+ *   decrements to zero, release the spinlock (g_tasklist_lock) and
+ *   restore the interrupt state to what it was prior to the previous call
+ *   to sched_tasklist_lock().
+ *
+ *   NOTE: This API is used to protect tasklists in the scheduler, so do
+ *   not use it for other purposes.
+ *
+ * Input Parameters:
+ *   lock - The architecture-specific value that represents the state of
+ *          the interrupts prior to the call to sched_tasklist_lock().
+ *
+ * Returned Value:
+ *   None
+ ****************************************************************************/
+
+void sched_tasklist_unlock(irqstate_t lock)
+{
+  int me;
+
+  me = this_cpu();
+
+  ASSERT(0 < g_tasklist_lock_count[me]);
+  g_tasklist_lock_count[me]--;
+
+  if (0 == g_tasklist_lock_count[me])
+    {
+      spin_unlock(&g_tasklist_lock);
+    }
+
+  up_irq_restore(lock);
+}
diff --git a/sched/task/task_exit.c b/sched/task/task_exit.c
index 6a04073f294..f42ef1421ee 100644
--- a/sched/task/task_exit.c
+++ b/sched/task/task_exit.c
@@ -97,6 +97,14 @@ int task_exit(void)
   (void)sched_removereadytorun(dtcb);
   rtcb = this_task();

+#ifdef CONFIG_SMP
+  /* Clearing the global IRQ control was moved from sched_removereadytorun()
+   * to sched_resume_scheduler(), so that API must be called here.
+   */
+
+  sched_resume_scheduler(rtcb);
+#endif
+
   /* We are now in a bad state -- the head of the ready to run task list
    * does not correspond to the thread that is running.  Disabling pre-
    * emption on this TCB and marking the new ready-to-run task as not
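
Usage sketch (illustration only, not part of the patch): the fragment below
shows how the new tasklist protection APIs are intended to be used around
scheduler task list manipulation.  The helper name example_move_to_list and
its parameters are placeholders, and the include list simply assumes the
usual scheduler headers; only sched_tasklist_lock(), sched_tasklist_unlock()
and dq_addlast() come from the patch itself.

#include <nuttx/config.h>
#include <nuttx/irq.h>
#include <nuttx/sched.h>

#include <queue.h>

#include "sched/sched.h"

/* Hypothetical example (not in the patch): move a TCB to the tail of a
 * scheduler task list without racing against the scheduling logic that may
 * be running on another CPU.
 */

static void example_move_to_list(FAR dq_queue_t *list, FAR struct tcb_s *tcb)
{
  /* Disable local interrupts and take g_tasklist_lock.  Nested calls on the
   * same CPU only increment the per-CPU counter, so this is safe even if
   * the caller already holds the tasklist lock.
   */

  irqstate_t lock = sched_tasklist_lock();

  /* The task lists may now be modified safely, e.g. with the queue
   * primitives used throughout the scheduler.
   */

  dq_addlast((FAR dq_entry_t *)tcb, list);

  /* Release the spinlock once the nesting count returns to zero and
   * restore the saved interrupt state.
   */

  sched_tasklist_unlock(lock);
}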