Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 12 Oct 2020 19:56:01 +0000 (12:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 12 Oct 2020 19:56:01 +0000 (12:56 -0700)
Pull scheduler updates from Ingo Molnar:

 - reorganize & clean up the SD* flags definitions and add a bunch of
   sanity checks. These new checks caught quite a few bugs or at least
   inconsistencies, resulting in another set of patches.

 - rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ

 - add a new tracepoint to improve CPU capacity tracking

 - improve overloaded SMP system load-balancing behavior

 - tweak SMT balancing

 - energy-aware scheduling updates

 - NUMA balancing improvements

 - deadline scheduler fixes and improvements

 - CPU isolation fixes

 - misc cleanups, simplifications and smaller optimizations

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...

19 files changed:
MAINTAINERS
arch/arm/kernel/topology.c
include/linux/sched.h
include/linux/sched/mm.h
include/linux/sched/sd_flags.h [new file with mode: 0644]
include/linux/sched/topology.h
include/linux/syscalls.h
include/trace/events/sched.h
include/uapi/linux/membarrier.h
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/membarrier.c
kernel/sched/topology.c
tools/testing/selftests/rseq/param_test.c
tools/testing/selftests/rseq/rseq-x86.h
tools/testing/selftests/rseq/run_param_test.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 10a54b2..80ee365 100644
@@ -15407,6 +15407,7 @@ R:      Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
 R:     Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
 R:     Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
 R:     Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
+R:     Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
 L:     linux-kernel@vger.kernel.org
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index b5adaf7..ef0058d 100644
@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
 static inline void update_cpu_capacity(unsigned int cpuid) {}
 #endif
 
-/*
- * The current assumption is that we can power gate each core independently.
- * This will be superseded by DT binding once available.
- */
-const struct cpumask *cpu_corepower_mask(int cpu)
-{
-       return &cpu_topology[cpu].thread_sibling;
-}
-
 /*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -241,20 +232,6 @@ topology_populated:
        update_siblings_masks(cpuid);
 }
 
-static inline int cpu_corepower_flags(void)
-{
-       return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
-}
-
-static struct sched_domain_topology_level arm_topology[] = {
-#ifdef CONFIG_SCHED_MC
-       { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
-       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
-       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-       { NULL, },
-};
-
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
@@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
        smp_wmb();
 
        parse_dt_topology();
-
-       /* Set scheduler topology descriptor */
-       set_sched_topology(arm_topology);
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3a3aaf0..d383cf0 100644
@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
 /*
  * Per process flags
  */
+#define PF_VCPU                        0x00000001      /* I'm a virtual CPU */
 #define PF_IDLE                        0x00000002      /* I am an IDLE thread */
 #define PF_EXITING             0x00000004      /* Getting shut down */
-#define PF_VCPU                        0x00000010      /* I'm a virtual CPU */
+#define PF_IO_WORKER           0x00000010      /* Task is an IO worker */
 #define PF_WQ_WORKER           0x00000020      /* I'm a workqueue worker */
 #define PF_FORKNOEXEC          0x00000040      /* Forked but didn't exec */
 #define PF_MCE_PROCESS         0x00000080      /* Process policy on mce errors */
@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY      0x04000000      /* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY           0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA      0x10000000      /* All allocation request will have _GFP_MOVABLE cleared */
-#define PF_IO_WORKER           0x20000000      /* Task is an IO worker */
 #define PF_FREEZER_SKIP                0x40000000      /* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_cpu_capacity(struct rq *rq);
 int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index f889e33..15bfb06 100644
@@ -348,10 +348,13 @@ enum {
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                       = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY      = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE            = (1U << 5),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY           = (1U << 6),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                 = (1U << 7),
 };
 
 enum {
        MEMBARRIER_FLAG_SYNC_CORE       = (1U << 0),
+       MEMBARRIER_FLAG_RSEQ            = (1U << 1),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
new file mode 100644
index 0000000..34b21e9
--- /dev/null
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sched-domains (multiprocessor balancing) flag declarations.
+ */
+
+#ifndef SD_FLAG
+# error "Incorrect import of SD flags definitions"
+#endif
+
+/*
+ * Hierarchical metaflags
+ *
+ * SHARED_CHILD: These flags are meant to be set from the base domain upwards.
+ * If a domain has this flag set, all of its children should have it set. This
+ * is usually because the flag describes some shared resource (all CPUs in that
+ * domain share the same resource), or because they are tied to a scheduling
+ * behaviour that we want to disable at some point in the hierarchy for
+ * scalability reasons.
+ *
+ * In those cases it doesn't make sense to have the flag set for a domain but
+ * not have it in (some of) its children: sched domains ALWAYS span their child
+ * domains, so operations done with parent domains will cover CPUs in the lower
+ * child domains.
+ *
+ *
+ * SHARED_PARENT: These flags are meant to be set from the highest domain
+ * downwards. If a domain has this flag set, all of its parents should have it
+ * set. This is usually for topology properties that start to appear above a
+ * certain level (e.g. domain starts spanning CPUs outside of the base CPU's
+ * socket).
+ */
+#define SDF_SHARED_CHILD       0x1
+#define SDF_SHARED_PARENT      0x2
+
+/*
+ * Behavioural metaflags
+ *
+ * NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
+ * more than one group. This is usually for balancing flags (load balancing
+ * involves equalizing a metric between groups), or for flags describing some
+ * shared resource (which would be shared between groups).
+ */
+#define SDF_NEEDS_GROUPS       0x4
+
+/*
+ * Balance when about to become idle
+ *
+ * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Balance on exec
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Balance on fork, clone
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Balance on wakeup
+ *
+ * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Consider waking task on waking CPU.
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ */
+SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
+
+/*
+ * Domain members have different CPU capacities
+ *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ *                asymmetry is detected.
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Domain members share CPU capacity (i.e. SMT)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ *               CPU capacity.
+ * NEEDS_GROUPS: Capacity is shared between groups.
+ */
+SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Domain members share CPU package resources (i.e. caches)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ *               the same cache(s).
+ * NEEDS_GROUPS: Caches are shared between groups.
+ */
+SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Only a single load balancing instance
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
+ *                different level upwards, but it doesn't change that if a
+ *                domain has this flag set, then all of its parents need to have
+ *                it too (otherwise the serialization doesn't make sense).
+ * NEEDS_GROUPS: No point in preserving domain if it has a single group.
+ */
+SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Place busy tasks earlier in the domain
+ *
+ * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
+ *               up, but currently assumed to be set from the base domain
+ *               upwards (see update_top_cache_domain()).
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Prefer to place tasks in a sibling domain
+ *
+ * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
+ * flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
+ *
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
+
+/*
+ * sched_groups of this level overlap
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE.
+ * NEEDS_GROUPS: Overlaps can only exist with more than one group.
+ */
+SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Cross-node balancing
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE.
+ * NEEDS_GROUPS: No point in preserving domain if it has a single group.
+ */
+SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8205112..9ef7bf6 100644
  */
 #ifdef CONFIG_SMP
 
-#define SD_BALANCE_NEWIDLE     0x0001  /* Balance when about to become idle */
-#define SD_BALANCE_EXEC                0x0002  /* Balance on exec */
-#define SD_BALANCE_FORK                0x0004  /* Balance on fork, clone */
-#define SD_BALANCE_WAKE                0x0008  /* Balance on wakeup */
-#define SD_WAKE_AFFINE         0x0010  /* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY    0x0020  /* Domain members have different CPU capacities */
-#define SD_SHARE_CPUCAPACITY   0x0040  /* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN   0x0080  /* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES 0x0100  /* Domain members share CPU pkg resources */
-#define SD_SERIALIZE           0x0200  /* Only a single load balancing instance */
-#define SD_ASYM_PACKING                0x0400  /* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING      0x0800  /* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP             0x1000  /* sched_domains of this level overlap */
-#define SD_NUMA                        0x2000  /* cross-node balancing */
+/* Generate SD flag indexes */
+#define SD_FLAG(name, mflags) __##name,
+enum {
+       #include <linux/sched/sd_flags.h>
+       __SD_FLAG_CNT,
+};
+#undef SD_FLAG
+/* Generate SD flag bits */
+#define SD_FLAG(name, mflags) name = 1 << __##name,
+enum {
+       #include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
+#ifdef CONFIG_SCHED_DEBUG
+
+struct sd_flag_debug {
+       unsigned int meta_flags;
+       char *name;
+};
+extern const struct sd_flag_debug sd_flag_debug[];
+
+#endif
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
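
The two SD_FLAG() redefinitions above treat sd_flags.h as an x-macro list: the first pass numbers the flags, the second turns each index into a bit. As a hand-expanded sketch (illustrative only, not literal preprocessor output), the generated enums come out roughly as:

	/* Illustrative expansion of the #include-based generation above. */
	enum {
		__SD_BALANCE_NEWIDLE,			/* 0 */
		__SD_BALANCE_EXEC,			/* 1 */
		__SD_BALANCE_FORK,			/* 2 */
		/* ... one index per SD_FLAG() entry in sd_flags.h ... */
		__SD_FLAG_CNT,
	};

	enum {
		SD_BALANCE_NEWIDLE	= 1 << __SD_BALANCE_NEWIDLE,	/* 0x0001 */
		SD_BALANCE_EXEC		= 1 << __SD_BALANCE_EXEC,	/* 0x0002 */
		SD_BALANCE_FORK		= 1 << __SD_BALANCE_FORK,	/* 0x0004 */
		/* ... */
	};

So the flag values keep the same shape as the hand-written 0x0001/0x0002/... constants removed above, but the canonical list of flags (and their metaflags) now lives in a single place.
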
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 75ac7f8..06db098 100644
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
                        const char __user *const __user *argv,
                        const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
-asmlinkage long sys_membarrier(int cmd, int flags);
+asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
                                    int fd_out, loff_t __user *off_out,
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fec25b9..c96a433 100644
@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));
 
+DECLARE_TRACE(sched_cpu_capacity_tp,
+       TP_PROTO(struct rq *rq),
+       TP_ARGS(rq));
+
 DECLARE_TRACE(sched_overutilized_tp,
        TP_PROTO(struct root_domain *rd, bool overutilized),
        TP_ARGS(rd, overutilized));
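
Like the existing pelt_*_tp tracepoints, sched_cpu_capacity_tp is a bare DECLARE_TRACE() tracepoint with no associated trace event: it is meant to be attached to from a (typically out-of-tree) module, using the sched_trace_rq_*() accessors since struct rq is private to kernel/sched/. A minimal, hypothetical consumer module could look roughly like this (names and structure are illustrative, not part of the patch):

	#include <linux/module.h>
	#include <linux/sched.h>
	#include <trace/events/sched.h>

	/* Probe signature: a void *cookie first, then the TP_PROTO() arguments. */
	static void probe_cpu_capacity(void *data, struct rq *rq)
	{
		trace_printk("cpu=%d capacity=%d\n",
			     sched_trace_rq_cpu(rq),
			     sched_trace_rq_cpu_capacity(rq));
	}

	static int __init cap_probe_init(void)
	{
		return register_trace_sched_cpu_capacity_tp(probe_cpu_capacity, NULL);
	}

	static void __exit cap_probe_exit(void)
	{
		unregister_trace_sched_cpu_capacity_tp(probe_cpu_capacity, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(cap_probe_init);
	module_exit(cap_probe_exit);
	MODULE_LICENSE("GPL");

The EXPORT_TRACEPOINT_SYMBOL_GPL() and EXPORT_SYMBOL_GPL() additions in kernel/sched/core.c and kernel/sched/fair.c below are what make such a module possible.
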
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 5891d76..7376058 100644
  *                          If this command is not implemented by an
  *                          architecture, -EINVAL is returned.
  *                          Returns 0 on success.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ *                          Ensure the caller thread, upon return from
+ *                          system call, that all its running thread
+ *                          siblings have any currently running rseq
+ *                          critical sections restarted if @flags
+ *                          parameter is 0; if @flags parameter is
+ *                          MEMBARRIER_CMD_FLAG_CPU,
+ *                          then this operation is performed only
+ *                          on CPU indicated by @cpu_id. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited rseq
+ *                          command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
  * @MEMBARRIER_CMD_SHARED:
  *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
  *                          header backward compatibility.
@@ -131,9 +151,15 @@ enum membarrier_cmd {
        MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED               = (1 << 4),
        MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE              = (1 << 5),
        MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE     = (1 << 6),
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ                   = (1 << 7),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ          = (1 << 8),
 
        /* Alias for header backward compatibility. */
        MEMBARRIER_CMD_SHARED                   = MEMBARRIER_CMD_GLOBAL,
 };
 
+enum membarrier_cmd_flag {
+       MEMBARRIER_CMD_FLAG_CPU         = (1 << 0),
+};
+
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
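
Combined with the sys_membarrier() prototype change in include/linux/syscalls.h above, user space drives the new command in two steps: register intent once per process, then issue targeted (or process-wide) rseq fences. A minimal sketch, mirroring what the rseq selftest below does (error handling trimmed, not part of the patch):

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sys_membarrier(int cmd, unsigned int flags, int cpu_id)
	{
		return syscall(__NR_membarrier, cmd, flags, cpu_id);
	}

	int main(void)
	{
		/* Without this registration the RSEQ command returns -EPERM. */
		if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
			return 1;

		/* Restart any rseq critical section of this process running on CPU 3. */
		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
				   MEMBARRIER_CMD_FLAG_CPU, 3))
			return 1;

		/* With flags == 0, every CPU running a thread of this process is targeted. */
		return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0);
	}
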
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d95dc3..8160ab5 100644
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
        return clamp_value / UCLAMP_BUCKET_DELTA;
 }
 
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
-       return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
-}
-
 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
 {
        if (clamp_id == UCLAMP_MIN)
@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
+       unsigned int task_flags;
+
        if (!tsk->state)
                return;
 
+       task_flags = tsk->flags;
        /*
         * If a worker went to sleep, notify and ask workqueue whether
         * it wants to wake up a task to maintain concurrency.
@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
         * in the possible wakeup of a kworker and because wq_worker_sleeping()
         * requires it.
         */
-       if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+       if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
                preempt_disable();
-               if (tsk->flags & PF_WQ_WORKER)
+               if (task_flags & PF_WQ_WORKER)
                        wq_worker_sleeping(tsk);
                else
                        io_wq_worker_sleeping(tsk);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28..6d93f45 100644
@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
         */
        if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
                pi_se = &pi_task->dl;
+               /*
+                * Because of delays in the detection of the overrun of a
+                * thread's runtime, it might be the case that a thread
+                * goes to sleep in a rt mutex with negative runtime. As
+                * a consequence, the thread will be throttled.
+                *
+                * While waiting for the mutex, this thread can also be
+                * boosted via PI, resulting in a thread that is throttled
+                * and boosted at the same time.
+                *
+                * In this case, the boost overrides the throttle.
+                */
+               if (p->dl.dl_throttled) {
+                       /*
+                        * The replenish timer needs to be canceled. No
+                        * problem if it fires concurrently: boosted threads
+                        * are ignored in dl_task_timer().
+                        */
+                       hrtimer_try_to_cancel(&p->dl.dl_timer);
+                       p->dl.dl_throttled = 0;
+               }
        } else if (!dl_prio(p->normal_prio)) {
                /*
-                * Special case in which we have a !SCHED_DEADLINE task
-                * that is going to be deboosted, but exceeds its
-                * runtime while doing so. No point in replenishing
-                * it, as it's going to return back to its original
-                * scheduling class after this.
+                * Special case in which we have a !SCHED_DEADLINE task that is going
+                * to be deboosted, but exceeds its runtime while doing so. No point in
+                * replenishing it, as it's going to return back to its original
+                * scheduling class after this. If it has been throttled, we need to
+                * clear the flag, otherwise the task may wake up as throttled after
+                * being boosted again with no means to replenish the runtime and clear
+                * the throttle.
                 */
+               p->dl.dl_throttled = 0;
                BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
                return;
        }
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c5426..0655524 100644
@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
        entry->proc_handler = proc_handler;
 }
 
+static int sd_ctl_doflags(struct ctl_table *table, int write,
+                         void *buffer, size_t *lenp, loff_t *ppos)
+{
+       unsigned long flags = *(unsigned long *)table->data;
+       size_t data_size = 0;
+       size_t len = 0;
+       char *tmp;
+       int idx;
+
+       if (write)
+               return 0;
+
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               char *name = sd_flag_debug[idx].name;
+
+               /* Name plus whitespace */
+               data_size += strlen(name) + 1;
+       }
+
+       if (*ppos > data_size) {
+               *lenp = 0;
+               return 0;
+       }
+
+       tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               char *name = sd_flag_debug[idx].name;
+
+               len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
+       }
+
+       tmp += *ppos;
+       len -= *ppos;
+
+       if (len > *lenp)
+               len = *lenp;
+       if (len)
+               memcpy(buffer, tmp, len);
+       if (len < *lenp) {
+               ((char *)buffer)[len] = '\n';
+               len++;
+       }
+
+       *lenp = len;
+       *ppos += len;
+
+       kfree(tmp);
+
+       return 0;
+}
+
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
        set_table_entry(&table[2], "busy_factor",         &sd->busy_factor,         sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[3], "imbalance_pct",       &sd->imbalance_pct,       sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[4], "cache_nice_tries",    &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, proc_dointvec_minmax);
+       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, sd_ctl_doflags);
        set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
        set_table_entry(&table[7], "name",                sd->name,            CORENAME_MAX_SIZE, 0444, proc_dostring);
        /* &table[8] is terminator */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a05..aa4c622 100644
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 void post_init_entity_util_avg(struct task_struct *p)
 {
 }
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1504,6 +1504,7 @@ enum numa_type {
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
        unsigned long load;
+       unsigned long runnable;
        unsigned long util;
        /* Total compute capacity of CPUs on a node */
        unsigned long compute_capacity;
@@ -1547,19 +1548,22 @@ struct task_numa_env {
 };
 
 static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
                         struct numa_stats *ns)
 {
        if ((ns->nr_running > ns->weight) &&
-           ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+           (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+            ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
                return node_overloaded;
 
        if ((ns->nr_running < ns->weight) ||
-           ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+           (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+            ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
                return node_has_spare;
 
        return node_fully_busy;
@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
                struct rq *rq = cpu_rq(cpu);
 
                ns->load += cpu_load(rq);
+               ns->runnable += cpu_runnable(rq);
                ns->util += cpu_util(cpu);
                ns->nr_running += rq->cfs.h_nr_running;
                ns->compute_capacity += capacity_of(cpu);
@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
                src_running = env->src_stats.nr_running - 1;
                dst_running = env->dst_stats.nr_running + 1;
                imbalance = max(0, dst_running - src_running);
-               imbalance = adjust_numa_imbalance(imbalance, src_running);
+               imbalance = adjust_numa_imbalance(imbalance, dst_running);
 
                /* Use idle CPU if there is no imbalance */
                if (!imbalance) {
@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                /* commit outstanding execution time */
                if (cfs_rq->curr == se)
                        update_curr(cfs_rq);
-               account_entity_dequeue(cfs_rq, se);
+               update_load_sub(&cfs_rq->load, se->load.weight);
        }
        dequeue_load_avg(cfs_rq, se);
 
@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
        enqueue_load_avg(cfs_rq, se);
        if (se->on_rq)
-               account_entity_enqueue(cfs_rq, se);
+               update_load_add(&cfs_rq->load, se->load.weight);
 
 }
 
@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 /**
  * update_tg_load_avg - update the tg's load avg
  * @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
  *
  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
  * However, because tg->load_avg is a global value there are performance
@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
  *
  * Updating tg's load_avg is necessary before update_cfs_share().
  */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
        if (cfs_rq->tg == &root_task_group)
                return;
 
-       if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+       if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }
@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                 * IOW we're enqueueing a task on a new CPU.
                 */
                attach_entity_load_avg(cfs_rq, se);
-               update_tg_load_avg(cfs_rq, 0);
+               update_tg_load_avg(cfs_rq);
 
        } else if (decayed) {
                cfs_rq_util_change(cfs_rq, 0);
 
                if (flags & UPDATE_TG)
-                       update_tg_load_avg(cfs_rq, 0);
+                       update_tg_load_avg(cfs_rq);
        }
 }
 
@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                        se = second;
        }
 
-       /*
-        * Prefer last buddy, try to return the CPU to a preempted task.
-        */
-       if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
-               se = cfs_rq->last;
-
-       /*
-        * Someone really wants this to run. If it's not unfair, run it.
-        */
-       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+               /*
+                * Someone really wants this to run. If it's not unfair, run it.
+                */
                se = cfs_rq->next;
+       } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+               /*
+                * Prefer last buddy, try to return the CPU to a preempted task.
+                */
+               se = cfs_rq->last;
+       }
 
        clear_buddies(cfs_rq, se);
 
@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 /*
  * Scan the local SMT mask for idle CPUs.
  */
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
        int cpu;
 
@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
                return -1;
 
        for_each_cpu(cpu, cpu_smt_mask(target)) {
-               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+               if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+                   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
                        continue;
                if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                        return cpu;
@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
        return -1;
 }
 
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
        return -1;
 }
@@ -6274,7 +6279,7 @@ symmetric:
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
-       i = select_idle_smt(p, target);
+       i = select_idle_smt(p, sd, target);
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
-                       spare_cap = cpu_cap - util;
+                       spare_cap = cpu_cap;
+                       lsub_positive(&spare_cap, util);
 
                        /*
                         * Skip CPUs that cannot satisfy the capacity request.
@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
        if (unlikely(task_has_idle_policy(p)))
                return 0;
 
+       /* SMT siblings share cache */
+       if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+               return 0;
+
        /*
         * Buddy candidates are cache hot:
         */
@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
                         * scheduler fails to find a good waiting task to
                         * migrate.
                         */
-                       if (load/2 > env->imbalance &&
-                           env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+
+                       if ((load >> env->sd->nr_balance_failed) > env->imbalance)
                                goto next;
 
                        env->imbalance -= load;
@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
                struct sched_entity *se;
 
                if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
-                       update_tg_load_avg(cfs_rq, 0);
+                       update_tg_load_avg(cfs_rq);
 
                        if (cfs_rq == &rq->cfs)
                                decayed = true;
@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                capacity = 1;
 
        cpu_rq(cpu)->cpu_capacity = capacity;
+       trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
        sdg->sgc->capacity = capacity;
        sdg->sgc->min_capacity = capacity;
        sdg->sgc->max_capacity = capacity;
@@ -8957,7 +8969,7 @@ next_group:
        }
 }
 
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
 {
        unsigned int imbalance_min;
 
@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
         * tasks that remain local when the source domain is almost idle.
         */
        imbalance_min = 2;
-       if (src_nr_running <= imbalance_min)
+       if (nr_running <= imbalance_min)
                return 0;
 
        return imbalance;
@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 
        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);
+
+       /*
+        * Reduce likelihood of busy balancing at higher domains racing with
+        * balancing at lower domains by preventing their balancing periods
+        * from being multiples of each other.
+        */
+       if (cpu_busy)
+               interval -= 1;
+
        interval = clamp(interval, 1UL, max_load_balance_interval);
 
        return interval;
@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
        /* Catch up with the cfs_rq and remove our load when we leave */
        update_load_avg(cfs_rq, se, 0);
        detach_entity_load_avg(cfs_rq, se);
-       update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq);
        propagate_entity_cfs_rq(se);
 }
 
@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
        /* Synchronize entity with its cfs_rq */
        update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
        attach_entity_load_avg(cfs_rq, se);
-       update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq);
        propagate_entity_cfs_rq(se);
 }
 
@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
 }
 EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
 
+int sched_trace_rq_cpu_capacity(struct rq *rq)
+{
+       return rq ?
+#ifdef CONFIG_SMP
+               rq->cpu_capacity
+#else
+               SCHED_CAPACITY_SCALE
+#endif
+               : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
+
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
 {
 #ifdef CONFIG_SMP
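
One detail in the detach_tasks() hunk above deserves a note: the old "load/2 > env->imbalance" cutoff (additionally gated by cache_nice_tries) is replaced by "(load >> env->sd->nr_balance_failed) > env->imbalance", so every failed balance attempt doubles the per-task load the balancer is willing to migrate. A hypothetical, self-contained illustration of the new rule (not from the patch):

	/*
	 * With imbalance == 1024:
	 *   nr_balance_failed == 0 -> tasks with load above 1024 are skipped
	 *   nr_balance_failed == 1 -> tasks with load above roughly 2048 are skipped
	 *   nr_balance_failed == 2 -> tasks with load above roughly 4096 are skipped
	 */
	static inline int lb_would_skip_task(unsigned long load,
					     unsigned long imbalance,
					     unsigned int nr_balance_failed)
	{
		return (load >> nr_balance_failed) > imbalance;
	}
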
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd9..68d369c 100644
@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
 SCHED_FEAT(RT_PUSH_IPI, true)
 #endif
 
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a..e23e74d 100644
 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
 #endif
 
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK          \
+       (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ                  \
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+#else
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK  0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK                                         \
        (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
        | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
        smp_mb();       /* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_rseq(void *info)
+{
+       rseq_preempt(current);
+}
+
 static void ipi_sync_rq_state(void *info)
 {
        struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
        return 0;
 }
 
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
 {
-       int cpu;
        cpumask_var_t tmpmask;
        struct mm_struct *mm = current->mm;
+       smp_call_func_t ipi_func = ipi_mb;
 
-       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+       if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
+       } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+               if (!IS_ENABLED(CONFIG_RSEQ))
+                       return -EINVAL;
+               if (!(atomic_read(&mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+                       return -EPERM;
+               ipi_func = ipi_rseq;
        } else {
+               WARN_ON_ONCE(flags);
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
                        return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
         */
        smp_mb();       /* system call entry is not a mb. */
 
-       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+       if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;
 
        cpus_read_lock();
-       rcu_read_lock();
-       for_each_online_cpu(cpu) {
+
+       if (cpu_id >= 0) {
                struct task_struct *p;
 
-               /*
-                * Skipping the current CPU is OK even through we can be
-                * migrated at any point. The current CPU, at the point
-                * where we read raw_smp_processor_id(), is ensured to
-                * be in program order with respect to the caller
-                * thread. Therefore, we can skip this CPU from the
-                * iteration.
-                */
-               if (cpu == raw_smp_processor_id())
-                       continue;
-               p = rcu_dereference(cpu_rq(cpu)->curr);
-               if (p && p->mm == mm)
-                       __cpumask_set_cpu(cpu, tmpmask);
+               if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+                       goto out;
+               if (cpu_id == raw_smp_processor_id())
+                       goto out;
+               rcu_read_lock();
+               p = rcu_dereference(cpu_rq(cpu_id)->curr);
+               if (!p || p->mm != mm) {
+                       rcu_read_unlock();
+                       goto out;
+               }
+               rcu_read_unlock();
+       } else {
+               int cpu;
+
+               rcu_read_lock();
+               for_each_online_cpu(cpu) {
+                       struct task_struct *p;
+
+                       /*
+                        * Skipping the current CPU is OK even through we can be
+                        * migrated at any point. The current CPU, at the point
+                        * where we read raw_smp_processor_id(), is ensured to
+                        * be in program order with respect to the caller
+                        * thread. Therefore, we can skip this CPU from the
+                        * iteration.
+                        */
+                       if (cpu == raw_smp_processor_id())
+                               continue;
+                       p = rcu_dereference(cpu_rq(cpu)->curr);
+                       if (p && p->mm == mm)
+                               __cpumask_set_cpu(cpu, tmpmask);
+               }
+               rcu_read_unlock();
        }
-       rcu_read_unlock();
 
        preempt_disable();
-       smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+       if (cpu_id >= 0)
+               smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+       else
+               smp_call_function_many(tmpmask, ipi_func, NULL, 1);
        preempt_enable();
 
-       free_cpumask_var(tmpmask);
+out:
+       if (cpu_id < 0)
+               free_cpumask_var(tmpmask);
        cpus_read_unlock();
 
        /*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
            set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
            ret;
 
-       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+       if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                ready_state =
                        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+       } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+               if (!IS_ENABLED(CONFIG_RSEQ))
+                       return -EINVAL;
+               ready_state =
+                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+       } else {
+               WARN_ON_ONCE(flags);
        }
 
        /*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
                return 0;
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
                set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+       if (flags & MEMBARRIER_FLAG_RSEQ)
+               set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
        atomic_or(set_state, &mm->membarrier_state);
        ret = sync_runqueues_membarrier_state(mm);
        if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd:    Takes command values defined in enum membarrier_cmd.
+ * @flags:  Currently needs to be 0 for all commands other than
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ *          contains the CPU on which to interrupt (= restart)
+ *          the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ *          RSEQ CS should be interrupted (@cmd must be
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
  *
  * If this system call is not implemented, -ENOSYS is returned. If the
  * command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
  *        smp_mb()           X           O            O
  *        sys_membarrier()   O           O            O
  */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
 {
-       if (unlikely(flags))
-               return -EINVAL;
+       switch (cmd) {
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+               if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+                       return -EINVAL;
+               break;
+       default:
+               if (unlikely(flags))
+                       return -EINVAL;
+       }
+
+       if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+               cpu_id = -1;
+
        switch (cmd) {
        case MEMBARRIER_CMD_QUERY:
        {
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
        case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
                return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-               return membarrier_private_expedited(0);
+               return membarrier_private_expedited(0, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
                return membarrier_register_private_expedited(0);
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
-               return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+               return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+               return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+       case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+               return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
        default:
                return -EINVAL;
        }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1bd7e3a..dd77702 100644
@@ -25,10 +25,18 @@ static inline bool sched_debug(void)
        return sched_debug_enabled;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  struct cpumask *groupmask)
 {
        struct sched_group *group = sd->groups;
+       unsigned long flags = sd->flags;
+       unsigned int idx;
 
        cpumask_clear(groupmask);
 
@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
        }
 
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               unsigned int flag = BIT(idx);
+               unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+               if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+                   !(sd->child->flags & flag))
+                       printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+                              sd_flag_debug[idx].name);
+
+               if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+                   !(sd->parent->flags & flag))
+                       printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+                              sd_flag_debug[idx].name);
+       }
+
        printk(KERN_DEBUG "%*s groups:", level + 1, "");
        do {
                if (!group) {
@@ -137,22 +160,22 @@ static inline bool sched_debug(void)
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
        if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
 
        /* Following flags need at least 2 groups */
-       if (sd->flags & (SD_BALANCE_NEWIDLE |
-                        SD_BALANCE_FORK |
-                        SD_BALANCE_EXEC |
-                        SD_SHARE_CPUCAPACITY |
-                        SD_ASYM_CPUCAPACITY |
-                        SD_SHARE_PKG_RESOURCES |
-                        SD_SHARE_POWERDOMAIN)) {
-               if (sd->groups != sd->groups->next)
-                       return 0;
-       }
+       if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+           (sd->groups != sd->groups->next))
+               return 0;
 
        /* Following flags don't use groups */
        if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                return 0;
 
        /* Flags needing groups don't count if only 1 group in parent */
-       if (parent->groups == parent->groups->next) {
-               pflags &= ~(SD_BALANCE_NEWIDLE |
-                           SD_BALANCE_FORK |
-                           SD_BALANCE_EXEC |
-                           SD_ASYM_CPUCAPACITY |
-                           SD_SHARE_CPUCAPACITY |
-                           SD_SHARE_PKG_RESOURCES |
-                           SD_PREFER_SIBLING |
-                           SD_SHARE_POWERDOMAIN);
-               if (nr_node_ids == 1)
-                       pflags &= ~SD_SERIALIZE;
-       }
+       if (parent->groups == parent->groups->next)
+               pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
        if (~cflags & pflags)
                return 0;
 
@@ -1292,7 +1306,6 @@ int __read_mostly         node_reclaim_distance = RECLAIM_DISTANCE;
  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
  *   SD_SHARE_PKG_RESOURCES - describes shared caches
  *   SD_NUMA                - describes NUMA topologies
- *   SD_SHARE_POWERDOMAIN   - describes shared power domain
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1303,8 +1316,7 @@ int __read_mostly         node_reclaim_distance = RECLAIM_DISTANCE;
        (SD_SHARE_CPUCAPACITY   |       \
         SD_SHARE_PKG_RESOURCES |       \
         SD_NUMA                |       \
-        SD_ASYM_PACKING        |       \
-        SD_SHARE_POWERDOMAIN)
+        SD_ASYM_PACKING)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
        *sd = (struct sched_domain){
                .min_interval           = sd_weight,
                .max_interval           = 2*sd_weight,
-               .busy_factor            = 32,
-               .imbalance_pct          = 125,
+               .busy_factor            = 16,
+               .imbalance_pct          = 117,
 
                .cache_nice_tries       = 0,
 
@@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
        /* Set up domains for CPUs specified by the cpu_map: */
        for_each_cpu(i, cpu_map) {
                struct sched_domain_topology_level *tl;
+               int dflags = 0;
 
                sd = NULL;
                for_each_sd_topology(tl) {
-                       int dflags = 0;
-
                        if (tl == tl_asym) {
                                dflags |= SD_ASYM_CPUCAPACITY;
                                has_asym = true;
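
The SD_DEGENERATE_GROUPS_MASK generation near the top of this file's hunks reuses the sd_flags.h x-macro with yet another SD_FLAG() definition: each flag contributes its bit multiplied by 1 if its metaflags contain SDF_NEEDS_GROUPS, by 0 otherwise. Hand-expanded for illustration (not literal preprocessor output), it amounts to:

	/* Illustrative hand-expansion of SD_DEGENERATE_GROUPS_MASK. */
	static const unsigned int SD_DEGENERATE_GROUPS_MASK =
		(SD_BALANCE_NEWIDLE     * 1) |
		(SD_BALANCE_EXEC        * 1) |
		(SD_BALANCE_FORK        * 1) |
		(SD_BALANCE_WAKE        * 1) |
		(SD_WAKE_AFFINE         * 0) |	/* no SDF_NEEDS_GROUPS: drops out */
		(SD_ASYM_CPUCAPACITY    * 1) |
		(SD_SHARE_CPUCAPACITY   * 1) |
		(SD_SHARE_PKG_RESOURCES * 1) |
		(SD_SERIALIZE           * 1) |
		(SD_ASYM_PACKING        * 1) |
		(SD_PREFER_SIBLING      * 1) |
		(SD_OVERLAP             * 1) |
		(SD_NUMA                * 1) |
		0;

which is why sd_degenerate() and sd_parent_degenerate() above no longer have to enumerate the "needs at least 2 groups" flags by hand.
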
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index e8a657a..3845890 100644
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: LGPL-2.1
 #define _GNU_SOURCE
 #include <assert.h>
+#include <linux/membarrier.h>
 #include <pthread.h>
 #include <sched.h>
+#include <stdatomic.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -1131,6 +1133,220 @@ static int set_signal_handler(void)
        return ret;
 }
 
+struct test_membarrier_thread_args {
+       int stop;
+       intptr_t percpu_list_ptr;
+};
+
+/* Worker threads modify data in their "active" percpu lists. */
+void *test_membarrier_worker_thread(void *arg)
+{
+       struct test_membarrier_thread_args *args =
+               (struct test_membarrier_thread_args *)arg;
+       const int iters = opt_reps;
+       int i;
+
+       if (rseq_register_current_thread()) {
+               fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+
+       /* Wait for initialization. */
+       while (!atomic_load(&args->percpu_list_ptr)) {}
+
+       for (i = 0; i < iters; ++i) {
+               int ret;
+
+               do {
+                       int cpu = rseq_cpu_start();
+
+                       ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
+                               sizeof(struct percpu_list_entry) * cpu, 1, cpu);
+               } while (rseq_unlikely(ret));
+       }
+
+       if (rseq_unregister_current_thread()) {
+               fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+       return NULL;
+}
+
+void test_membarrier_init_percpu_list(struct percpu_list *list)
+{
+       int i;
+
+       memset(list, 0, sizeof(*list));
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_list_node *node;
+
+               node = malloc(sizeof(*node));
+               assert(node);
+               node->data = 0;
+               node->next = NULL;
+               list->c[i].head = node;
+       }
+}
+
+void test_membarrier_free_percpu_list(struct percpu_list *list)
+{
+       int i;
+
+       for (i = 0; i < CPU_SETSIZE; i++)
+               free(list->c[i].head);
+}
+
+static int sys_membarrier(int cmd, int flags, int cpu_id)
+{
+       return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+
+/*
+ * The manager thread swaps per-cpu lists that worker threads see,
+ * and validates that there are no unexpected modifications.
+ */
+void *test_membarrier_manager_thread(void *arg)
+{
+       struct test_membarrier_thread_args *args =
+               (struct test_membarrier_thread_args *)arg;
+       struct percpu_list list_a, list_b;
+       intptr_t expect_a = 0, expect_b = 0;
+       int cpu_a = 0, cpu_b = 0;
+
+       if (rseq_register_current_thread()) {
+               fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+
+       /* Init lists. */
+       test_membarrier_init_percpu_list(&list_a);
+       test_membarrier_init_percpu_list(&list_b);
+
+       atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+
+       while (!atomic_load(&args->stop)) {
+               /* list_a is "active". */
+               cpu_a = rand() % CPU_SETSIZE;
+               /*
+                * As list_b is "inactive", we should never see changes
+                * to list_b.
+                */
+               if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
+                       fprintf(stderr, "Membarrier test failed\n");
+                       abort();
+               }
+
+               /* Make list_b "active". */
+               atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
+               if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+                                       MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
+                               errno != ENXIO /* missing CPU */) {
+                       perror("sys_membarrier");
+                       abort();
+               }
+               /*
+                * Cpu A should now only modify list_b, so the values
+                * in list_a should be stable.
+                */
+               expect_a = atomic_load(&list_a.c[cpu_a].head->data);
+
+               cpu_b = rand() % CPU_SETSIZE;
+               /*
+                * As list_a is "inactive", we should never see changes
+                * to list_a.
+                */
+               if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
+                       fprintf(stderr, "Membarrier test failed\n");
+                       abort();
+               }
+
+               /* Make list_a "active". */
+               atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+               if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+                                       MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
+                               errno != ENXIO /* missing CPU*/) {
+                       perror("sys_membarrier");
+                       abort();
+               }
+               /* Remember a value from list_b. */
+               expect_b = atomic_load(&list_b.c[cpu_b].head->data);
+       }
+
+       test_membarrier_free_percpu_list(&list_a);
+       test_membarrier_free_percpu_list(&list_b);
+
+       if (rseq_unregister_current_thread()) {
+               fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+       return NULL;
+}
+
+/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */
+#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+void test_membarrier(void)
+{
+       const int num_threads = opt_threads;
+       struct test_membarrier_thread_args thread_args;
+       pthread_t worker_threads[num_threads];
+       pthread_t manager_thread;
+       int i, ret;
+
+       if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
+               perror("sys_membarrier");
+               abort();
+       }
+
+       thread_args.stop = 0;
+       thread_args.percpu_list_ptr = 0;
+       ret = pthread_create(&manager_thread, NULL,
+                       test_membarrier_manager_thread, &thread_args);
+       if (ret) {
+               errno = ret;
+               perror("pthread_create");
+               abort();
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_create(&worker_threads[i], NULL,
+                               test_membarrier_worker_thread, &thread_args);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(worker_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       atomic_store(&thread_args.stop, 1);
+       ret = pthread_join(manager_thread, NULL);
+       if (ret) {
+               errno = ret;
+               perror("pthread_join");
+               abort();
+       }
+}
+#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
+void test_membarrier(void)
+{
+       fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
+                       "Skipping membarrier test.\n");
+}
+#endif
+
 static void show_usage(int argc, char **argv)
 {
        printf("Usage : %s <OPTIONS>\n",
@@ -1153,7 +1369,7 @@ static void show_usage(int argc, char **argv)
        printf("        [-r N] Number of repetitions per thread (default 5000)\n");
        printf("        [-d] Disable rseq system call (no initialization)\n");
        printf("        [-D M] Disable rseq for each M threads\n");
-       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
        printf("        [-M] Push into buffer and memcpy buffer with memory barriers.\n");
        printf("        [-v] Verbose output.\n");
        printf("        [-h] Show this help.\n");
@@ -1268,6 +1484,7 @@ int main(int argc, char **argv)
                        case 'i':
                        case 'b':
                        case 'm':
+                       case 'r':
                                break;
                        default:
                                show_usage(argc, argv);
@@ -1320,6 +1537,10 @@ int main(int argc, char **argv)
                printf_verbose("counter increment\n");
                test_percpu_inc();
                break;
+       case 'r':
+               printf_verbose("membarrier\n");
+               test_membarrier();
+               break;
        }
        if (!opt_disable_rseq && rseq_unregister_current_thread())
                abort();
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index b2da600..6404115 100644
@@ -279,6 +279,63 @@ error1:
 #endif
 }
 
+#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+
+/*
+ *   pval = *(ptr+off)
+ *  *pval += inc;
+ */
+static inline __attribute__((always_inline))
+int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
+{
+       RSEQ_INJECT_C(9)
+
+       __asm__ __volatile__ goto (
+               RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+               /* Start rseq by storing table entry pointer into rseq_cs. */
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
+               RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
+#endif
+               /* get p+v */
+               "movq %[ptr], %%rbx\n\t"
+               "addq %[off], %%rbx\n\t"
+               /* get pv */
+               "movq (%%rbx), %%rcx\n\t"
+               /* *pv += inc */
+               "addq %[inc], (%%rcx)\n\t"
+               "2:\n\t"
+               RSEQ_INJECT_ASM(4)
+               RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+               : /* gcc asm goto does not allow outputs */
+               : [cpu_id]              "r" (cpu),
+                 [rseq_abi]            "r" (&__rseq_abi),
+                 /* final store input */
+                 [ptr]                 "m" (*ptr),
+                 [off]                 "er" (off),
+                 [inc]                 "er" (inc)
+               : "memory", "cc", "rax", "rbx", "rcx"
+                 RSEQ_INJECT_CLOBBER
+               : abort
+#ifdef RSEQ_COMPARE_TWICE
+                 , error1
+#endif
+       );
+       return 0;
+abort:
+       RSEQ_INJECT_FAILED
+       return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+       rseq_bug("cpu_id comparison failed");
+#endif
+}
+
 static inline __attribute__((always_inline))
 int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
                                 intptr_t *v2, intptr_t newv2,
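
Stripped of the rseq machinery, the fast path of rseq_offset_deref_addv() above is just a dereference plus an increment; the value of wrapping it in an rseq critical section is that preemption, migration off @cpu or signal delivery aborts it, and the new membarrier command can force such an abort remotely. An illustrative plain-C rendering of the asm (the helper name is made up for this sketch, it is not part of the patch):

	static inline void offset_deref_addv_plain(intptr_t *ptr, off_t off,
						   intptr_t inc)
	{
		intptr_t base = *ptr;				/* movq %[ptr], %%rbx                   */
		intptr_t *pval = *(intptr_t **)(base + off);	/* addq %[off], %%rbx; movq (%%rbx), %%rcx */
		*pval += inc;					/* addq %[inc], (%%rcx)                 */
	}

In the param_test.c worker above, base is the currently "active" per-CPU list published through args->percpu_list_ptr, and the increment lands in list->c[cpu].head->data — exactly the accesses the manager thread fences with MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ before validating the "inactive" list.
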
diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index e426304..f51bc83 100755
@@ -15,6 +15,7 @@ TEST_LIST=(
        "-T m"
        "-T m -M"
        "-T i"
+       "-T r"
 )
 
 TEST_NAME=(
@@ -25,6 +26,7 @@ TEST_NAME=(
        "memcpy"
        "memcpy with barrier"
        "increment"
+       "membarrier"
 )
 IFS="$OLDIFS"