Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 12 Oct 2020 19:56:01 +0000 (12:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 12 Oct 2020 19:56:01 +0000 (12:56 -0700)
Pull scheduler updates from Ingo Molnar:

 - reorganize & clean up the SD* flags definitions and add a bunch of
   sanity checks. These new checks caught quite a few bugs or at least
   inconsistencies, resulting in another set of patches.

 - rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ

 - add a new tracepoint to improve CPU capacity tracking

 - improve overloaded SMP system load-balancing behavior

 - tweak SMT balancing

 - energy-aware scheduling updates

 - NUMA balancing improvements

 - deadline scheduler fixes and improvements

 - CPU isolation fixes

 - misc cleanups, simplifications and smaller optimizations

* tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits)
  sched/deadline: Unthrottle PI boosted threads while enqueuing
  sched/debug: Add new tracepoint to track cpu_capacity
  sched/fair: Tweak pick_next_entity()
  rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
  sched/fair: Use dst group while checking imbalance for NUMA balancer
  sched/fair: Reduce busy load balance interval
  sched/fair: Minimize concurrent LBs between domain level
  sched/fair: Reduce minimal imbalance threshold
  sched/fair: Relax constraint on task's load during load balance
  sched/fair: Remove the force parameter of update_tg_load_avg()
  sched/fair: Fix wrong cpu selecting from isolated domain
  sched: Remove unused inline function uclamp_bucket_base_value()
  sched/rt: Disable RT_RUNTIME_SHARE by default
  sched/deadline: Fix stale throttling on de-/boosted tasks
  sched/numa: Use runnable_avg to classify node
  sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL
  MAINTAINERS: Add myself as SCHED_DEADLINE reviewer
  sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h
  ...

19 files changed:
MAINTAINERS
arch/arm/kernel/topology.c
include/linux/sched.h
include/linux/sched/mm.h
include/linux/sched/sd_flags.h [new file with mode: 0644]
include/linux/sched/topology.h
include/linux/syscalls.h
include/trace/events/sched.h
include/uapi/linux/membarrier.h
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/membarrier.c
kernel/sched/topology.c
tools/testing/selftests/rseq/param_test.c
tools/testing/selftests/rseq/rseq-x86.h
tools/testing/selftests/rseq/run_param_test.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 10a54b2..80ee365 100644
@@ -15407,6 +15407,7 @@ R:      Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
 R:     Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
 R:     Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
 R:     Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
+R:     Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
 L:     linux-kernel@vger.kernel.org
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index b5adaf7..ef0058d 100644
@@ -177,15 +177,6 @@ static inline void parse_dt_topology(void) {}
 static inline void update_cpu_capacity(unsigned int cpuid) {}
 #endif
 
-/*
- * The current assumption is that we can power gate each core independently.
- * This will be superseded by DT binding once available.
- */
-const struct cpumask *cpu_corepower_mask(int cpu)
-{
-       return &cpu_topology[cpu].thread_sibling;
-}
-
 /*
  * store_cpu_topology is called at boot when only one cpu is running
  * and with the mutex cpu_hotplug.lock locked, when several cpus have booted,
@@ -241,20 +232,6 @@ topology_populated:
        update_siblings_masks(cpuid);
 }
 
-static inline int cpu_corepower_flags(void)
-{
-       return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
-}
-
-static struct sched_domain_topology_level arm_topology[] = {
-#ifdef CONFIG_SCHED_MC
-       { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
-       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
-       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-       { NULL, },
-};
-
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
@@ -265,7 +242,4 @@ void __init init_cpu_topology(void)
        smp_wmb();
 
        parse_dt_topology();
-
-       /* Set scheduler topology descriptor */
-       set_sched_topology(arm_topology);
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3a3aaf0..d383cf0 100644
@@ -1491,9 +1491,10 @@ extern struct pid *cad_pid;
 /*
  * Per process flags
  */
+#define PF_VCPU                        0x00000001      /* I'm a virtual CPU */
 #define PF_IDLE                        0x00000002      /* I am an IDLE thread */
 #define PF_EXITING             0x00000004      /* Getting shut down */
-#define PF_VCPU                        0x00000010      /* I'm a virtual CPU */
+#define PF_IO_WORKER           0x00000010      /* Task is an IO worker */
 #define PF_WQ_WORKER           0x00000020      /* I'm a workqueue worker */
 #define PF_FORKNOEXEC          0x00000040      /* Forked but didn't exec */
 #define PF_MCE_PROCESS         0x00000080      /* Process policy on mce errors */
@@ -1517,7 +1518,6 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY      0x04000000      /* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY           0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA      0x10000000      /* All allocation request will have _GFP_MOVABLE cleared */
-#define PF_IO_WORKER           0x20000000      /* Task is an IO worker */
 #define PF_FREEZER_SKIP                0x40000000      /* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
@@ -2046,6 +2046,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_cpu_capacity(struct rq *rq);
 int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index f889e33..15bfb06 100644
@@ -348,10 +348,13 @@ enum {
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                       = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY      = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE            = (1U << 5),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY           = (1U << 6),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                 = (1U << 7),
 };
 
 enum {
        MEMBARRIER_FLAG_SYNC_CORE       = (1U << 0),
+       MEMBARRIER_FLAG_RSEQ            = (1U << 1),
 };
 
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
new file mode 100644
index 0000000..34b21e9
--- /dev/null
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sched-domains (multiprocessor balancing) flag declarations.
+ */
+
+#ifndef SD_FLAG
+# error "Incorrect import of SD flags definitions"
+#endif
+
+/*
+ * Hierarchical metaflags
+ *
+ * SHARED_CHILD: These flags are meant to be set from the base domain upwards.
+ * If a domain has this flag set, all of its children should have it set. This
+ * is usually because the flag describes some shared resource (all CPUs in that
+ * domain share the same resource), or because they are tied to a scheduling
+ * behaviour that we want to disable at some point in the hierarchy for
+ * scalability reasons.
+ *
+ * In those cases it doesn't make sense to have the flag set for a domain but
+ * not have it in (some of) its children: sched domains ALWAYS span their child
+ * domains, so operations done with parent domains will cover CPUs in the lower
+ * child domains.
+ *
+ *
+ * SHARED_PARENT: These flags are meant to be set from the highest domain
+ * downwards. If a domain has this flag set, all of its parents should have it
+ * set. This is usually for topology properties that start to appear above a
+ * certain level (e.g. domain starts spanning CPUs outside of the base CPU's
+ * socket).
+ */
+#define SDF_SHARED_CHILD       0x1
+#define SDF_SHARED_PARENT      0x2
+
+/*
+ * Behavioural metaflags
+ *
+ * NEEDS_GROUPS: These flags are only relevant if the domain they are set on has
+ * more than one group. This is usually for balancing flags (load balancing
+ * involves equalizing a metric between groups), or for flags describing some
+ * shared resource (which would be shared between groups).
+ */
+#define SDF_NEEDS_GROUPS       0x4
+
+/*
+ * Balance when about to become idle
+ *
+ * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Balance on exec
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Balance on fork, clone
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Balance on wakeup
+ *
+ * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level.
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Consider waking task on waking CPU.
+ *
+ * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level.
+ */
+SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
+
+/*
+ * Domain members have different CPU capacities
+ *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ *                asymmetry is detected.
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Domain members share CPU capacity (i.e. SMT)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ *               CPU capacity.
+ * NEEDS_GROUPS: Capacity is shared between groups.
+ */
+SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Domain members share CPU package resources (i.e. caches)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ *               the same cache(s).
+ * NEEDS_GROUPS: Caches are shared between groups.
+ */
+SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Only a single load balancing instance
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a
+ *                different level upwards, but it doesn't change that if a
+ *                domain has this flag set, then all of its parents need to have
+ *                it too (otherwise the serialization doesn't make sense).
+ * NEEDS_GROUPS: No point in preserving domain if it has a single group.
+ */
+SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Place busy tasks earlier in the domain
+ *
+ * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
+ *               up, but currently assumed to be set from the base domain
+ *               upwards (see update_top_cache_domain()).
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
+ * Prefer to place tasks in a sibling domain
+ *
+ * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD
+ * flag, but cleared below domains with SD_ASYM_CPUCAPACITY.
+ *
+ * NEEDS_GROUPS: Load balancing flag.
+ */
+SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
+
+/*
+ * sched_groups of this level overlap
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE.
+ * NEEDS_GROUPS: Overlaps can only exist with more than one group.
+ */
+SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
+ * Cross-node balancing
+ *
+ * SHARED_PARENT: Set for all NUMA levels above NODE.
+ * NEEDS_GROUPS: No point in preserving domain if it has a single group.
+ */
+SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8205112..9ef7bf6 100644
  */
 #ifdef CONFIG_SMP
 
-#define SD_BALANCE_NEWIDLE     0x0001  /* Balance when about to become idle */
-#define SD_BALANCE_EXEC                0x0002  /* Balance on exec */
-#define SD_BALANCE_FORK                0x0004  /* Balance on fork, clone */
-#define SD_BALANCE_WAKE                0x0008  /* Balance on wakeup */
-#define SD_WAKE_AFFINE         0x0010  /* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY    0x0020  /* Domain members have different CPU capacities */
-#define SD_SHARE_CPUCAPACITY   0x0040  /* Domain members share CPU capacity */
-#define SD_SHARE_POWERDOMAIN   0x0080  /* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES 0x0100  /* Domain members share CPU pkg resources */
-#define SD_SERIALIZE           0x0200  /* Only a single load balancing instance */
-#define SD_ASYM_PACKING                0x0400  /* Place busy groups earlier in the domain */
-#define SD_PREFER_SIBLING      0x0800  /* Prefer to place tasks in a sibling domain */
-#define SD_OVERLAP             0x1000  /* sched_domains of this level overlap */
-#define SD_NUMA                        0x2000  /* cross-node balancing */
+/* Generate SD flag indexes */
+#define SD_FLAG(name, mflags) __##name,
+enum {
+       #include <linux/sched/sd_flags.h>
+       __SD_FLAG_CNT,
+};
+#undef SD_FLAG
+/* Generate SD flag bits */
+#define SD_FLAG(name, mflags) name = 1 << __##name,
+enum {
+       #include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
+#ifdef CONFIG_SCHED_DEBUG
+
+struct sd_flag_debug {
+       unsigned int meta_flags;
+       char *name;
+};
+extern const struct sd_flag_debug sd_flag_debug[];
+
+#endif
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
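
The two SD_FLAG() redefinitions above treat sd_flags.h as an x-macro list: the first pass numbers the flags, the second turns each index into a bit. As a hand-expanded sketch (illustrative only, not literal preprocessor output), the generated enums come out roughly as:

	/* Illustrative expansion of the #include-based generation above. */
	enum {
		__SD_BALANCE_NEWIDLE,			/* 0 */
		__SD_BALANCE_EXEC,			/* 1 */
		__SD_BALANCE_FORK,			/* 2 */
		/* ... one index per SD_FLAG() entry in sd_flags.h ... */
		__SD_FLAG_CNT,
	};

	enum {
		SD_BALANCE_NEWIDLE	= 1 << __SD_BALANCE_NEWIDLE,	/* 0x0001 */
		SD_BALANCE_EXEC		= 1 << __SD_BALANCE_EXEC,	/* 0x0002 */
		SD_BALANCE_FORK		= 1 << __SD_BALANCE_FORK,	/* 0x0004 */
		/* ... */
	};

So the flag values keep the same shape as the hand-written 0x0001/0x0002/... constants removed above, but the canonical list of flags (and their metaflags) now lives in a single place.
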
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 75ac7f8..06db098 100644
@@ -974,7 +974,7 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
                        const char __user *const __user *argv,
                        const char __user *const __user *envp, int flags);
 asmlinkage long sys_userfaultfd(int flags);
-asmlinkage long sys_membarrier(int cmd, int flags);
+asmlinkage long sys_membarrier(int cmd, unsigned int flags, int cpu_id);
 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
                                    int fd_out, loff_t __user *off_out,
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fec25b9..c96a433 100644
@@ -630,6 +630,10 @@ DECLARE_TRACE(pelt_se_tp,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));
 
+DECLARE_TRACE(sched_cpu_capacity_tp,
+       TP_PROTO(struct rq *rq),
+       TP_ARGS(rq));
+
 DECLARE_TRACE(sched_overutilized_tp,
        TP_PROTO(struct root_domain *rd, bool overutilized),
        TP_ARGS(rd, overutilized));
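
Like the existing pelt_*_tp tracepoints, sched_cpu_capacity_tp is a bare DECLARE_TRACE() tracepoint with no associated trace event: it is meant to be attached to from a (typically out-of-tree) module, using the sched_trace_rq_*() accessors since struct rq is private to kernel/sched/. A minimal, hypothetical consumer module could look roughly like this (names and structure are illustrative, not part of the patch):

	#include <linux/module.h>
	#include <linux/sched.h>
	#include <trace/events/sched.h>

	/* Probe signature: a void *cookie first, then the TP_PROTO() arguments. */
	static void probe_cpu_capacity(void *data, struct rq *rq)
	{
		trace_printk("cpu=%d capacity=%d\n",
			     sched_trace_rq_cpu(rq),
			     sched_trace_rq_cpu_capacity(rq));
	}

	static int __init cap_probe_init(void)
	{
		return register_trace_sched_cpu_capacity_tp(probe_cpu_capacity, NULL);
	}

	static void __exit cap_probe_exit(void)
	{
		unregister_trace_sched_cpu_capacity_tp(probe_cpu_capacity, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(cap_probe_init);
	module_exit(cap_probe_exit);
	MODULE_LICENSE("GPL");

The EXPORT_TRACEPOINT_SYMBOL_GPL() and EXPORT_SYMBOL_GPL() additions in kernel/sched/core.c and kernel/sched/fair.c below are what make such a module possible.
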
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 5891d76..7376058 100644
  *                          If this command is not implemented by an
  *                          architecture, -EINVAL is returned.
  *                          Returns 0 on success.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ *                          Ensure the caller thread, upon return from
+ *                          system call, that all its running thread
+ *                          siblings have any currently running rseq
+ *                          critical sections restarted if @flags
+ *                          parameter is 0; if @flags parameter is
+ *                          MEMBARRIER_CMD_FLAG_CPU,
+ *                          then this operation is performed only
+ *                          on CPU indicated by @cpu_id. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited rseq
+ *                          command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
  * @MEMBARRIER_CMD_SHARED:
  *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
  *                          header backward compatibility.
@@ -131,9 +151,15 @@ enum membarrier_cmd {
        MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED               = (1 << 4),
        MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE              = (1 << 5),
        MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE     = (1 << 6),
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ                   = (1 << 7),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ          = (1 << 8),
 
        /* Alias for header backward compatibility. */
        MEMBARRIER_CMD_SHARED                   = MEMBARRIER_CMD_GLOBAL,
 };
 
+enum membarrier_cmd_flag {
+       MEMBARRIER_CMD_FLAG_CPU         = (1 << 0),
+};
+
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
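
Combined with the sys_membarrier() prototype change in include/linux/syscalls.h above, user space drives the new command in two steps: register intent once per process, then issue targeted (or process-wide) rseq fences. A minimal sketch, mirroring what the rseq selftest below does (error handling trimmed, not part of the patch):

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int sys_membarrier(int cmd, unsigned int flags, int cpu_id)
	{
		return syscall(__NR_membarrier, cmd, flags, cpu_id);
	}

	int main(void)
	{
		/* Without this registration the RSEQ command returns -EPERM. */
		if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
			return 1;

		/* Restart any rseq critical section of this process running on CPU 3. */
		if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
				   MEMBARRIER_CMD_FLAG_CPU, 3))
			return 1;

		/* With flags == 0, every CPU running a thread of this process is targeted. */
		return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0);
	}
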
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d95dc3..8160ab5 100644
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
        return clamp_value / UCLAMP_BUCKET_DELTA;
 }
 
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
-       return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
-}
-
 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
 {
        if (clamp_id == UCLAMP_MIN)
@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
+       unsigned int task_flags;
+
        if (!tsk->state)
                return;
 
+       task_flags = tsk->flags;
        /*
         * If a worker went to sleep, notify and ask workqueue whether
         * it wants to wake up a task to maintain concurrency.
@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
         * in the possible wakeup of a kworker and because wq_worker_sleeping()
         * requires it.
         */
-       if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+       if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
                preempt_disable();
-               if (tsk->flags & PF_WQ_WORKER)
+               if (task_flags & PF_WQ_WORKER)
                        wq_worker_sleeping(tsk);
                else
                        io_wq_worker_sleeping(tsk);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28..6d93f45 100644
@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
         */
        if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
                pi_se = &pi_task->dl;
+               /*
+                * Because of delays in the detection of the overrun of a
+                * thread's runtime, it might be the case that a thread
+                * goes to sleep in a rt mutex with negative runtime. As
+                * a consequence, the thread will be throttled.
+                *
+                * While waiting for the mutex, this thread can also be
+                * boosted via PI, resulting in a thread that is throttled
+                * and boosted at the same time.
+                *
+                * In this case, the boost overrides the throttle.
+                */
+               if (p->dl.dl_throttled) {
+                       /*
+                        * The replenish timer needs to be canceled. No
+                        * problem if it fires concurrently: boosted threads
+                        * are ignored in dl_task_timer().
+                        */
+                       hrtimer_try_to_cancel(&p->dl.dl_timer);
+                       p->dl.dl_throttled = 0;
+               }
        } else if (!dl_prio(p->normal_prio)) {
                /*
-                * Special case in which we have a !SCHED_DEADLINE task
-                * that is going to be deboosted, but exceeds its
-                * runtime while doing so. No point in replenishing
-                * it, as it's going to return back to its original
-                * scheduling class after this.
+                * Special case in which we have a !SCHED_DEADLINE task that is going
+                * to be deboosted, but exceeds its runtime while doing so. No point in
+                * replenishing it, as it's going to return back to its original
+                * scheduling class after this. If it has been throttled, we need to
+                * clear the flag, otherwise the task may wake up as throttled after
+                * being boosted again with no means to replenish the runtime and clear
+                * the throttle.
                 */
+               p->dl.dl_throttled = 0;
                BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
                return;
        }
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c5426..0655524 100644
@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
        entry->proc_handler = proc_handler;
 }
 
+static int sd_ctl_doflags(struct ctl_table *table, int write,
+                         void *buffer, size_t *lenp, loff_t *ppos)
+{
+       unsigned long flags = *(unsigned long *)table->data;
+       size_t data_size = 0;
+       size_t len = 0;
+       char *tmp;
+       int idx;
+
+       if (write)
+               return 0;
+
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               char *name = sd_flag_debug[idx].name;
+
+               /* Name plus whitespace */
+               data_size += strlen(name) + 1;
+       }
+
+       if (*ppos > data_size) {
+               *lenp = 0;
+               return 0;
+       }
+
+       tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               char *name = sd_flag_debug[idx].name;
+
+               len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
+       }
+
+       tmp += *ppos;
+       len -= *ppos;
+
+       if (len > *lenp)
+               len = *lenp;
+       if (len)
+               memcpy(buffer, tmp, len);
+       if (len < *lenp) {
+               ((char *)buffer)[len] = '\n';
+               len++;
+       }
+
+       *lenp = len;
+       *ppos += len;
+
+       kfree(tmp);
+
+       return 0;
+}
+
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
        set_table_entry(&table[2], "busy_factor",         &sd->busy_factor,         sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[3], "imbalance_pct",       &sd->imbalance_pct,       sizeof(int),  0644, proc_dointvec_minmax);
        set_table_entry(&table[4], "cache_nice_tries",    &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, proc_dointvec_minmax);
+       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, sd_ctl_doflags);
        set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
        set_table_entry(&table[7], "name",                sd->name,            CORENAME_MAX_SIZE, 0444, proc_dostring);
        /* &table[8] is terminator */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a05..aa4c622 100644
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 void post_init_entity_util_avg(struct task_struct *p)
 {
 }
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1504,6 +1504,7 @@ enum numa_type {
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
        unsigned long load;
+       unsigned long runnable;
        unsigned long util;
        /* Total compute capacity of CPUs on a node */
        unsigned long compute_capacity;
@@ -1547,19 +1548,22 @@ struct task_numa_env {
 };
 
 static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
                         struct numa_stats *ns)
 {
        if ((ns->nr_running > ns->weight) &&
-           ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+           (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+            ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
                return node_overloaded;
 
        if ((ns->nr_running < ns->weight) ||
-           ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+           (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+            ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
                return node_has_spare;
 
        return node_fully_busy;
@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
                struct rq *rq = cpu_rq(cpu);
 
                ns->load += cpu_load(rq);
+               ns->runnable += cpu_runnable(rq);
                ns->util += cpu_util(cpu);
                ns->nr_running += rq->cfs.h_nr_running;
                ns->compute_capacity += capacity_of(cpu);
@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
                src_running = env->src_stats.nr_running - 1;
                dst_running = env->dst_stats.nr_running + 1;
                imbalance = max(0, dst_running - src_running);
-               imbalance = adjust_numa_imbalance(imbalance, src_running);
+               imbalance = adjust_numa_imbalance(imbalance, dst_running);
 
                /* Use idle CPU if there is no imbalance */
                if (!imbalance) {
@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                /* commit outstanding execution time */
                if (cfs_rq->curr == se)
                        update_curr(cfs_rq);
-               account_entity_dequeue(cfs_rq, se);
+               update_load_sub(&cfs_rq->load, se->load.weight);
        }
        dequeue_load_avg(cfs_rq, se);
 
@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
        enqueue_load_avg(cfs_rq, se);
        if (se->on_rq)
-               account_entity_enqueue(cfs_rq, se);
+               update_load_add(&cfs_rq->load, se->load.weight);
 
 }
 
@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 /**
  * update_tg_load_avg - update the tg's load avg
  * @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
  *
  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
  * However, because tg->load_avg is a global value there are performance
@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
  *
  * Updating tg's load_avg is necessary before update_cfs_share().
  */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
        if (cfs_rq->tg == &root_task_group)
                return;
 
-       if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+       if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }
@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                 * IOW we're enqueueing a task on a new CPU.
                 */
                attach_entity_load_avg(cfs_rq, se);
-               update_tg_load_avg(cfs_rq, 0);
+               update_tg_load_avg(cfs_rq);
 
        } else if (decayed) {
                cfs_rq_util_change(cfs_rq, 0);
 
                if (flags & UPDATE_TG)
-                       update_tg_load_avg(cfs_rq, 0);
+                       update_tg_load_avg(cfs_rq);
        }
 }
 
@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                        se = second;
        }
 
-       /*
-        * Prefer last buddy, try to return the CPU to a preempted task.
-        */
-       if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
-               se = cfs_rq->last;
-
-       /*
-        * Someone really wants this to run. If it's not unfair, run it.
-        */
-       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+               /*
+                * Someone really wants this to run. If it's not unfair, run it.
+                */
                se = cfs_rq->next;
+       } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+               /*
+                * Prefer last buddy, try to return the CPU to a preempted task.
+                */
+               se = cfs_rq->last;
+       }
 
        clear_buddies(cfs_rq, se);
 
@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 /*
  * Scan the local SMT mask for idle CPUs.
  */
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
        int cpu;
 
@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
                return -1;
 
        for_each_cpu(cpu, cpu_smt_mask(target)) {
-               if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+               if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+                   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
                        continue;
                if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                        return cpu;
@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
        return -1;
 }
 
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
        return -1;
 }
@@ -6274,7 +6279,7 @@ symmetric:
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
-       i = select_idle_smt(p, target);
+       i = select_idle_smt(p, sd, target);
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
                        util = cpu_util_next(cpu, p, cpu);
                        cpu_cap = capacity_of(cpu);
-                       spare_cap = cpu_cap - util;
+                       spare_cap = cpu_cap;
+                       lsub_positive(&spare_cap, util);
 
                        /*
                         * Skip CPUs that cannot satisfy the capacity request.
@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
        if (unlikely(task_has_idle_policy(p)))
                return 0;
 
+       /* SMT siblings share cache */
+       if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+               return 0;
+
        /*
         * Buddy candidates are cache hot:
         */
@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
                         * scheduler fails to find a good waiting task to
                         * migrate.
                         */
-                       if (load/2 > env->imbalance &&
-                           env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+
+                       if ((load >> env->sd->nr_balance_failed) > env->imbalance)
                                goto next;
 
                        env->imbalance -= load;
@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
                struct sched_entity *se;
 
                if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
-                       update_tg_load_avg(cfs_rq, 0);
+                       update_tg_load_avg(cfs_rq);
 
                        if (cfs_rq == &rq->cfs)
                                decayed = true;
@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                capacity = 1;
 
        cpu_rq(cpu)->cpu_capacity = capacity;
+       trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
        sdg->sgc->capacity = capacity;
        sdg->sgc->min_capacity = capacity;
        sdg->sgc->max_capacity = capacity;
@@ -8957,7 +8969,7 @@ next_group:
        }
 }
 
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
 {
        unsigned int imbalance_min;
 
@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
         * tasks that remain local when the source domain is almost idle.
         */
        imbalance_min = 2;
-       if (src_nr_running <= imbalance_min)
+       if (nr_running <= imbalance_min)
                return 0;
 
        return imbalance;
@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 
        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);
+
+       /*
+        * Reduce likelihood of busy balancing at higher domains racing with
+        * balancing at lower domains by preventing their balancing periods
+        * from being multiples of each other.
+        */
+       if (cpu_busy)
+               interval -= 1;
+
        interval = clamp(interval, 1UL, max_load_balance_interval);
 
        return interval;
@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
        /* Catch up with the cfs_rq and remove our load when we leave */
        update_load_avg(cfs_rq, se, 0);
        detach_entity_load_avg(cfs_rq, se);
-       update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq);
        propagate_entity_cfs_rq(se);
 }
 
@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
        /* Synchronize entity with its cfs_rq */
        update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
        attach_entity_load_avg(cfs_rq, se);
-       update_tg_load_avg(cfs_rq, false);
+       update_tg_load_avg(cfs_rq);
        propagate_entity_cfs_rq(se);
 }
 
@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
 }
 EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
 
+int sched_trace_rq_cpu_capacity(struct rq *rq)
+{
+       return rq ?
+#ifdef CONFIG_SMP
+               rq->cpu_capacity
+#else
+               SCHED_CAPACITY_SCALE
+#endif
+               : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
+
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
 {
 #ifdef CONFIG_SMP
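
One detail in the detach_tasks() hunk above deserves a note: the old "load/2 > env->imbalance" cutoff (additionally gated by cache_nice_tries) is replaced by "(load >> env->sd->nr_balance_failed) > env->imbalance", so every failed balance attempt doubles the per-task load the balancer is willing to migrate. A hypothetical, self-contained illustration of the new rule (not from the patch):

	/*
	 * With imbalance == 1024:
	 *   nr_balance_failed == 0 -> tasks with load above 1024 are skipped
	 *   nr_balance_failed == 1 -> tasks with load above roughly 2048 are skipped
	 *   nr_balance_failed == 2 -> tasks with load above roughly 4096 are skipped
	 */
	static inline int lb_would_skip_task(unsigned long load,
					     unsigned long imbalance,
					     unsigned int nr_balance_failed)
	{
		return (load >> nr_balance_failed) > imbalance;
	}
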
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd9..68d369c 100644
@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
 SCHED_FEAT(RT_PUSH_IPI, true)
 #endif
 
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a..e23e74d 100644
 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
 #endif
 
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK          \
+       (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ                  \
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+#else
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK  0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK                                         \
        (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
        | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
        smp_mb();       /* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_rseq(void *info)
+{
+       rseq_preempt(current);
+}
+
 static void ipi_sync_rq_state(void *info)
 {
        struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
        return 0;
 }
 
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
 {
-       int cpu;
        cpumask_var_t tmpmask;
        struct mm_struct *mm = current->mm;
+       smp_call_func_t ipi_func = ipi_mb;
 
-       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+       if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
+       } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+               if (!IS_ENABLED(CONFIG_RSEQ))
+                       return -EINVAL;
+               if (!(atomic_read(&mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+                       return -EPERM;
+               ipi_func = ipi_rseq;
        } else {
+               WARN_ON_ONCE(flags);
                if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
                        return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
         */
        smp_mb();       /* system call entry is not a mb. */
 
-       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+       if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;
 
        cpus_read_lock();
-       rcu_read_lock();
-       for_each_online_cpu(cpu) {
+
+       if (cpu_id >= 0) {
                struct task_struct *p;
 
-               /*
-                * Skipping the current CPU is OK even through we can be
-                * migrated at any point. The current CPU, at the point
-                * where we read raw_smp_processor_id(), is ensured to
-                * be in program order with respect to the caller
-                * thread. Therefore, we can skip this CPU from the
-                * iteration.
-                */
-               if (cpu == raw_smp_processor_id())
-                       continue;
-               p = rcu_dereference(cpu_rq(cpu)->curr);
-               if (p && p->mm == mm)
-                       __cpumask_set_cpu(cpu, tmpmask);
+               if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+                       goto out;
+               if (cpu_id == raw_smp_processor_id())
+                       goto out;
+               rcu_read_lock();
+               p = rcu_dereference(cpu_rq(cpu_id)->curr);
+               if (!p || p->mm != mm) {
+                       rcu_read_unlock();
+                       goto out;
+               }
+               rcu_read_unlock();
+       } else {
+               int cpu;
+
+               rcu_read_lock();
+               for_each_online_cpu(cpu) {
+                       struct task_struct *p;
+
+                       /*
+                        * Skipping the current CPU is OK even through we can be
+                        * migrated at any point. The current CPU, at the point
+                        * where we read raw_smp_processor_id(), is ensured to
+                        * be in program order with respect to the caller
+                        * thread. Therefore, we can skip this CPU from the
+                        * iteration.
+                        */
+                       if (cpu == raw_smp_processor_id())
+                               continue;
+                       p = rcu_dereference(cpu_rq(cpu)->curr);
+                       if (p && p->mm == mm)
+                               __cpumask_set_cpu(cpu, tmpmask);
+               }
+               rcu_read_unlock();
        }
-       rcu_read_unlock();
 
        preempt_disable();
-       smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+       if (cpu_id >= 0)
+               smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+       else
+               smp_call_function_many(tmpmask, ipi_func, NULL, 1);
        preempt_enable();
 
-       free_cpumask_var(tmpmask);
+out:
+       if (cpu_id < 0)
+               free_cpumask_var(tmpmask);
        cpus_read_unlock();
 
        /*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
            set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
            ret;
 
-       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+       if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
                ready_state =
                        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+       } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+               if (!IS_ENABLED(CONFIG_RSEQ))
+                       return -EINVAL;
+               ready_state =
+                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+       } else {
+               WARN_ON_ONCE(flags);
        }
 
        /*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
                return 0;
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
                set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+       if (flags & MEMBARRIER_FLAG_RSEQ)
+               set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
        atomic_or(set_state, &mm->membarrier_state);
        ret = sync_runqueues_membarrier_state(mm);
        if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd:    Takes command values defined in enum membarrier_cmd.
+ * @flags:  Currently needs to be 0 for all commands other than
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ *          contains the CPU on which to interrupt (= restart)
+ *          the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ *          RSEQ CS should be interrupted (@cmd must be
+ *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
  *
  * If this system call is not implemented, -ENOSYS is returned. If the
  * command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
  *        smp_mb()           X           O            O
  *        sys_membarrier()   O           O            O
  */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
 {
-       if (unlikely(flags))
-               return -EINVAL;
+       switch (cmd) {
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+               if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+                       return -EINVAL;
+               break;
+       default:
+               if (unlikely(flags))
+                       return -EINVAL;
+       }
+
+       if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+               cpu_id = -1;
+
        switch (cmd) {
        case MEMBARRIER_CMD_QUERY:
        {
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
        case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
                return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-               return membarrier_private_expedited(0);
+               return membarrier_private_expedited(0, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
                return membarrier_register_private_expedited(0);
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
-               return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+               return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
                return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+               return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+       case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+               return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
        default:
                return -EINVAL;
        }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1bd7e3a..dd77702 100644
@@ -25,10 +25,18 @@ static inline bool sched_debug(void)
        return sched_debug_enabled;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  struct cpumask *groupmask)
 {
        struct sched_group *group = sd->groups;
+       unsigned long flags = sd->flags;
+       unsigned int idx;
 
        cpumask_clear(groupmask);
 
@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
        }
 
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               unsigned int flag = BIT(idx);
+               unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+               if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+                   !(sd->child->flags & flag))
+                       printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+                              sd_flag_debug[idx].name);
+
+               if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+                   !(sd->parent->flags & flag))
+                       printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+                              sd_flag_debug[idx].name);
+       }
+
        printk(KERN_DEBUG "%*s groups:", level + 1, "");
        do {
                if (!group) {
@@ -137,22 +160,22 @@ static inline bool sched_debug(void)
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
        if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
 
        /* Following flags need at least 2 groups */
-       if (sd->flags & (SD_BALANCE_NEWIDLE |
-                        SD_BALANCE_FORK |
-                        SD_BALANCE_EXEC |
-                        SD_SHARE_CPUCAPACITY |
-                        SD_ASYM_CPUCAPACITY |
-                        SD_SHARE_PKG_RESOURCES |
-                        SD_SHARE_POWERDOMAIN)) {
-               if (sd->groups != sd->groups->next)
-                       return 0;
-       }
+       if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+           (sd->groups != sd->groups->next))
+               return 0;
 
        /* Following flags don't use groups */
        if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                return 0;
 
        /* Flags needing groups don't count if only 1 group in parent */
-       if (parent->groups == parent->groups->next) {
-               pflags &= ~(SD_BALANCE_NEWIDLE |
-                           SD_BALANCE_FORK |
-                           SD_BALANCE_EXEC |
-                           SD_ASYM_CPUCAPACITY |
-                           SD_SHARE_CPUCAPACITY |
-                           SD_SHARE_PKG_RESOURCES |
-                           SD_PREFER_SIBLING |
-                           SD_SHARE_POWERDOMAIN);
-               if (nr_node_ids == 1)
-                       pflags &= ~SD_SERIALIZE;
-       }
+       if (parent->groups == parent->groups->next)
+               pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
        if (~cflags & pflags)
                return 0;
 
@@ -1292,7 +1306,6 @@ int __read_mostly         node_reclaim_distance = RECLAIM_DISTANCE;
  *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
  *   SD_SHARE_PKG_RESOURCES - describes shared caches
  *   SD_NUMA                - describes NUMA topologies
- *   SD_SHARE_POWERDOMAIN   - describes shared power domain
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1303,8 +1316,7 @@ int __read_mostly         node_reclaim_distance = RECLAIM_DISTANCE;
        (SD_SHARE_CPUCAPACITY   |       \
         SD_SHARE_PKG_RESOURCES |       \
         SD_NUMA                |       \
-        SD_ASYM_PACKING        |       \
-        SD_SHARE_POWERDOMAIN)
+        SD_ASYM_PACKING)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
        *sd = (struct sched_domain){
                .min_interval           = sd_weight,
                .max_interval           = 2*sd_weight,
-               .busy_factor            = 32,
-               .imbalance_pct          = 125,
+               .busy_factor            = 16,
+               .imbalance_pct          = 117,
 
                .cache_nice_tries       = 0,
 
@@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
        /* Set up domains for CPUs specified by the cpu_map: */
        for_each_cpu(i, cpu_map) {
                struct sched_domain_topology_level *tl;
+               int dflags = 0;
 
                sd = NULL;
                for_each_sd_topology(tl) {
-                       int dflags = 0;
-
                        if (tl == tl_asym) {
                                dflags |= SD_ASYM_CPUCAPACITY;
                                has_asym = true;
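
The SD_DEGENERATE_GROUPS_MASK generation near the top of this file's hunks reuses the sd_flags.h x-macro with yet another SD_FLAG() definition: each flag contributes its bit multiplied by 1 if its metaflags contain SDF_NEEDS_GROUPS, by 0 otherwise. Hand-expanded for illustration (not literal preprocessor output), it amounts to:

	/* Illustrative hand-expansion of SD_DEGENERATE_GROUPS_MASK. */
	static const unsigned int SD_DEGENERATE_GROUPS_MASK =
		(SD_BALANCE_NEWIDLE     * 1) |
		(SD_BALANCE_EXEC        * 1) |
		(SD_BALANCE_FORK        * 1) |
		(SD_BALANCE_WAKE        * 1) |
		(SD_WAKE_AFFINE         * 0) |	/* no SDF_NEEDS_GROUPS: drops out */
		(SD_ASYM_CPUCAPACITY    * 1) |
		(SD_SHARE_CPUCAPACITY   * 1) |
		(SD_SHARE_PKG_RESOURCES * 1) |
		(SD_SERIALIZE           * 1) |
		(SD_ASYM_PACKING        * 1) |
		(SD_PREFER_SIBLING      * 1) |
		(SD_OVERLAP             * 1) |
		(SD_NUMA                * 1) |
		0;

which is why sd_degenerate() and sd_parent_degenerate() above no longer have to enumerate the "needs at least 2 groups" flags by hand.
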
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
index e8a657a..3845890 100644
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: LGPL-2.1
 #define _GNU_SOURCE
 #include <assert.h>
+#include <linux/membarrier.h>
 #include <pthread.h>
 #include <sched.h>
+#include <stdatomic.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -1131,6 +1133,220 @@ static int set_signal_handler(void)
        return ret;
 }
 
+struct test_membarrier_thread_args {
+       int stop;
+       intptr_t percpu_list_ptr;
+};
+
+/* Worker threads modify data in their "active" percpu lists. */
+void *test_membarrier_worker_thread(void *arg)
+{
+       struct test_membarrier_thread_args *args =
+               (struct test_membarrier_thread_args *)arg;
+       const int iters = opt_reps;
+       int i;
+
+       if (rseq_register_current_thread()) {
+               fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+
+       /* Wait for initialization. */
+       while (!atomic_load(&args->percpu_list_ptr)) {}
+
+       for (i = 0; i < iters; ++i) {
+               int ret;
+
+               do {
+                       int cpu = rseq_cpu_start();
+
+                       ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
+                               sizeof(struct percpu_list_entry) * cpu, 1, cpu);
+               } while (rseq_unlikely(ret));
+       }
+
+       if (rseq_unregister_current_thread()) {
+               fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+       return NULL;
+}
+
+void test_membarrier_init_percpu_list(struct percpu_list *list)
+{
+       int i;
+
+       memset(list, 0, sizeof(*list));
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_list_node *node;
+
+               node = malloc(sizeof(*node));
+               assert(node);
+               node->data = 0;
+               node->next = NULL;
+               list->c[i].head = node;
+       }
+}
+
+void test_membarrier_free_percpu_list(struct percpu_list *list)
+{
+       int i;
+
+       for (i = 0; i < CPU_SETSIZE; i++)
+               free(list->c[i].head);
+}
+
+static int sys_membarrier(int cmd, int flags, int cpu_id)
+{
+       return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+
+/*
+ * The manager thread swaps per-cpu lists that worker threads see,
+ * and validates that there are no unexpected modifications.
+ */
+void *test_membarrier_manager_thread(void *arg)
+{
+       struct test_membarrier_thread_args *args =
+               (struct test_membarrier_thread_args *)arg;
+       struct percpu_list list_a, list_b;
+       intptr_t expect_a = 0, expect_b = 0;
+       int cpu_a = 0, cpu_b = 0;
+
+       if (rseq_register_current_thread()) {
+               fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+
+       /* Init lists. */
+       test_membarrier_init_percpu_list(&list_a);
+       test_membarrier_init_percpu_list(&list_b);
+
+       atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+
+       while (!atomic_load(&args->stop)) {
+               /* list_a is "active". */
+               cpu_a = rand() % CPU_SETSIZE;
+               /*
+                * As list_b is "inactive", we should never see changes
+                * to list_b.
+                */
+               if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
+                       fprintf(stderr, "Membarrier test failed\n");
+                       abort();
+               }
+
+               /* Make list_b "active". */
+               atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
+               if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+                                       MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
+                               errno != ENXIO /* missing CPU */) {
+                       perror("sys_membarrier");
+                       abort();
+               }
+               /*
+                * Cpu A should now only modify list_b, so the values
+                * in list_a should be stable.
+                */
+               expect_a = atomic_load(&list_a.c[cpu_a].head->data);
+
+               cpu_b = rand() % CPU_SETSIZE;
+               /*
+                * As list_a is "inactive", we should never see changes
+                * to list_a.
+                */
+               if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
+                       fprintf(stderr, "Membarrier test failed\n");
+                       abort();
+               }
+
+               /* Make list_a "active". */
+               atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+               if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+                                       MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
+                               errno != ENXIO /* missing CPU*/) {
+                       perror("sys_membarrier");
+                       abort();
+               }
+               /* Remember a value from list_b. */
+               expect_b = atomic_load(&list_b.c[cpu_b].head->data);
+       }
+
+       test_membarrier_free_percpu_list(&list_a);
+       test_membarrier_free_percpu_list(&list_b);
+
+       if (rseq_unregister_current_thread()) {
+               fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+       return NULL;
+}
+
+/* Test MEMBARRIER_CMD_PRIVATE_RESTART_RSEQ_ON_CPU membarrier command. */
+#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+void test_membarrier(void)
+{
+       const int num_threads = opt_threads;
+       struct test_membarrier_thread_args thread_args;
+       pthread_t worker_threads[num_threads];
+       pthread_t manager_thread;
+       int i, ret;
+
+       if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
+               perror("sys_membarrier");
+               abort();
+       }
+
+       thread_args.stop = 0;
+       thread_args.percpu_list_ptr = 0;
+       ret = pthread_create(&manager_thread, NULL,
+                       test_membarrier_manager_thread, &thread_args);
+       if (ret) {
+               errno = ret;
+               perror("pthread_create");
+               abort();
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_create(&worker_threads[i], NULL,
+                               test_membarrier_worker_thread, &thread_args);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(worker_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       atomic_store(&thread_args.stop, 1);
+       ret = pthread_join(manager_thread, NULL);
+       if (ret) {
+               errno = ret;
+               perror("pthread_join");
+               abort();
+       }
+}
+#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
+void test_membarrier(void)
+{
+       fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
+                       "Skipping membarrier test.\n");
+}
+#endif
+
 static void show_usage(int argc, char **argv)
 {
        printf("Usage : %s <OPTIONS>\n",
@@ -1153,7 +1369,7 @@ static void show_usage(int argc, char **argv)
        printf("        [-r N] Number of repetitions per thread (default 5000)\n");
        printf("        [-d] Disable rseq system call (no initialization)\n");
        printf("        [-D M] Disable rseq for each M threads\n");
-       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
        printf("        [-M] Push into buffer and memcpy buffer with memory barriers.\n");
        printf("        [-v] Verbose output.\n");
        printf("        [-h] Show this help.\n");
@@ -1268,6 +1484,7 @@ int main(int argc, char **argv)
                        case 'i':
                        case 'b':
                        case 'm':
+                       case 'r':
                                break;
                        default:
                                show_usage(argc, argv);
@@ -1320,6 +1537,10 @@ int main(int argc, char **argv)
                printf_verbose("counter increment\n");
                test_percpu_inc();
                break;
+       case 'r':
+               printf_verbose("membarrier\n");
+               test_membarrier();
+               break;
        }
        if (!opt_disable_rseq && rseq_unregister_current_thread())
                abort();
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
index b2da600..6404115 100644
@@ -279,6 +279,63 @@ error1:
 #endif
 }
 
+#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+
+/*
+ *   pval = *(ptr+off)
+ *  *pval += inc;
+ */
+static inline __attribute__((always_inline))
+int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
+{
+       RSEQ_INJECT_C(9)
+
+       __asm__ __volatile__ goto (
+               RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+               /* Start rseq by storing table entry pointer into rseq_cs. */
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
+               RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
+#endif
+               /* get p+v */
+               "movq %[ptr], %%rbx\n\t"
+               "addq %[off], %%rbx\n\t"
+               /* get pv */
+               "movq (%%rbx), %%rcx\n\t"
+               /* *pv += inc */
+               "addq %[inc], (%%rcx)\n\t"
+               "2:\n\t"
+               RSEQ_INJECT_ASM(4)
+               RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+               : /* gcc asm goto does not allow outputs */
+               : [cpu_id]              "r" (cpu),
+                 [rseq_abi]            "r" (&__rseq_abi),
+                 /* final store input */
+                 [ptr]                 "m" (*ptr),
+                 [off]                 "er" (off),
+                 [inc]                 "er" (inc)
+               : "memory", "cc", "rax", "rbx", "rcx"
+                 RSEQ_INJECT_CLOBBER
+               : abort
+#ifdef RSEQ_COMPARE_TWICE
+                 , error1
+#endif
+       );
+       return 0;
+abort:
+       RSEQ_INJECT_FAILED
+       return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+       rseq_bug("cpu_id comparison failed");
+#endif
+}
+
 static inline __attribute__((always_inline))
 int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
                                 intptr_t *v2, intptr_t newv2,
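
Stripped of the rseq machinery, the fast path of rseq_offset_deref_addv() above is just a dereference plus an increment; the value of wrapping it in an rseq critical section is that preemption, migration off @cpu or signal delivery aborts it, and the new membarrier command can force such an abort remotely. An illustrative plain-C rendering of the asm (the helper name is made up for this sketch, it is not part of the patch):

	static inline void offset_deref_addv_plain(intptr_t *ptr, off_t off,
						   intptr_t inc)
	{
		intptr_t base = *ptr;				/* movq %[ptr], %%rbx                   */
		intptr_t *pval = *(intptr_t **)(base + off);	/* addq %[off], %%rbx; movq (%%rbx), %%rcx */
		*pval += inc;					/* addq %[inc], (%%rcx)                 */
	}

In the param_test.c worker above, base is the currently "active" per-CPU list published through args->percpu_list_ptr, and the increment lands in list->c[cpu].head->data — exactly the accesses the manager thread fences with MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ before validating the "inactive" list.
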
diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
index e426304..f51bc83 100755
@@ -15,6 +15,7 @@ TEST_LIST=(
        "-T m"
        "-T m -M"
        "-T i"
+       "-T r"
 )
 
 TEST_NAME=(
@@ -25,6 +26,7 @@ TEST_NAME=(
        "memcpy"
        "memcpy with barrier"
        "increment"
+       "membarrier"
 )
 IFS="$OLDIFS"