[PATCH] sched: new sched domain for representing multi-core

author Siddha, Suresh B <suresh.b.siddha@intel.com>

Mon, 27 Mar 2006 09:15:22 +0000 (01:15 -0800)

committer Linus Torvalds <torvalds@g5.osdl.org>

Mon, 27 Mar 2006 16:44:43 +0000 (08:44 -0800)
author Siddha, Suresh B <suresh.b.siddha@intel.com>
Mon, 27 Mar 2006 09:15:22 +0000 (01:15 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Mon, 27 Mar 2006 16:44:43 +0000 (08:44 -0800)
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig

index f7db71d..f17bd1d 100644 (file)
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
           cost of slightly increased overhead in some places. If unsure say
           N here.
  
+config SCHED_MC
+       bool "Multi-core scheduler support"
+       depends on SMP
+       default y
+       help
+         Multi-core scheduler support improves the CPU scheduler's decision
+         making when dealing with multi-core CPU chips at a cost of slightly
+         increased overhead in some places. If unsure say N here.
+
  source "kernel/Kconfig.preempt"
  
  config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c

index 7e3d6b6..a06a490 100644 (file)
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
  void __cpuinit generic_identify(struct cpuinfo_x86 * c)
  {
         u32 tfms, xlvl;
-       int junk;
+       int ebx;
  
         if (have_cpuid_p()) {
                 /* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
                 /* Intel-defined flags: level 0x00000001 */
                 if ( c->cpuid_level >= 0x00000001 ) {
                         u32 capability, excap;
-                       cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+                       cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
                         c->x86_capability[0] = capability;
                         c->x86_capability[4] = excap;
                         c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
                         if (c->x86 >= 0x6)
                                 c->x86_model += ((tfms >> 16) & 0xF) << 4;
                         c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+                       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+                       c->apicid = (ebx >> 24) & 0xFF;
+#endif
                 } else {
                         /* Have CPUID level 0 only - unheard of */
                         c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
  
         cpuid(1, &eax, &ebx, &ecx, &edx);
  
-       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
  
         if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                 return;
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c

index ce61921..7e7fd4e 100644 (file)
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
         unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
         unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
         unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+       unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+       unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
  
         if (c->cpuid_level > 3) {
                 static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
                                         break;
                                     case 2:
                                         new_l2 = this_leaf.size/1024;
+                                       num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+                                       index_msb = get_count_order(num_threads_sharing);
+                                       l2_id = c->apicid >> index_msb;
                                         break;
                                     case 3:
                                         new_l3 = this_leaf.size/1024;
+                                       num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+                                       index_msb = get_count_order(num_threads_sharing);
+                                       l3_id = c->apicid >> index_msb;
                                         break;
                                     default:
                                         break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
                 if (new_l1i)
                         l1i = new_l1i;
  
-               if (new_l2)
+               if (new_l2) {
                         l2 = new_l2;
+#ifdef CONFIG_SMP
+                       cpu_llc_id[cpu] = l2_id;
+#endif
+               }
  
-               if (new_l3)
+               if (new_l3) {
                         l3 = new_l3;
+#ifdef CONFIG_SMP
+                       cpu_llc_id[cpu] = l3_id;
+#endif
+               }
  
                 if ( trace )
                         printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c

index 82371d8..a696990 100644 (file)
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
  /* Core ID of each logical CPU */
  int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
  
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
  /* representing HT siblings of each logical CPU */
  cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
  EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
  
  static int cpucount;
  
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return last level cache shared map.
+        * TBD: when power saving sched policy is added, we will return
+        *      cpu_core_map when power saving policy is enabled
+        */
+       return c->llc_shared_map;
+}
+
  /* representing cpus for which sibling maps can be computed */
  static cpumask_t cpu_sibling_setup_map;
  
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
                                 cpu_set(cpu, cpu_sibling_map[i]);
                                 cpu_set(i, cpu_core_map[cpu]);
                                 cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
                         }
                 }
         } else {
                 cpu_set(cpu, cpu_sibling_map[cpu]);
         }
  
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
         if (current_cpu_data.x86_max_cores == 1) {
                 cpu_core_map[cpu] = cpu_sibling_map[cpu];
                 c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
         }
  
         for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
                 if (phys_proc_id[cpu] == phys_proc_id[i]) {
                         cpu_set(i, cpu_core_map[cpu]);
                         cpu_set(cpu, cpu_core_map[i]);
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig

index 45efe0c..1cb4aa2 100644 (file)
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -250,6 +250,15 @@ config SCHED_SMT
           cost of slightly increased overhead in some places. If unsure say
           N here.
  
+config SCHED_MC
+       bool "Multi-core scheduler support"
+       depends on SMP
+       default y
+       help
+         Multi-core scheduler support improves the CPU scheduler's decision
+         making when dealing with multi-core CPU chips at a cost of slightly
+         increased overhead in some places. If unsure say N here.
+
  source "kernel/Kconfig.preempt"
  
  config NUMA
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c

index a57eec8..d1f3e92 100644 (file)
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
  
         cpuid(1, &eax, &ebx, &ecx, &edx);
  
-       c->apicid = phys_pkg_id(0);
  
         if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                 return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                         c->x86_capability[2] = cpuid_edx(0x80860001);
         }
  
+       c->apicid = phys_pkg_id(0);
+
         /*
          * Vendor-specific initialization.  In this section we
          * canonicalize the feature flags, meaning if there are
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c

index 66e9865..ea48fa6 100644 (file)
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
  /* core ID of each logical CPU */
  u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
  
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
+
  /* Bitmask of currently online CPUs */
  cpumask_t cpu_online_map __read_mostly;
  
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
         cpu_set(cpuid, cpu_callin_map);
  }
  
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return last level cache shared map.
+        * TBD: when power saving sched policy is added, we will return
+        *      cpu_core_map when power saving policy is enabled
+        */
+       return c->llc_shared_map;
+}
+
  /* representing cpus for which sibling maps can be computed */
  static cpumask_t cpu_sibling_setup_map;
  
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
                                 cpu_set(cpu, cpu_sibling_map[i]);
                                 cpu_set(i, cpu_core_map[cpu]);
                                 cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
                         }
                 }
         } else {
                 cpu_set(cpu, cpu_sibling_map[cpu]);
         }
  
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
         if (current_cpu_data.x86_max_cores == 1) {
                 cpu_core_map[cpu] = cpu_sibling_map[cpu];
                 c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
         }
  
         for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
                 if (phys_proc_id[cpu] == phys_proc_id[i]) {
                         cpu_set(i, cpu_core_map[cpu]);
                         cpu_set(cpu, cpu_core_map[i]);
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h

index feca5d9..af4bfd0 100644 (file)
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
  #include <linux/config.h>
  #include <linux/threads.h>
  #include <asm/percpu.h>
+#include <linux/cpumask.h>
  
  /* flag for disabling the tsc */
  extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
         char    pad0;
         int     x86_power;
         unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
+#endif
         unsigned char x86_max_cores;    /* cpuid returned max cores value */
         unsigned char booted_cores;     /* number of cores as seen by OS */
         unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
  
  extern int phys_proc_id[NR_CPUS];
  extern int cpu_core_id[NR_CPUS];
+extern int cpu_llc_id[NR_CPUS];
  extern char ignore_fpu_irq;
  
  extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h

index aa958c6..b94e5ee 100644 (file)
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
  
  #endif /* CONFIG_NUMA */
  
+extern cpumask_t cpu_coregroup_map(int cpu);
+
  #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h

index 8c8d88c..1aa2cee 100644 (file)
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
  #include <asm/mmsegment.h>
  #include <asm/percpu.h>
  #include <linux/personality.h>
+#include <linux/cpumask.h>
  
  #define TF_MASK                0x00000100
  #define IF_MASK                0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
          __u32   x86_power;     
         __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
         unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
+#endif
         __u8    apicid;
         __u8    booted_cores;   /* number of cores as seen by OS */
  } ____cacheline_aligned;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h

index 9ccbb2c..a4fdaeb 100644 (file)
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
  extern cpumask_t cpu_core_map[NR_CPUS];
  extern u8 phys_proc_id[NR_CPUS];
  extern u8 cpu_core_id[NR_CPUS];
+extern u8 cpu_llc_id[NR_CPUS];
  
  #define SMP_TRAMPOLINE_BASE 0x6000
  
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h

index c642f5d..9db54e9 100644 (file)
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
  
  #include <asm-generic/topology.h>
  
+extern cpumask_t cpu_coregroup_map(int cpu);
+
  #endif
diff --git a/include/linux/topology.h b/include/linux/topology.h

index e8eb004..a305ae2 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,15 @@
         .nr_balance_failed      = 0,                    \
  }
  
+#ifdef CONFIG_SCHED_MC
+#ifndef SD_MC_INIT
+/* for now it's the same as SD_CPU_INIT.
+ * TBD: Tune Domain parameters!
+ */
+#define SD_MC_INIT   SD_CPU_INIT
+#endif
+#endif
+
  #ifdef CONFIG_NUMA
  #ifndef SD_NODE_INIT
  #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c

index a96a05d..8a8b71b 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
  }
  #endif
  
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+       return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+       return cpu;
+}
+#endif
+
  static DEFINE_PER_CPU(struct sched_domain, phys_domains);
  static struct sched_group sched_group_phys[NR_CPUS];
  static int cpu_to_phys_group(int cpu)
  {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+       cpumask_t mask = cpu_coregroup_map(cpu);
+       return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
         return first_cpu(cpu_sibling_map[cpu]);
  #else
         return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
                 sd->parent = p;
                 sd->groups = &sched_group_phys[group];
  
+#ifdef CONFIG_SCHED_MC
+               p = sd;
+               sd = &per_cpu(core_domains, i);
+               group = cpu_to_core_group(i);
+               *sd = SD_MC_INIT;
+               sd->span = cpu_coregroup_map(i);
+               cpus_and(sd->span, sd->span, *cpu_map);
+               sd->parent = p;
+               sd->groups = &sched_group_core[group];
+#endif
+
  #ifdef CONFIG_SCHED_SMT
                 p = sd;
                 sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
         }
  #endif
  
+#ifdef CONFIG_SCHED_MC
+       /* Set up multi-core groups */
+       for_each_cpu_mask(i, *cpu_map) {
+               cpumask_t this_core_map = cpu_coregroup_map(i);
+               cpus_and(this_core_map, this_core_map, *cpu_map);
+               if (i != first_cpu(this_core_map))
+                       continue;
+               init_sched_build_groups(sched_group_core, this_core_map,
+                                       &cpu_to_core_group);
+       }
+#endif
+
+
         /* Set up physical groups */
         for (i = 0; i < MAX_NUMNODES; i++) {
                 cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
                 power = SCHED_LOAD_SCALE;
                 sd->groups->cpu_power = power;
  #endif
+#ifdef CONFIG_SCHED_MC
+               sd = &per_cpu(core_domains, i);
+               power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+                                           * SCHED_LOAD_SCALE / 10;
+               sd->groups->cpu_power = power;
+
+               sd = &per_cpu(phys_domains, i);
  
+               /*
+                * This has to be < 2 * SCHED_LOAD_SCALE
+                * Let's keep it SCHED_LOAD_SCALE, so that
+                * while calculating NUMA group's cpu_power
+                * we can simply do
+                *  numa_group->cpu_power += phys_group->cpu_power;
+                *
+                * See "only add power once for each physical pkg"
+                * comment below
+                */
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
                 sd = &per_cpu(phys_domains, i);
                 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                 (cpus_weight(sd->groups->cpumask)-1) / 10;
                 sd->groups->cpu_power = power;
+#endif
  
  #ifdef CONFIG_NUMA
                 sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
  next_sg:
                 for_each_cpu_mask(j, sg->cpumask) {
                         struct sched_domain *sd;
-                       int power;
  
                         sd = &per_cpu(phys_domains, j);
                         if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
                                  */
                                 continue;
                         }
-                       power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                               (cpus_weight(sd->groups->cpumask)-1) / 10;
  
-                       sg->cpu_power += power;
+                       sg->cpu_power += sd->groups->cpu_power;
                 }
                 sg = sg->next;
                 if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
                 struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
                 sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+               sd = &per_cpu(core_domains, i);
  #else
                 sd = &per_cpu(phys_domains, i);
  #endif
author	Siddha, Suresh B <suresh.b.siddha@intel.com>
	Mon, 27 Mar 2006 09:15:22 +0000 (01:15 -0800)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Mon, 27 Mar 2006 16:44:43 +0000 (08:44 -0800)
arch/i386/Kconfig		patch \| blob \| history
arch/i386/kernel/cpu/common.c		patch \| blob \| history
arch/i386/kernel/cpu/intel_cacheinfo.c		patch \| blob \| history
arch/i386/kernel/smpboot.c		patch \| blob \| history
arch/x86_64/Kconfig		patch \| blob \| history
arch/x86_64/kernel/setup.c		patch \| blob \| history
arch/x86_64/kernel/smpboot.c		patch \| blob \| history
include/asm-i386/processor.h		patch \| blob \| history
include/asm-i386/topology.h		patch \| blob \| history
include/asm-x86_64/processor.h		patch \| blob \| history
include/asm-x86_64/smp.h		patch \| blob \| history
include/asm-x86_64/topology.h		patch \| blob \| history
include/linux/topology.h		patch \| blob \| history
kernel/sched.c		patch \| blob \| history