mm: avoid false sharing of mm_counter

author KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Fri, 5 Mar 2010 21:41:40 +0000 (13:41 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 6 Mar 2010 19:26:24 +0000 (11:26 -0800)
author KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Fri, 5 Mar 2010 21:41:40 +0000 (13:41 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 6 Mar 2010 19:26:24 +0000 (11:26 -0800)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt

index 0d07513..e418f3d 100644 (file)
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3.  The stat file
  contains details information about the process itself.  Its fields are
  explained in Table 1-4.
  
+(for SMP CONFIG users)
+To make accounting scalable, RSS-related information is handled in an
+asynchronous manner and the value may not be very precise. To see a precise
+snapshot of a moment, you can read the /proc/<pid>/smaps file, which scans the
+page table. It's slow but very precise.
+
  Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
  ..............................................................................
   Field                       Content
diff --git a/fs/exec.c b/fs/exec.c

index cce6bbd..ea78617 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
         /* Notify parent that we're no longer interested in the old VM */
         tsk = current;
         old_mm = current->mm;
+       sync_mm_rss(tsk, old_mm);
         mm_release(tsk, old_mm);
  
         if (old_mm) {
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 2124cdb..8e580c0 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  /*
   * per-process(per-mm_struct) statistics.
   */
-#if USE_SPLIT_PTLOCKS
+#if defined(SPLIT_RSS_COUNTING)
  /*
   * The mm counters are not protected by its page_table_lock,
   * so must be incremented atomically.
@@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
         atomic_long_set(&mm->rss_stat.count[member], value);
  }
  
-static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
-{
-       return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]);
-}
+unsigned long get_mm_counter(struct mm_struct *mm, int member);
  
  static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
  {
@@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                 *maxrss = hiwater_rss;
  }
  
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
  
  /*
   * A callback you can register to apply pressure to ageable caches.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index e1ca64b..2186123 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -202,9 +202,15 @@ enum {
  };
  
  #if USE_SPLIT_PTLOCKS
+#define SPLIT_RSS_COUNTING
  struct mm_rss_stat {
         atomic_long_t count[NR_MM_COUNTERS];
  };
+/* per-thread cached information */
+struct task_rss_stat {
+       int events;     /* for synchronization threshold */
+       int count[NR_MM_COUNTERS];
+};
  #else  /* !USE_SPLIT_PTLOCKS */
  struct mm_rss_stat {
         unsigned long count[NR_MM_COUNTERS];
diff --git a/include/linux/sched.h b/include/linux/sched.h

index cbeafa4..46c6f8d 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,9 @@ struct task_struct {
         struct plist_node pushable_tasks;
  
         struct mm_struct *mm, *active_mm;
-
+#if defined(SPLIT_RSS_COUNTING)
+       struct task_rss_stat    rss_stat;
+#endif
  /* task state */
         int exit_state;
         int exit_code, exit_signal;
diff --git a/kernel/exit.c b/kernel/exit.c

index 45ed043..10d3c5d 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code)
                                 preempt_count());
  
         acct_update_integrals(tsk);
-
+       /* sync mm's RSS info before statistics gathering */
+       sync_mm_rss(tsk, tsk->mm);
         group_dead = atomic_dec_and_test(&tsk->signal->live);
         if (group_dead) {
                 hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/mm/memory.c b/mm/memory.c

index c576784..a459761 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
  core_initcall(init_zero_pfn);
  
  
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+       int i;
+
+       for (i = 0; i < NR_MM_COUNTERS; i++) {
+               if (task->rss_stat.count[i]) {
+                       add_mm_counter(mm, i, task->rss_stat.count[i]);
+                       task->rss_stat.count[i] = 0;
+               }
+       }
+       task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+       struct task_struct *task = current;
+
+       if (likely(task->mm == mm))
+               task->rss_stat.count[member] += val;
+       else
+               add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter roughly once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH (64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+       if (unlikely(task != current))
+               return;
+       if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+               __sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+       long val = 0;
+
+       /*
+        * Don't use task->mm here, to avoid having to use task_get_mm().
+        * The caller must guarantee task->mm is not invalid.
+        */
+       val = atomic_long_read(&mm->rss_stat.count[member]);
+       /*
+        * The counter is updated in an asynchronous manner and may go negative,
+        * but a negative number is never what users expect to see.
+        */
+       if (val < 0)
+               return 0;
+       return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+       __sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
  /*
   * If a p?d_bad entry is found while walking page tables, report
   * the error, before resetting entry to p?d_none.  Usually (but
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
  {
         int i;
  
+       if (current->mm == mm)
+               sync_mm_rss(current, mm);
         for (i = 0; i < NR_MM_COUNTERS; i++)
                 if (rss[i])
                         add_mm_counter(mm, i, rss[i]);
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
  
         /* Ok, finally just insert the thing.. */
         get_page(page);
-       inc_mm_counter(mm, MM_FILEPAGES);
+       inc_mm_counter_fast(mm, MM_FILEPAGES);
         page_add_file_rmap(page);
         set_pte_at(mm, addr, pte, mk_pte(page, prot));
  
@@ -2175,11 +2250,11 @@ gotten:
         if (likely(pte_same(*page_table, orig_pte))) {
                 if (old_page) {
                         if (!PageAnon(old_page)) {
-                               dec_mm_counter(mm, MM_FILEPAGES);
-                               inc_mm_counter(mm, MM_ANONPAGES);
+                               dec_mm_counter_fast(mm, MM_FILEPAGES);
+                               inc_mm_counter_fast(mm, MM_ANONPAGES);
                         }
                 } else
-                       inc_mm_counter(mm, MM_ANONPAGES);
+                       inc_mm_counter_fast(mm, MM_ANONPAGES);
                 flush_cache_page(vma, address, pte_pfn(orig_pte));
                 entry = mk_pte(new_page, vma->vm_page_prot);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * discarded at swap_free().
          */
  
-       inc_mm_counter(mm, MM_ANONPAGES);
+       inc_mm_counter_fast(mm, MM_ANONPAGES);
         pte = mk_pte(page, vma->vm_page_prot);
         if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
                 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!pte_none(*page_table))
                 goto release;
  
-       inc_mm_counter(mm, MM_ANONPAGES);
+       inc_mm_counter_fast(mm, MM_ANONPAGES);
         page_add_new_anon_rmap(page, vma, address);
  setpte:
         set_pte_at(mm, address, page_table, entry);
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 if (flags & FAULT_FLAG_WRITE)
                         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                 if (anon) {
-                       inc_mm_counter(mm, MM_ANONPAGES);
+                       inc_mm_counter_fast(mm, MM_ANONPAGES);
                         page_add_new_anon_rmap(page, vma, address);
                 } else {
-                       inc_mm_counter(mm, MM_FILEPAGES);
+                       inc_mm_counter_fast(mm, MM_FILEPAGES);
                         page_add_file_rmap(page);
                         if (flags & FAULT_FLAG_WRITE) {
                                 dirty_page = page;
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  
         count_vm_event(PGFAULT);
  
+       /* do counter updates before entering really critical section. */
+       check_sync_rss_stat(current);
+
         if (unlikely(is_vm_hugetlb_page(vma)))
                 return hugetlb_fault(mm, vma, address, flags);
author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
	Fri, 5 Mar 2010 21:41:40 +0000 (13:41 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 6 Mar 2010 19:26:24 +0000 (11:26 -0800)
Documentation/filesystems/proc.txt		patch \| blob \| history
fs/exec.c		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history