mm: delay page_remove_rmap() until after the TLB has been flushed

author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 9 Nov 2022 20:30:51 +0000 (12:30 -0800)
committer Andrew Morton <akpm@linux-foundation.org>
Wed, 30 Nov 2022 23:58:50 +0000 (15:58 -0800)
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h

index 0514222..b91f4a9 100644 (file)
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -41,6 +41,9 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
   * Release the page cache reference for a pte removed by
   * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
   * has already been freed, so just do free_page_and_swap_cache.
+ *
+ * s390 doesn't delay rmap removal, so there is nothing encoded in
+ * the page pointer.
   */
  static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
                                           struct encoded_page *page,
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h

index 54d03d1..b466172 100644 (file)
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -263,6 +263,28 @@ struct mmu_gather_batch {
  extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
                                    struct encoded_page *page,
                                    int page_size);
+
+#ifdef CONFIG_SMP
+/*
+ * This both sets 'delayed_rmap', and returns true. It would be an inline
+ * function, except we define it before the 'struct mmu_gather'.
+ */
+#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
+extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
+#endif
+
+#endif
+
+/*
+ * We have a no-op version of the rmap removal that doesn't
+ * delay anything. That is used on S390, which flushes remote
+ * TLBs synchronously, and on UP, which doesn't have any
+ * remote TLBs to flush and is not preemptible due to this
+ * all happening under the page table lock.
+ */
+#ifndef tlb_delay_rmap
+#define tlb_delay_rmap(tlb) (false)
+static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
  #endif
  
  /*
@@ -295,6 +317,11 @@ struct mmu_gather {
          */
         unsigned int            freed_tables : 1;
  
+       /*
+        * Do we have pending delayed rmap removals?
+        */
+       unsigned int            delayed_rmap : 1;
+
         /*
          * at which levels have we cleared entries?
          */
@@ -440,9 +467,9 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                 tlb_flush_mmu(tlb);
  }
  
-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags)
  {
-       return __tlb_remove_page_size(tlb, encode_page(page, 0), PAGE_SIZE);
+       return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE);
  }
  
  /* tlb_remove_page
diff --git a/mm/memory.c b/mm/memory.c

index 1749c63..6c85cba 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1374,6 +1374,8 @@ again:
                         break;
  
                 if (pte_present(ptent)) {
+                       unsigned int delay_rmap;
+
                         page = vm_normal_page(vma, addr, ptent);
                         if (unlikely(!should_zap_page(details, page)))
                                 continue;
@@ -1385,20 +1387,26 @@ again:
                         if (unlikely(!page))
                                 continue;
  
+                       delay_rmap = 0;
                         if (!PageAnon(page)) {
                                 if (pte_dirty(ptent)) {
-                                       force_flush = 1;
                                         set_page_dirty(page);
+                                       if (tlb_delay_rmap(tlb)) {
+                                               delay_rmap = 1;
+                                               force_flush = 1;
+                                       }
                                 }
                                 if (pte_young(ptent) &&
                                     likely(!(vma->vm_flags & VM_SEQ_READ)))
                                         mark_page_accessed(page);
                         }
                         rss[mm_counter(page)]--;
-                       page_remove_rmap(page, vma, false);
-                       if (unlikely(page_mapcount(page) < 0))
-                               print_bad_pte(vma, addr, ptent, page);
-                       if (unlikely(__tlb_remove_page(tlb, page))) {
+                       if (!delay_rmap) {
+                               page_remove_rmap(page, vma, false);
+                               if (unlikely(page_mapcount(page) < 0))
+                                       print_bad_pte(vma, addr, ptent, page);
+                       }
+                       if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
                                 force_flush = 1;
                                 addr += PAGE_SIZE;
                                 break;
@@ -1455,8 +1463,11 @@ again:
         arch_leave_lazy_mmu_mode();
  
         /* Do the actual TLB flush before dropping ptl */
-       if (force_flush)
+       if (force_flush) {
                 tlb_flush_mmu_tlbonly(tlb);
+               if (tlb->delayed_rmap)
+                       tlb_flush_rmaps(tlb, vma);
+       }
         pte_unmap_unlock(start_pte, ptl);
  
         /*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c

index 382581c..1de1cf9 100644 (file)
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -9,6 +9,7 @@
  #include <linux/rcupdate.h>
  #include <linux/smp.h>
  #include <linux/swap.h>
+#include <linux/rmap.h>
  
  #include <asm/pgalloc.h>
  #include <asm/tlb.h>
@@ -19,6 +20,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
  {
         struct mmu_gather_batch *batch;
  
+       /* No more batching if we have delayed rmaps pending */
+       if (tlb->delayed_rmap)
+               return false;
+
         batch = tlb->active;
         if (batch->next) {
                 tlb->active = batch->next;
@@ -43,6 +48,33 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
         return true;
  }
  
+#ifdef CONFIG_SMP
+/**
+ * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
+ * @tlb: the current mmu_gather
+ *
+ * Note that because of how tlb_next_batch() above works, we will
+ * never start new batches with pending delayed rmaps, so we only
+ * need to walk through the current active batch.
+ */
+void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+       struct mmu_gather_batch *batch;
+
+       batch = tlb->active;
+       for (int i = 0; i < batch->nr; i++) {
+               struct encoded_page *enc = batch->encoded_pages[i];
+
+               if (encoded_page_flags(enc)) {
+                       struct page *page = encoded_page_ptr(enc);
+                       page_remove_rmap(page, vma, false);
+               }
+       }
+
+       tlb->delayed_rmap = 0;
+}
+#endif
+
  static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  {
         struct mmu_gather_batch *batch;
@@ -284,6 +316,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
         tlb->active     = &tlb->local;
         tlb->batch_count = 0;
  #endif
+       tlb->delayed_rmap = 0;
  
         tlb_table_init(tlb);
  #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
Files changed:
arch/s390/include/asm/tlb.h
include/asm-generic/tlb.h
mm/memory.c
mm/mmu_gather.c