mm/mmu_gather.c

   1 #include <linux/gfp.h>
   2 #include <linux/highmem.h>
   3 #include <linux/kernel.h>
   4 #include <linux/mmdebug.h>
   5 #include <linux/mm_types.h>
   6 #include <linux/mm_inline.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/rcupdate.h>
   9 #include <linux/smp.h>
  10 #include <linux/swap.h>
  11 #include <linux/rmap.h>
  12
  13 #include <asm/pgalloc.h>
  14 #include <asm/tlb.h>
  15
  16 #ifndef CONFIG_MMU_GATHER_NO_GATHER
  17
  18 static bool tlb_next_batch(struct mmu_gather *tlb)
  19 {
  20         struct mmu_gather_batch *batch;
  21
  22         /* Limit batching if we have delayed rmaps pending */
  23         if (tlb->delayed_rmap && tlb->active != &tlb->local)
  24                 return false;
  25
  26         batch = tlb->active;
  27         if (batch->next) {
  28                 tlb->active = batch->next;
  29                 return true;
  30         }
  31
  32         if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  33                 return false;
  34
  35         batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
  36         if (!batch)
  37                 return false;
  38
  39         tlb->batch_count++;
  40         batch->next = NULL;
  41         batch->nr   = 0;
  42         batch->max  = MAX_GATHER_BATCH;
  43
  44         tlb->active->next = batch;
  45         tlb->active = batch;
  46
  47         return true;
  48 }
  49
  50 #ifdef CONFIG_SMP
  51 static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
  52 {
  53         for (int i = 0; i < batch->nr; i++) {
  54                 struct encoded_page *enc = batch->encoded_pages[i];
  55
  56                 if (encoded_page_flags(enc)) {
  57                         struct page *page = encoded_page_ptr(enc);
  58                         folio_remove_rmap_pte(page_folio(page), page, vma);
  59                 }
  60         }
  61 }
  62
  63 /**
  64  * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
  65  * @tlb: the current mmu_gather
  66  * @vma: The memory area from which the pages are being removed.
  67  *
  68  * Note that because of how tlb_next_batch() above works, we will
  69  * never start multiple new batches with pending delayed rmaps, so
  70  * we only need to walk through the current active batch and the
  71  * original local one.
  72  */
  73 void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
  74 {
  75         if (!tlb->delayed_rmap)
  76                 return;
  77
  78         tlb_flush_rmap_batch(&tlb->local, vma);
  79         if (tlb->active != &tlb->local)
  80                 tlb_flush_rmap_batch(tlb->active, vma);
  81         tlb->delayed_rmap = 0;
  82 }
  83 #endif
  84
  85 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  86 {
  87         struct mmu_gather_batch *batch;
  88
  89         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
  90                 struct encoded_page **pages = batch->encoded_pages;
  91
  92                 do {
  93                         /*
  94                          * limit free batch count when PAGE_SIZE > 4K
  95                          */
  96                         unsigned int nr = min(512U, batch->nr);
  97
  98                         free_pages_and_swap_cache(pages, nr);
  99                         pages += nr;
 100                         batch->nr -= nr;
 101
 102                         cond_resched();
 103                 } while (batch->nr);
 104         }
 105         tlb->active = &tlb->local;
 106 }
 107
 108 static void tlb_batch_list_free(struct mmu_gather *tlb)
 109 {
 110         struct mmu_gather_batch *batch, *next;
 111
 112         for (batch = tlb->local.next; batch; batch = next) {
 113                 next = batch->next;
 114                 free_pages((unsigned long)batch, 0);
 115         }
 116         tlb->local.next = NULL;
 117 }
 118
 119 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
 120 {
 121         struct mmu_gather_batch *batch;
 122
 123         VM_BUG_ON(!tlb->end);
 124
 125 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
 126         VM_WARN_ON(tlb->page_size != page_size);
 127 #endif
 128
 129         batch = tlb->active;
 130         /*
 131          * Add the page and check if we are full. If so
 132          * force a flush.
 133          */
 134         batch->encoded_pages[batch->nr++] = page;
 135         if (batch->nr == batch->max) {
 136                 if (!tlb_next_batch(tlb))
 137                         return true;
 138                 batch = tlb->active;
 139         }
 140         VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
 141
 142         return false;
 143 }
 144
 145 #endif /* MMU_GATHER_NO_GATHER */
 146
 147 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 148
 149 static void __tlb_remove_table_free(struct mmu_table_batch *batch)
 150 {
 151         int i;
 152
 153         for (i = 0; i < batch->nr; i++)
 154                 __tlb_remove_table(batch->tables[i]);
 155
 156         free_page((unsigned long)batch);
 157 }
 158
 159 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
 160
 161 /*
 162  * Semi RCU freeing of the page directories.
 163  *
 164  * This is needed by some architectures to implement software pagetable walkers.
 165  *
 166  * gup_fast() and other software pagetable walkers perform lockless page-table
 167  * walks and therefore need some synchronization with the freeing of the page
 168  * directories. The chosen means to accomplish that is by disabling IRQs over
 169  * the walk.
 170  *
 171  * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 172  * since we unlink the page, flush TLBs, free the page. Since the disabling of
 173  * IRQs delays the completion of the TLB flush we can never observe an already
 174  * freed page.
 175  *
 176  * Architectures that do not have this (PPC) need to delay the freeing by some
 177  * other means, this is that means.
 178  *
 179  * What we do is batch the freed directory pages (tables) and RCU free them.
 180  * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 181  * holds off grace periods.
 182  *
 183  * However, in order to batch these pages we need to allocate storage, this
 184  * allocation is deep inside the MM code and can thus easily fail on memory
 185  * pressure. To guarantee progress we fall back to single table freeing, see
 186  * the implementation of tlb_remove_table_one().
 187  *
 188  */
 189
 190 static void tlb_remove_table_smp_sync(void *arg)
 191 {
 192         /* Simply deliver the interrupt */
 193 }
 194
 195 void tlb_remove_table_sync_one(void)
 196 {
 197         /*
 198          * This isn't an RCU grace period and hence the page-tables cannot be
 199          * assumed to be actually RCU-freed.
 200          *
 201          * It is however sufficient for software page-table walkers that rely on
 202          * IRQ disabling.
 203          */
 204         smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 205 }
 206
 207 static void tlb_remove_table_rcu(struct rcu_head *head)
 208 {
 209         __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
 210 }
 211
 212 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 213 {
 214         call_rcu(&batch->rcu, tlb_remove_table_rcu);
 215 }
 216
 217 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 218
 219 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 220 {
 221         __tlb_remove_table_free(batch);
 222 }
 223
 224 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 225
 226 /*
 227  * If we want tlb_remove_table() to imply TLB invalidates.
 228  */
 229 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 230 {
 231         if (tlb_needs_table_invalidate()) {
 232                 /*
 233                  * Invalidate page-table caches used by hardware walkers. Then
 234                  * we still need to RCU-sched wait while freeing the pages
 235                  * because software walkers can still be in-flight.
 236                  */
 237                 tlb_flush_mmu_tlbonly(tlb);
 238         }
 239 }
 240
 241 static void tlb_remove_table_one(void *table)
 242 {
 243         tlb_remove_table_sync_one();
 244         __tlb_remove_table(table);
 245 }
 246
 247 static void tlb_table_flush(struct mmu_gather *tlb)
 248 {
 249         struct mmu_table_batch **batch = &tlb->batch;
 250
 251         if (*batch) {
 252                 tlb_table_invalidate(tlb);
 253                 tlb_remove_table_free(*batch);
 254                 *batch = NULL;
 255         }
 256 }
 257
 258 void tlb_remove_table(struct mmu_gather *tlb, void *table)
 259 {
 260         struct mmu_table_batch **batch = &tlb->batch;
 261
 262         if (*batch == NULL) {
 263                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 264                 if (*batch == NULL) {
 265                         tlb_table_invalidate(tlb);
 266                         tlb_remove_table_one(table);
 267                         return;
 268                 }
 269                 (*batch)->nr = 0;
 270         }
 271
 272         (*batch)->tables[(*batch)->nr++] = table;
 273         if ((*batch)->nr == MAX_TABLE_BATCH)
 274                 tlb_table_flush(tlb);
 275 }
 276
 277 static inline void tlb_table_init(struct mmu_gather *tlb)
 278 {
 279         tlb->batch = NULL;
 280 }
 281
 282 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */
 283
 284 static inline void tlb_table_flush(struct mmu_gather *tlb) { }
 285 static inline void tlb_table_init(struct mmu_gather *tlb) { }
 286
 287 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */
 288
 289 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 290 {
 291         tlb_table_flush(tlb);
 292 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 293         tlb_batch_pages_flush(tlb);
 294 #endif
 295 }
 296
 297 void tlb_flush_mmu(struct mmu_gather *tlb)
 298 {
 299         tlb_flush_mmu_tlbonly(tlb);
 300         tlb_flush_mmu_free(tlb);
 301 }
 302
 303 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 304                              bool fullmm)
 305 {
 306         tlb->mm = mm;
 307         tlb->fullmm = fullmm;
 308
 309 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 310         tlb->need_flush_all = 0;
 311         tlb->local.next = NULL;
 312         tlb->local.nr   = 0;
 313         tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 314         tlb->active     = &tlb->local;
 315         tlb->batch_count = 0;
 316 #endif
 317         tlb->delayed_rmap = 0;
 318
 319         tlb_table_init(tlb);
 320 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
 321         tlb->page_size = 0;
 322 #endif
 323
 324         __tlb_reset_range(tlb);
 325         inc_tlb_flush_pending(tlb->mm);
 326 }
 327
 328 /**
 329  * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 330  * @tlb: the mmu_gather structure to initialize
 331  * @mm: the mm_struct of the target address space
 332  *
 333  * Called to initialize an (on-stack) mmu_gather structure for page-table
 334  * tear-down from @mm.
 335  */
 336 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
 337 {
 338         __tlb_gather_mmu(tlb, mm, false);
 339 }
 340
 341 /**
 342  * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 343  * @tlb: the mmu_gather structure to initialize
 344  * @mm: the mm_struct of the target address space
 345  *
 346  * In this case, @mm is without users and we're going to destroy the
 347  * full address space (exit/execve).
 348  *
 349  * Called to initialize an (on-stack) mmu_gather structure for page-table
 350  * tear-down from @mm.
 351  */
 352 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
 353 {
 354         __tlb_gather_mmu(tlb, mm, true);
 355 }
 356
 357 /**
 358  * tlb_finish_mmu - finish an mmu_gather structure
 359  * @tlb: the mmu_gather structure to finish
 360  *
 361  * Called at the end of the shootdown operation to free up any resources that
 362  * were required.
 363  */
 364 void tlb_finish_mmu(struct mmu_gather *tlb)
 365 {
 366         /*
 367          * If there are parallel threads are doing PTE changes on same range
 368          * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
 369          * flush by batching, one thread may end up seeing inconsistent PTEs
 370          * and result in having stale TLB entries.  So flush TLB forcefully
 371          * if we detect parallel PTE batching threads.
 372          *
 373          * However, some syscalls, e.g. munmap(), may free page tables, this
 374          * needs force flush everything in the given range. Otherwise this
 375          * may result in having stale TLB entries for some architectures,
 376          * e.g. aarch64, that could specify flush what level TLB.
 377          */
 378         if (mm_tlb_flush_nested(tlb->mm)) {
 379                 /*
 380                  * The aarch64 yields better performance with fullmm by
 381                  * avoiding multiple CPUs spamming TLBI messages at the
 382                  * same time.
 383                  *
 384                  * On x86 non-fullmm doesn't yield significant difference
 385                  * against fullmm.
 386                  */
 387                 tlb->fullmm = 1;
 388                 __tlb_reset_range(tlb);
 389                 tlb->freed_tables = 1;
 390         }
 391
 392         tlb_flush_mmu(tlb);
 393
 394 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 395         tlb_batch_list_free(tlb);
 396 #endif
 397         dec_tlb_flush_pending(tlb->mm);
 398 }