Linux 6.9-rc1
[linux-2.6-microblaze.git] mm/filemap.c
index 750e779..7437b2b 100644
  *    ->private_lock           (zap_pte_range->block_dirty_folio)
  */
 
+static void mapping_set_update(struct xa_state *xas,
+               struct address_space *mapping)
+{
+       if (dax_mapping(mapping) || shmem_mapping(mapping))
+               return;
+       xas_set_update(xas, workingset_update_node);
+       xas_set_lru(xas, &shadow_nodes);
+}
+
 static void page_cache_delete(struct address_space *mapping,
                                   struct folio *folio, void *shadow)
 {
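
The new mapping_set_update() helper above is now private to filemap.c. For reference, here is a hedged sketch of how a deletion path is expected to use it; page_cache_delete()'s body is not visible in this hunk, so the function below (example_cache_delete()) and its exact contents are illustrative assumptions rather than the patch's code:

static void example_cache_delete(struct address_space *mapping,
                                 struct folio *folio, void *shadow)
{
        XA_STATE(xas, &mapping->i_pages, folio->index);

        /*
         * No-op for DAX and shmem mappings; otherwise hooks the xa_state up
         * to workingset_update_node and the shadow_nodes LRU.
         */
        mapping_set_update(&xas, mapping);

        xas_set_order(&xas, folio->index, folio_order(folio));
        xas_store(&xas, shadow);        /* replace the folio with its shadow */
        xas_init_marks(&xas);
}

The point of the helper is that every xarray update in this file gets the same workingset bookkeeping unless the mapping is DAX or shmem backed.
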
@@ -843,7 +852,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
                struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
        XA_STATE(xas, &mapping->i_pages, index);
-       int huge = folio_test_hugetlb(folio);
+       bool huge = folio_test_hugetlb(folio);
        bool charged = false;
        long nr = 1;
 
@@ -1354,7 +1363,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
        unsigned long pflags;
        bool in_thrashing;
        wait_queue_head_t *q;
-       struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+       struct folio *folio = pfn_swap_entry_folio(entry);
 
        q = folio_waitqueue(folio);
        if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
@@ -1912,8 +1921,6 @@ no_page:
                        gfp_t alloc_gfp = gfp;
 
                        err = -ENOMEM;
-                       if (order == 1)
-                               order = 0;
                        if (order > 0)
                                alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
                        folio = filemap_alloc_folio(alloc_gfp, order);
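
The two deleted lines above rounded an order-1 request down to order 0, a leftover from when order-1 folios were not supported in the page cache; with that restriction gone, every order goes through the same path. A hedged, simplified sketch of the kind of fallback loop that surrounds these lines (an illustration, not the function verbatim):

        do {
                gfp_t alloc_gfp = gfp;

                /* Higher orders are opportunistic: don't retry hard or warn. */
                if (order > 0)
                        alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
                folio = filemap_alloc_folio(alloc_gfp, order);
                if (folio)
                        break;
        } while (order-- > 0);  /* fall back to smaller orders, down to a page */
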
@@ -2608,15 +2615,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                        goto put_folios;
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
-               /*
-                * Pairs with a barrier in
-                * block_write_end()->mark_buffer_dirty() or other page
-                * dirtying routines like iomap_write_end() to ensure
-                * changes to page contents are visible before we see
-                * increased inode size.
-                */
-               smp_rmb();
-
                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
@@ -3183,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
        return fpin;
 }
 
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       vm_fault_t ret = 0;
+       pte_t *ptep;
+
+       /*
+        * We might have COW'ed a pagecache folio and might now have an mlocked
+        * anon folio mapped. The original pagecache folio is not mlocked and
+        * might have been evicted. During a read+clear/modify/write update of
+        * the PTE, as done in do_numa_page()/change_pte_range(), we
+        * temporarily clear the PTE under PT lock and might detect it here as
+        * "none" when not holding the PT lock.
+        *
+        * Not rechecking the PTE under PT lock could result in an unexpected
+        * major fault in an mlock'ed region. Recheck only for this special
+        * scenario while holding the PT lock, so as not to degrade non-mlocked
+        * scenarios. Recheck the PTE without the PT lock first, thereby
+        * reducing the number of times we have to take the PT lock.
+        */
+       if (!(vma->vm_flags & VM_LOCKED))
+               return 0;
+
+       if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+               return 0;
+
+       ptep = pte_offset_map(vmf->pmd, vmf->address);
+       if (unlikely(!ptep))
+               return VM_FAULT_NOPAGE;
+
+       if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+               ret = VM_FAULT_NOPAGE;
+       } else {
+               spin_lock(vmf->ptl);
+               if (unlikely(!pte_none(ptep_get(ptep))))
+                       ret = VM_FAULT_NOPAGE;
+               spin_unlock(vmf->ptl);
+       }
+       pte_unmap(ptep);
+       return ret;
+}
+
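
To make the race described in the comment concrete, here is a hedged, illustrative fragment in the style of change_pte_range(); vma, addr, pte and newprot stand in for the usual locals of that path, and none of this is code from the patch. On most architectures ptep_modify_prot_start() reads and clears the PTE, so a lockless reader can observe pte_none() until the matching commit:

        pte_t oldpte, newpte;

        /* Under the PT lock. The PTE is transiently cleared here ... */
        oldpte = ptep_modify_prot_start(vma, addr, pte);
        newpte = pte_modify(oldpte, newprot);
        /* ... and only becomes non-none again at commit time. */
        ptep_modify_prot_commit(vma, addr, pte, oldpte, newpte);

filemap_fault_recheck_pte_none() keeps a VM_LOCKED VMA from treating this transient state as a genuinely unmapped page by re-reading the PTE, first locklessly and then under the PT lock.
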
 /**
  * filemap_fault - read in file data for page fault handling
  * @vmf:       struct vm_fault containing details of the fault
@@ -3238,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
                        mapping_locked = true;
                }
        } else {
+               ret = filemap_fault_recheck_pte_none(vmf);
+               if (unlikely(ret))
+                       return ret;
+
                /* No page in the page cache at all */
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
@@ -4111,28 +4155,40 @@ static void filemap_cachestat(struct address_space *mapping,
 
        rcu_read_lock();
        xas_for_each(&xas, folio, last_index) {
+               int order;
                unsigned long nr_pages;
                pgoff_t folio_first_index, folio_last_index;
 
+               /*
+                * Don't deref the folio. It is not pinned, and might
+                * get freed (and reused) underneath us.
+                *
+                * We *could* pin it, but that would be expensive for
+                * what should be a fast and lightweight syscall.
+                *
+                * Instead, derive all information of interest from
+                * the rcu-protected xarray.
+                */
+
                if (xas_retry(&xas, folio))
                        continue;
 
+               order = xa_get_order(xas.xa, xas.xa_index);
+               nr_pages = 1 << order;
+               folio_first_index = round_down(xas.xa_index, 1 << order);
+               folio_last_index = folio_first_index + nr_pages - 1;
+
+               /* Folios might straddle the range boundaries, only count covered pages */
+               if (folio_first_index < first_index)
+                       nr_pages -= first_index - folio_first_index;
+
+               if (folio_last_index > last_index)
+                       nr_pages -= folio_last_index - last_index;
+
                if (xa_is_value(folio)) {
                        /* page is evicted */
                        void *shadow = (void *)folio;
                        bool workingset; /* not used */
-                       int order = xa_get_order(xas.xa, xas.xa_index);
-
-                       nr_pages = 1 << order;
-                       folio_first_index = round_down(xas.xa_index, 1 << order);
-                       folio_last_index = folio_first_index + nr_pages - 1;
-
-                       /* Folios might straddle the range boundaries, only count covered pages */
-                       if (folio_first_index < first_index)
-                               nr_pages -= first_index - folio_first_index;
-
-                       if (folio_last_index > last_index)
-                               nr_pages -= folio_last_index - last_index;
 
                        cs->nr_evicted += nr_pages;
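
A worked example of the clamping introduced above, with made-up numbers purely for illustration:

        /*
         * Suppose xa_get_order() reports order 2 for an entry found at
         * xas.xa_index == 6, and the caller passed first_index == 5,
         * last_index == 20:
         *
         *   nr_pages          = 1 << 2           = 4
         *   folio_first_index = round_down(6, 4) = 4
         *   folio_last_index  = 4 + 4 - 1        = 7
         *   4 < 5   ->  nr_pages -= (5 - 4)      = 3   (page 4 lies outside the range)
         *   7 <= 20 ->  no right-hand clamp
         *
         * so pages 5, 6 and 7 are attributed to this entry for the request.
         */
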
 
@@ -4150,24 +4206,13 @@ static void filemap_cachestat(struct address_space *mapping,
                        goto resched;
                }
 
-               nr_pages = folio_nr_pages(folio);
-               folio_first_index = folio_pgoff(folio);
-               folio_last_index = folio_first_index + nr_pages - 1;
-
-               /* Folios might straddle the range boundaries, only count covered pages */
-               if (folio_first_index < first_index)
-                       nr_pages -= first_index - folio_first_index;
-
-               if (folio_last_index > last_index)
-                       nr_pages -= folio_last_index - last_index;
-
                /* page is in cache */
                cs->nr_cache += nr_pages;
 
-               if (folio_test_dirty(folio))
+               if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
                        cs->nr_dirty += nr_pages;
 
-               if (folio_test_writeback(folio))
+               if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
                        cs->nr_writeback += nr_pages;
 
 resched:
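
For context on how these counters reach userspace, a hedged sketch of a cachestat(2) caller. It assumes a kernel with the syscall (v6.5+) and uapi headers that define __NR_cachestat plus struct cachestat and struct cachestat_range in <linux/mman.h>; glibc may not provide a wrapper, so raw syscall(2) is used, and error handling is minimal:

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mman.h>         /* struct cachestat, struct cachestat_range */

int main(int argc, char **argv)
{
        /* len == 0 is documented to mean "from off to the end of the file". */
        struct cachestat_range range = { .off = 0, .len = 0 };
        struct cachestat cs = { 0 };
        int fd;

        if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                return 1;
        if (syscall(__NR_cachestat, fd, &range, &cs, 0))
                return 1;

        printf("cached %llu dirty %llu writeback %llu evicted %llu\n",
               (unsigned long long)cs.nr_cache,
               (unsigned long long)cs.nr_dirty,
               (unsigned long long)cs.nr_writeback,
               (unsigned long long)cs.nr_evicted);
        return 0;
}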