include/linux/rmap.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef _LINUX_RMAP_H
   3 #define _LINUX_RMAP_H
   4 /*
   5  * Declarations for Reverse Mapping functions in mm/rmap.c
   6  */
   7
   8 #include <linux/list.h>
   9 #include <linux/slab.h>
  10 #include <linux/mm.h>
  11 #include <linux/rwsem.h>
  12 #include <linux/memcontrol.h>
  13 #include <linux/highmem.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/memremap.h>
  16
  17 /*
  18  * The anon_vma heads a list of private "related" vmas, to scan if
  19  * an anonymous page pointing to this anon_vma needs to be unmapped:
  20  * the vmas on the list will be related by forking, or by splitting.
  21  *
  22  * Since vmas come and go as they are split and merged (particularly
  23  * in mprotect), the mapping field of an anonymous page cannot point
  24  * directly to a vma: instead it points to an anon_vma, on whose list
  25  * the related vmas can be easily linked or unlinked.
  26  *
  27  * After unlinking the last vma on the list, we must garbage collect
  28  * the anon_vma object itself: we're guaranteed no page can be
  29  * pointing to this anon_vma once its vma list is empty.
  30  */
  31 struct anon_vma {
  32         struct anon_vma *root;          /* Root of this anon_vma tree */
  33         struct rw_semaphore rwsem;      /* W: modification, R: walking the list */
  34         /*
  35          * The refcount is taken on an anon_vma when there is no
  36          * guarantee that the vma of page tables will exist for
  37          * the duration of the operation. A caller that takes
  38          * the reference is responsible for clearing up the
  39          * anon_vma if they are the last user on release
  40          */
  41         atomic_t refcount;
  42
  43         /*
  44          * Count of child anon_vmas and VMAs which points to this anon_vma.
  45          *
  46          * This counter is used for making decision about reusing anon_vma
  47          * instead of forking new one. See comments in function anon_vma_clone.
  48          */
  49         unsigned degree;
  50
  51         struct anon_vma *parent;        /* Parent of this anon_vma */
  52
  53         /*
  54          * NOTE: the LSB of the rb_root.rb_node is set by
  55          * mm_take_all_locks() _after_ taking the above lock. So the
  56          * rb_root must only be read/written after taking the above lock
  57          * to be sure to see a valid next pointer. The LSB bit itself
  58          * is serialized by a system wide lock only visible to
  59          * mm_take_all_locks() (mm_all_locks_mutex).
  60          */
  61
  62         /* Interval tree of private "related" vmas */
  63         struct rb_root_cached rb_root;
  64 };
  65
  66 /*
  67  * The copy-on-write semantics of fork mean that an anon_vma
  68  * can become associated with multiple processes. Furthermore,
  69  * each child process will have its own anon_vma, where new
  70  * pages for that process are instantiated.
  71  *
  72  * This structure allows us to find the anon_vmas associated
  73  * with a VMA, or the VMAs associated with an anon_vma.
  74  * The "same_vma" list contains the anon_vma_chains linking
  75  * all the anon_vmas associated with this VMA.
  76  * The "rb" field indexes on an interval tree the anon_vma_chains
  77  * which link all the VMAs associated with this anon_vma.
  78  */
  79 struct anon_vma_chain {
  80         struct vm_area_struct *vma;
  81         struct anon_vma *anon_vma;
  82         struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
  83         struct rb_node rb;                      /* locked by anon_vma->rwsem */
  84         unsigned long rb_subtree_last;
  85 #ifdef CONFIG_DEBUG_VM_RB
  86         unsigned long cached_vma_start, cached_vma_last;
  87 #endif
  88 };
  89
  90 enum ttu_flags {
  91         TTU_SPLIT_HUGE_PMD      = 0x4,  /* split huge PMD if any */
  92         TTU_IGNORE_MLOCK        = 0x8,  /* ignore mlock */
  93         TTU_SYNC                = 0x10, /* avoid racy checks with PVMW_SYNC */
  94         TTU_IGNORE_HWPOISON     = 0x20, /* corrupted page is recoverable */
  95         TTU_BATCH_FLUSH         = 0x40, /* Batch TLB flushes where possible
  96                                          * and caller guarantees they will
  97                                          * do a final flush if necessary */
  98         TTU_RMAP_LOCKED         = 0x80, /* do not grab rmap lock:
  99                                          * caller holds it */
 100 };
 101
 102 #ifdef CONFIG_MMU
 103 static inline void get_anon_vma(struct anon_vma *anon_vma)
 104 {
 105         atomic_inc(&anon_vma->refcount);
 106 }
 107
 108 void __put_anon_vma(struct anon_vma *anon_vma);
 109
 110 static inline void put_anon_vma(struct anon_vma *anon_vma)
 111 {
 112         if (atomic_dec_and_test(&anon_vma->refcount))
 113                 __put_anon_vma(anon_vma);
 114 }
 115
 116 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 117 {
 118         down_write(&anon_vma->root->rwsem);
 119 }
 120
 121 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 122 {
 123         up_write(&anon_vma->root->rwsem);
 124 }
 125
 126 static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
 127 {
 128         down_read(&anon_vma->root->rwsem);
 129 }
 130
 131 static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
 132 {
 133         return down_read_trylock(&anon_vma->root->rwsem);
 134 }
 135
 136 static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
 137 {
 138         up_read(&anon_vma->root->rwsem);
 139 }
 140
 141
 142 /*
 143  * anon_vma helper functions.
 144  */
 145 void anon_vma_init(void);       /* create anon_vma_cachep */
 146 int  __anon_vma_prepare(struct vm_area_struct *);
 147 void unlink_anon_vmas(struct vm_area_struct *);
 148 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
 149 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
 150
 151 static inline int anon_vma_prepare(struct vm_area_struct *vma)
 152 {
 153         if (likely(vma->anon_vma))
 154                 return 0;
 155
 156         return __anon_vma_prepare(vma);
 157 }
 158
 159 static inline void anon_vma_merge(struct vm_area_struct *vma,
 160                                   struct vm_area_struct *next)
 161 {
 162         VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
 163         unlink_anon_vmas(next);
 164 }
 165
 166 struct anon_vma *page_get_anon_vma(struct page *page);
 167
 168 /* RMAP flags, currently only relevant for some anon rmap operations. */
 169 typedef int __bitwise rmap_t;
 170
 171 /*
 172  * No special request: if the page is a subpage of a compound page, it is
 173  * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
 174  */
 175 #define RMAP_NONE               ((__force rmap_t)0)
 176
 177 /* The (sub)page is exclusive to a single process. */
 178 #define RMAP_EXCLUSIVE          ((__force rmap_t)BIT(0))
 179
 180 /*
 181  * The compound page is not mapped via PTEs, but instead via a single PMD and
 182  * should be accounted accordingly.
 183  */
 184 #define RMAP_COMPOUND           ((__force rmap_t)BIT(1))
 185
 186 /*
 187  * rmap interfaces called when adding or removing pte of page
 188  */
 189 void page_move_anon_rmap(struct page *, struct vm_area_struct *);
 190 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 191                 unsigned long address, rmap_t flags);
 192 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
 193                 unsigned long address);
 194 void page_add_file_rmap(struct page *, struct vm_area_struct *,
 195                 bool compound);
 196 void page_remove_rmap(struct page *, struct vm_area_struct *,
 197                 bool compound);
 198
 199 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
 200                 unsigned long address, rmap_t flags);
 201 void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
 202                 unsigned long address);
 203
 204 static inline void __page_dup_rmap(struct page *page, bool compound)
 205 {
 206         atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
 207 }
 208
 209 static inline void page_dup_file_rmap(struct page *page, bool compound)
 210 {
 211         __page_dup_rmap(page, compound);
 212 }
 213
 214 /**
 215  * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
 216  *                          anonymous page
 217  * @page: the page to duplicate the mapping for
 218  * @compound: the page is mapped as compound or as a small page
 219  * @vma: the source vma
 220  *
 221  * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq.
 222  *
 223  * Duplicating the mapping can only fail if the page may be pinned; device
 224  * private pages cannot get pinned and consequently this function cannot fail.
 225  *
 226  * If duplicating the mapping succeeds, the page has to be mapped R/O into
 227  * the parent and the child. It must *not* get mapped writable after this call.
 228  *
 229  * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 230  */
 231 static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
 232                                          struct vm_area_struct *vma)
 233 {
 234         VM_BUG_ON_PAGE(!PageAnon(page), page);
 235
 236         /*
 237          * No need to check+clear for already shared pages, including KSM
 238          * pages.
 239          */
 240         if (!PageAnonExclusive(page))
 241                 goto dup;
 242
 243         /*
 244          * If this page may have been pinned by the parent process,
 245          * don't allow to duplicate the mapping but instead require to e.g.,
 246          * copy the page immediately for the child so that we'll always
 247          * guarantee the pinned page won't be randomly replaced in the
 248          * future on write faults.
 249          */
 250         if (likely(!is_device_private_page(page) &&
 251             unlikely(page_needs_cow_for_dma(vma, page))))
 252                 return -EBUSY;
 253
 254         ClearPageAnonExclusive(page);
 255         /*
 256          * It's okay to share the anon page between both processes, mapping
 257          * the page R/O into both processes.
 258          */
 259 dup:
 260         __page_dup_rmap(page, compound);
 261         return 0;
 262 }
 263
 264 /**
 265  * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
 266  *                            shared to prepare for KSM or temporary unmapping
 267  * @page: the exclusive anonymous page to try marking possibly shared
 268  *
 269  * The caller needs to hold the PT lock and has to have the page table entry
 270  * cleared/invalidated+flushed, to properly sync against GUP-fast.
 271  *
 272  * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
 273  * to duplicate a mapping, but instead to prepare for KSM or temporarily
 274  * unmapping a page (swap, migration) via page_remove_rmap().
 275  *
 276  * Marking the page shared can only fail if the page may be pinned; device
 277  * private pages cannot get pinned and consequently this function cannot fail.
 278  *
 279  * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
 280  * otherwise.
 281  */
 282 static inline int page_try_share_anon_rmap(struct page *page)
 283 {
 284         VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
 285
 286         /* See page_try_dup_anon_rmap(). */
 287         if (likely(!is_device_private_page(page) &&
 288             unlikely(page_maybe_dma_pinned(page))))
 289                 return -EBUSY;
 290
 291         ClearPageAnonExclusive(page);
 292         return 0;
 293 }
 294
 295 /*
 296  * Called from mm/vmscan.c to handle paging out
 297  */
 298 int folio_referenced(struct folio *, int is_locked,
 299                         struct mem_cgroup *memcg, unsigned long *vm_flags);
 300
 301 void try_to_migrate(struct folio *folio, enum ttu_flags flags);
 302 void try_to_unmap(struct folio *, enum ttu_flags flags);
 303
 304 int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
 305                                 unsigned long end, struct page **pages,
 306                                 void *arg);
 307
 308 /* Avoid racy checks */
 309 #define PVMW_SYNC               (1 << 0)
 310 /* Look for migration entries rather than present PTEs */
 311 #define PVMW_MIGRATION          (1 << 1)
 312
 313 struct page_vma_mapped_walk {
 314         unsigned long pfn;
 315         unsigned long nr_pages;
 316         pgoff_t pgoff;
 317         struct vm_area_struct *vma;
 318         unsigned long address;
 319         pmd_t *pmd;
 320         pte_t *pte;
 321         spinlock_t *ptl;
 322         unsigned int flags;
 323 };
 324
 325 #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)       \
 326         struct page_vma_mapped_walk name = {                            \
 327                 .pfn = page_to_pfn(_page),                              \
 328                 .nr_pages = compound_nr(_page),                         \
 329                 .pgoff = page_to_pgoff(_page),                          \
 330                 .vma = _vma,                                            \
 331                 .address = _address,                                    \
 332                 .flags = _flags,                                        \
 333         }
 334
 335 #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)     \
 336         struct page_vma_mapped_walk name = {                            \
 337                 .pfn = folio_pfn(_folio),                               \
 338                 .nr_pages = folio_nr_pages(_folio),                     \
 339                 .pgoff = folio_pgoff(_folio),                           \
 340                 .vma = _vma,                                            \
 341                 .address = _address,                                    \
 342                 .flags = _flags,                                        \
 343         }
 344
 345 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 346 {
 347         /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
 348         if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
 349                 pte_unmap(pvmw->pte);
 350         if (pvmw->ptl)
 351                 spin_unlock(pvmw->ptl);
 352 }
 353
 354 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
 355
 356 /*
 357  * Used by swapoff to help locate where page is expected in vma.
 358  */
 359 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 360
 361 /*
 362  * Cleans the PTEs of shared mappings.
 363  * (and since clean PTEs should also be readonly, write protects them too)
 364  *
 365  * returns the number of cleaned PTEs.
 366  */
 367 int folio_mkclean(struct folio *);
 368
 369 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 370                       struct vm_area_struct *vma);
 371
 372 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 373
 374 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 375
 376 /*
 377  * rmap_walk_control: To control rmap traversing for specific needs
 378  *
 379  * arg: passed to rmap_one() and invalid_vma()
 380  * try_lock: bail out if the rmap lock is contended
 381  * contended: indicate the rmap traversal bailed out due to lock contention
 382  * rmap_one: executed on each vma where page is mapped
 383  * done: for checking traversing termination condition
 384  * anon_lock: for getting anon_lock by optimized way rather than default
 385  * invalid_vma: for skipping uninterested vma
 386  */
 387 struct rmap_walk_control {
 388         void *arg;
 389         bool try_lock;
 390         bool contended;
 391         /*
 392          * Return false if page table scanning in rmap_walk should be stopped.
 393          * Otherwise, return true.
 394          */
 395         bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
 396                                         unsigned long addr, void *arg);
 397         int (*done)(struct folio *folio);
 398         struct anon_vma *(*anon_lock)(struct folio *folio,
 399                                       struct rmap_walk_control *rwc);
 400         bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
 401 };
 402
 403 void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
 404 void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
 405
 406 /*
 407  * Called by memory-failure.c to kill processes.
 408  */
 409 struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
 410                                           struct rmap_walk_control *rwc);
 411 void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
 412
 413 #else   /* !CONFIG_MMU */
 414
 415 #define anon_vma_init()         do {} while (0)
 416 #define anon_vma_prepare(vma)   (0)
 417 #define anon_vma_link(vma)      do {} while (0)
 418
 419 static inline int folio_referenced(struct folio *folio, int is_locked,
 420                                   struct mem_cgroup *memcg,
 421                                   unsigned long *vm_flags)
 422 {
 423         *vm_flags = 0;
 424         return 0;
 425 }
 426
 427 static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
 428 {
 429 }
 430
 431 static inline int folio_mkclean(struct folio *folio)
 432 {
 433         return 0;
 434 }
 435 #endif  /* CONFIG_MMU */
 436
 437 static inline int page_mkclean(struct page *page)
 438 {
 439         return folio_mkclean(page_folio(page));
 440 }
 441 #endif  /* _LINUX_RMAP_H */