X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=mm%2Fgup.c;h=e4c224cd9661f9c4d1d8372a37a19684cca3e3d0;hb=c3a74f8e25e97166ca0f954414825ae98a3209f6;hp=102877ed77a4b42af7e0a625cfc4b79791494f95;hpb=3c09ec59cdea5b132212d97154d625fd34e436dd;p=linux-2.6-microblaze.git diff --git a/mm/gup.c b/mm/gup.c index 102877ed77a4..e4c224cd9661 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -123,6 +123,28 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page, return NULL; } +static void put_compound_head(struct page *page, int refs, unsigned int flags) +{ + if (flags & FOLL_PIN) { + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, + refs); + + if (hpage_pincount_available(page)) + hpage_pincount_sub(page, refs); + else + refs *= GUP_PIN_COUNTING_BIAS; + } + + VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); + /* + * Calling put_page() for each ref is unnecessarily slow. Only the last + * ref needs a put_page(). + */ + if (refs > 1) + page_ref_sub(page, refs - 1); + put_page(page); +} + /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount * @@ -177,41 +199,6 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) return true; } -#ifdef CONFIG_DEV_PAGEMAP_OPS -static bool __unpin_devmap_managed_user_page(struct page *page) -{ - int count, refs = 1; - - if (!page_is_devmap_managed(page)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - count = page_ref_sub_return(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. - */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); - - return true; -} -#else -static bool __unpin_devmap_managed_user_page(struct page *page) -{ - return false; -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released @@ -223,28 +210,7 @@ static bool __unpin_devmap_managed_user_page(struct page *page) */ void unpin_user_page(struct page *page) { - int refs = 1; - - page = compound_head(page); - - /* - * For devmap managed pages we need to catch refcount transition from - * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the - * page is free and we need to inform the device driver through - * callback. See include/linux/memremap.h and HMM for details. - */ - if (__unpin_devmap_managed_user_page(page)) - return; - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - if (page_ref_sub_and_test(page, refs)) - __put_page(page); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); + put_compound_head(compound_head(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); @@ -923,6 +889,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) return -EFAULT; + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -1060,10 +1029,14 @@ static long __get_user_pages(struct mm_struct *mm, goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) { + if (!vma) { ret = -EFAULT; goto out; } + ret = check_vma_flags(vma, gup_flags); + if (ret) + goto out; + if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -1567,26 +1540,6 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) -static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - long i; - struct vm_area_struct *vma_prev = NULL; - - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - return true; - } - return false; -} - #ifdef CONFIG_CMA static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, @@ -1647,8 +1600,11 @@ check_again: /* * drop the above get_user_pages reference. */ - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); + if (gup_flags & FOLL_PIN) + unpin_user_pages(pages, nr_pages); + else + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { @@ -1702,60 +1658,23 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - struct vm_area_struct **vmas_tmp = vmas; unsigned long flags = 0; - long rc, i; + long rc; - if (gup_flags & FOLL_LONGTERM) { - if (!pages) - return -EINVAL; - - if (!vmas_tmp) { - vmas_tmp = kcalloc(nr_pages, - sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas_tmp) - return -ENOMEM; - } + if (gup_flags & FOLL_LONGTERM) flags = memalloc_nocma_save(); - } - rc = __get_user_pages_locked(mm, start, nr_pages, pages, - vmas_tmp, NULL, gup_flags); + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, + gup_flags); if (gup_flags & FOLL_LONGTERM) { - if (rc < 0) - goto out; - - if (check_dax_vmas(vmas_tmp, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; - } - - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas_tmp, gup_flags); -out: + if (rc > 0) + rc = check_and_migrate_cma_pages(mm, start, rc, pages, + vmas, gup_flags); memalloc_nocma_restore(flags); } - - if (vmas_tmp != vmas) - kfree(vmas_tmp); return rc; } -#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ -static __always_inline long __gup_longterm_locked(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int flags) -{ - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, flags); -} -#endif /* CONFIG_FS_DAX || CONFIG_CMA */ static bool is_valid_gup_flags(unsigned int gup_flags) { @@ -1926,7 +1845,19 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages); /** - * get_user_pages_locked() is suitable to replace the form: + * get_user_pages_locked() - variant of get_user_pages() + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * It is suitable to replace the form: * * mmap_read_lock(mm); * do_something() @@ -1942,16 +1873,6 @@ EXPORT_SYMBOL(get_user_pages); * if (locked) * mmap_read_unlock(mm); * - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * * We can leverage the VM_FAULT_RETRY functionality in the page fault * paths better by using either get_user_pages_locked() or * get_user_pages_unlocked(). @@ -2057,84 +1978,6 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ #ifdef CONFIG_HAVE_FAST_GUP -static void put_compound_head(struct page *page, int refs, unsigned int flags) -{ - if (flags & FOLL_PIN) { - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, - refs); - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, refs); - else - refs *= GUP_PIN_COUNTING_BIAS; - } - - VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); - /* - * Calling put_page() for each ref is unnecessarily slow. Only the last - * ref needs a put_page(). - */ - if (refs > 1) - page_ref_sub(page, refs - 1); - put_page(page); -} - -#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH - -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking any - * locks. For this we would like to load the pointers atomically, but sometimes - * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What - * we do have is the guarantee that a PTE will only either go from not present - * to present, or present to not present or both -- it will not switch to a - * completely different present page without a TLB flush in between; something - * that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. - * We load pte_high *after* loading pte_low, which ensures we don't see an older - * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't - * picked up a changed pte high. We might have gotten rubbish values from - * pte_low and pte_high, but we are guaranteed that pte_low will not have the - * present bit set *unless* it is 'l'. Because get_user_pages_fast() only - * operates on present ptes we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} -#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ -/* - * We require that the PTE can be read atomically. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - return ptep_get(ptep); -} -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ - static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) @@ -2160,7 +2003,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ptem = ptep = pte_offset_map(&pmd, addr); do { - pte_t pte = gup_get_pte(ptep); + pte_t pte = ptep_get_lockless(ptep); struct page *head, *page; /* @@ -2671,13 +2514,61 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, return ret; } -static int internal_get_user_pages_fast(unsigned long start, int nr_pages, +static unsigned long lockless_pages_from_mm(unsigned long start, + unsigned long end, + unsigned int gup_flags, + struct page **pages) +{ + unsigned long flags; + int nr_pinned = 0; + unsigned seq; + + if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + !gup_fast_permitted(start, end)) + return 0; + + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + + /* + * Disable interrupts. The nested form is used, in order to allow full, + * general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being freed + * from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock() here as we also want to block IPIs + * that come from THPs splitting. + */ + local_irq_save(flags); + gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); + local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } + return nr_pinned; +} + +static int internal_get_user_pages_fast(unsigned long start, + unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { - unsigned long addr, len, end; - unsigned long flags; - int nr_pinned = 0, ret = 0; + unsigned long len, end; + unsigned long nr_pinned; + int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | @@ -2691,54 +2582,33 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, might_lock_read(¤t->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) + len = nr_pages << PAGE_SHIFT; + if (check_add_overflow(start, len, &end)) return 0; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - /* - * Disable interrupts. The nested form is used, in order to allow - * full, general purpose use of this routine. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. - */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { - unsigned long fast_flags = gup_flags; - - local_irq_save(flags); - gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); - local_irq_restore(flags); - ret = nr_pinned; - } - - if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { - /* Try to get the remaining pages with get_user_pages */ - start += nr_pinned << PAGE_SHIFT; - pages += nr_pinned; - - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, - gup_flags, pages); + nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); + if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) + return nr_pinned; - /* Have to be a bit careful with return values */ - if (nr_pinned > 0) { - if (ret < 0) - ret = nr_pinned; - else - ret += nr_pinned; - } + /* Slow path: try to get the remaining pages with get_user_pages */ + start += nr_pinned << PAGE_SHIFT; + pages += nr_pinned; + ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, + pages); + if (ret < 0) { + /* + * The caller has to unpin the pages we already pinned so + * returning -errno is not an option + */ + if (nr_pinned) + return nr_pinned; + return ret; } - - return ret; + return ret + nr_pinned; } + /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address