workingset_nodereclaim
Number of times a shadow node has been reclaimed
+ pgscan (npn)
+ Amount of scanned pages (in an inactive LRU list)
+
+ pgsteal (npn)
+ Amount of reclaimed pages
+
+ pgscan_kswapd (npn)
+ Amount of scanned pages by kswapd (in an inactive LRU list)
+
+ pgscan_direct (npn)
+ Amount of scanned pages directly (in an inactive LRU list)
+
+ pgsteal_kswapd (npn)
+ Amount of reclaimed pages by kswapd
+
+ pgsteal_direct (npn)
+ Amount of reclaimed pages directly
+
pgfault (npn)
Total number of page faults incurred
pgrefill (npn)
Amount of scanned pages (in an active LRU list)
- pgscan (npn)
- Amount of scanned pages (in an inactive LRU list)
-
- pgsteal (npn)
- Amount of reclaimed pages
-
pgactivate (npn)
Amount of pages moved to the active LRU list
- ``kmemleak_alloc_phys``
- ``kmemleak_free_part_phys``
-- ``kmemleak_not_leak_phys``
- ``kmemleak_ignore_phys``
Dealing with false positives/negatives
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
/*
* Fault retry nuances, mmap_lock already relinquished by core mm
*/
return 0;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
return 0;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (fault & VM_FAULT_RETRY) {
mm_flags |= FAULT_FLAG_TRIED;
goto retry;
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
flags |= FAULT_FLAG_TRIED;
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
/* The most common case -- we are done. */
if (likely(!(fault & VM_FAULT_ERROR))) {
if (fault & VM_FAULT_RETRY) {
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
/*
* We ran out of memory, or some other thing happened
if (fault_signal_pending(fault, regs))
return 0;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
/*
* We hit a shared mapping outside of the file, or some
ret = 0;
*flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (*flt & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
if (fault_signal_pending(fault, regs))
return user_mode(regs) ? 0 : SIGBUS;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ goto out;
+
/*
* Handle the retry right now, the mmap_lock has been released in that
* case.
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
+out:
/*
* Major/minor page fault accounting.
*/
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
goto out_up;
goto out;
}
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ if (gmap) {
+ mmap_read_lock(mm);
+ goto out_gmap;
+ }
+ fault = 0;
+ goto out;
+ }
+
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
mmap_read_lock(mm);
goto retry;
}
+out_gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
if (mm_fault_error(regs, error_code, address, fault))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
if (fault_signal_pending(fault, regs))
return;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
if (fault_signal_pending(fault, regs))
goto exit_exception;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ goto lock_released;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
}
mmap_read_unlock(mm);
+lock_released:
mm_rss = get_mm_rss(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
goto out_nosemaphore;
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return 0;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM) {
goto out_of_memory;
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
/*
* If we need to retry the mmap_lock has already been released,
* and if there is a fatal signal pending there is no guarantee
#include <asm/tlbflush.h>
#include <asm/elf.h>
-#if 0 /* This is just for testing */
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
- unsigned long start = address;
- int length = 1;
- int nr;
- struct page *page;
- struct vm_area_struct *vma;
-
- vma = find_vma(mm, addr);
- if (!vma || !is_vm_hugetlb_page(vma))
- return ERR_PTR(-EINVAL);
-
- pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
-
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || pte_none(*pte));
-
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageHead(page));
-
- return page;
-}
-
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
-
-#else
-
/*
* pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
* hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
{
return !!(pud_val(pud) & _PAGE_PSE);
}
-#endif
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
return;
}
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED)
+ return;
+
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
if (!nomap)
- kmemleak_alloc_phys(base, size, 0, 0);
+ kmemleak_alloc_phys(base, size, 0);
}
else
pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
* detail.
*
* @kdamond: Kernel thread who does the monitoring.
- * @kdamond_stop: Notifies whether kdamond should stop.
* @kdamond_lock: Mutex for the synchronizations with @kdamond.
*
* For each monitoring context, one kernel thread for the monitoring is
* Once started, the monitoring thread runs until explicitly required to be
* terminated or every monitoring target is invalid. The validity of the
* targets is checked via the &damon_operations.target_valid of @ops. The
- * termination can also be explicitly requested by writing non-zero to
- * @kdamond_stop. The thread sets @kdamond to NULL when it terminates.
- * Therefore, users can know whether the monitoring is ongoing or terminated by
- * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from
- * outside of the monitoring thread must be protected by @kdamond_lock.
- *
- * Note that the monitoring thread protects only @kdamond and @kdamond_stop via
- * @kdamond_lock. Accesses to other fields must be protected by themselves.
+ * termination can also be explicitly requested by calling damon_stop().
+ * The thread sets @kdamond to NULL when it terminates. Therefore, users can
+ * know whether the monitoring is ongoing or terminated by reading @kdamond.
+ * Reads and writes to @kdamond from outside of the monitoring thread must
+ * be protected by @kdamond_lock.
+ *
+ * Note that the monitoring thread protects only @kdamond via @kdamond_lock.
+ * Accesses to other fields must be protected by themselves.
*
* @ops: Set of monitoring operations for given use cases.
* @callback: Set of callbacks for monitoring events notifications.
kunmap_local(dst);
}
-static inline void memmove_page(struct page *dst_page, size_t dst_off,
- struct page *src_page, size_t src_off,
- size_t len)
-{
- char *dst = kmap_local_page(dst_page);
- char *src = kmap_local_page(src_page);
-
- VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
- memmove(dst + dst_off, src + src_off, len);
- kunmap_local(src);
- kunmap_local(dst);
-}
-
static inline void memset_page(struct page *page, size_t offset, int val,
size_t len)
{
extern void kmemleak_ignore(const void *ptr) __ref;
extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
extern void kmemleak_no_scan(const void *ptr) __ref;
-extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
gfp_t gfp) __ref;
extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
-extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref;
extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
{
}
static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
- int min_count, gfp_t gfp)
+ gfp_t gfp)
{
}
static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
}
-static inline void kmemleak_not_leak_phys(phys_addr_t phys)
-{
-}
static inline void kmemleak_ignore_phys(phys_addr_t phys)
{
}
}
struct mem_cgroup *mem_cgroup_from_obj(void *p);
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
static inline void count_objcg_event(struct obj_cgroup *objcg,
enum vm_event_item idx)
rcu_read_unlock();
}
+/**
+ * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object.
+ * @p: pointer to object from which memcg should be extracted. It can be NULL.
+ *
+ * Retrieves the memory group into which the memory of the pointed kernel
+ * object is accounted. If memcg is found, its reference is taken.
+ * If a passed kernel object is uncharged, or if proper memcg cannot be found,
+ * as well as if mem_cgroup is disabled, NULL is returned.
+ *
+ * Return: valid memcg pointer with taken reference or NULL.
+ */
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ do {
+ memcg = mem_cgroup_from_obj(p);
+ } while (memcg && !css_tryget(&memcg->css));
+ rcu_read_unlock();
+ return memcg;
+}
+
+/**
+ * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup.
+ * @memcg: pointer to a valid memory cgroup or NULL.
+ *
+ * If passed argument is not NULL, returns it without any additional checks
+ * and changes. Otherwise, root_mem_cgroup is returned.
+ *
+ * NOTE: root_mem_cgroup can be NULL during early boot.
+ */
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg : root_mem_cgroup;
+}
#else
static inline bool mem_cgroup_kmem_disabled(void)
{
static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
- return NULL;
+ return NULL;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+ return NULL;
}
static inline void count_objcg_event(struct obj_cgroup *objcg,
{
}
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+ return NULL;
+}
+
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
#endif /* CONFIG_MEMCG_KMEM */
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
* @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs
* fsync() to complete (for synchronous page faults
* in DAX)
+ * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
* @VM_FAULT_HINDEX_MASK: mask HINDEX value
*
*/
VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
+ VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
mmap_read_lock(mm);
for (addr = start; addr < end; addr = next) {
- unsigned long mapped;
+ unsigned long mapped = 0;
int i;
if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);
ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
- mapped = dmirror_atomic_map(addr, next, pages, dmirror);
+ /*
+ * Do dmirror_atomic_map() iff all pages are marked for
+ * exclusive access to avoid accessing uninitialized
+ * fields of pages.
+ */
+ if (ret == (next - addr) >> PAGE_SHIFT)
+ mapped = dmirror_atomic_map(addr, next, pages, dmirror);
for (i = 0; i < ret; i++) {
if (pages[i]) {
unlock_page(pages[i]);
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+ * mmap lock in the page fault handler. Sanity check this.
+ */
+ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+ if (locked)
+ *locked = 0;
+ /*
+ * We should do the same as VM_FAULT_RETRY, but let's not
+ * return -EBUSY since that's not reflecting the reality of
+ * what has happened - we've just fully completed a page
+ * fault, with the mmap lock released. Use -EAGAIN to show
+ * that we want to take the mmap lock _again_.
+ */
+ return -EAGAIN;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
case 0:
goto retry;
case -EBUSY:
+ case -EAGAIN:
ret = 0;
fallthrough;
case -EFAULT:
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * NOTE: it's a pity that we need to retake the lock here
+ * to pair with the unlock() in the callers. Ideally we
+ * could tell the callers so they do not need to unlock.
+ */
+ mmap_read_lock(mm);
+ *unlocked = true;
+ return 0;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
- /* VM_FAULT_RETRY cannot return errors */
+ /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
if (!*locked) {
BUG_ON(ret < 0);
BUG_ON(ret >= nr_pages);
* The following locks and mutexes are used by kmemleak:
*
* - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
- * accesses to the object_tree_root. The object_list is the main list
- * holding the metadata (struct kmemleak_object) for the allocated memory
- * blocks. The object_tree_root is a red black tree used to look-up
- * metadata based on a pointer to the corresponding memory block. The
- * kmemleak_object structures are added to the object_list and
- * object_tree_root in the create_object() function called from the
- * kmemleak_alloc() callback and removed in delete_object() called from the
- * kmemleak_free() callback
+ * accesses to the object_tree_root (or object_phys_tree_root). The
+ * object_list is the main list holding the metadata (struct kmemleak_object)
+ * for the allocated memory blocks. The object_tree_root and object_phys_tree_root
+ * are red black trees used to look-up metadata based on a pointer to the
+ * corresponding memory block. The object_phys_tree_root is for objects
+ * allocated with physical address. The kmemleak_object structures are
+ * added to the object_list and object_tree_root (or object_phys_tree_root)
+ * in the create_object() function called from the kmemleak_alloc() (or
+ * kmemleak_alloc_phys()) callback and removed in delete_object() called from
+ * the kmemleak_free() callback
* - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
* Accesses to the metadata (e.g. count) are protected by this lock. Note
* that some members of this structure may be protected by other means
#define OBJECT_NO_SCAN (1 << 2)
/* flag set to fully scan the object when scan_area allocation failed */
#define OBJECT_FULL_SCAN (1 << 3)
+/* flag set for object allocated with physical address */
+#define OBJECT_PHYS (1 << 4)
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
-/* protecting the access to object_list and object_tree_root */
+/* search tree for object (with OBJECT_PHYS flag) boundaries */
+static struct rb_root object_phys_tree_root = RB_ROOT;
+/* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */
static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
const u8 *ptr = (const u8 *)object->pointer;
size_t len;
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ return;
+
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
* beginning of the memory block are allowed. The kmemleak_lock must be held
* when calling this function.
*/
-static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
+ bool is_phys)
{
- struct rb_node *rb = object_tree_root.rb_node;
+ struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node :
+ object_tree_root.rb_node;
unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
while (rb) {
return NULL;
}
+/* Look-up a kmemleak object which allocated with virtual address. */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+ return __lookup_object(ptr, alias, false);
+}
+
/*
* Increment the object use_count. Return 1 if successful or 0 otherwise. Note
* that once an object's use_count reached 0, the RCU freeing was already
/*
* Look up an object in the object search tree and increase its use_count.
*/
-static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object;
rcu_read_lock();
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = lookup_object(ptr, alias);
+ object = __lookup_object(ptr, alias, is_phys);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
return object;
}
+/* Look up and get an object which allocated with virtual address. */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+ return __find_and_get_object(ptr, alias, false);
+}
+
/*
- * Remove an object from the object_tree_root and object_list. Must be called
- * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ * Remove an object from the object_tree_root (or object_phys_tree_root)
+ * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak
+ * is still enabled.
*/
static void __remove_object(struct kmemleak_object *object)
{
- rb_erase(&object->rb_node, &object_tree_root);
+ rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ?
+ &object_phys_tree_root :
+ &object_tree_root);
list_del_rcu(&object->object_list);
}
/*
* Look up an object in the object search tree and remove it from both
- * object_tree_root and object_list. The returned object's use_count should be
- * at least 1, as initially set by create_object().
+ * object_tree_root (or object_phys_tree_root) and object_list. The
+ * returned object's use_count should be at least 1, as initially set
+ * by create_object().
*/
-static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object;
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = lookup_object(ptr, alias);
+ object = __lookup_object(ptr, alias, is_phys);
if (object)
__remove_object(object);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/*
* Create the metadata (struct kmemleak_object) corresponding to an allocated
- * memory block and add it to the object_list and object_tree_root.
+ * memory block and add it to the object_list and object_tree_root (or
+ * object_phys_tree_root).
*/
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
INIT_HLIST_HEAD(&object->area_list);
raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
- object->flags = OBJECT_ALLOCATED;
+ object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0);
object->pointer = ptr;
object->size = kfence_ksize((void *)ptr) ?: size;
object->excess_ref = 0;
raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
- min_addr = min(min_addr, untagged_ptr);
- max_addr = max(max_addr, untagged_ptr + size);
- link = &object_tree_root.rb_node;
+ /*
+ * Only update min_addr and max_addr with object
+ * storing virtual address.
+ */
+ if (!is_phys) {
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
+ }
+ link = is_phys ? &object_phys_tree_root.rb_node :
+ &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
rb_parent = *link;
}
}
rb_link_node(&object->rb_node, rb_parent, link);
- rb_insert_color(&object->rb_node, &object_tree_root);
+ rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
+ &object_tree_root);
list_add_tail_rcu(&object->object_list, &object_list);
out:
return object;
}
+/* Create kmemleak object which allocated with virtual address. */
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ return __create_object(ptr, size, min_count, gfp, false);
+}
+
+/* Create kmemleak object which allocated with physical address. */
+static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ return __create_object(ptr, size, min_count, gfp, true);
+}
+
/*
* Mark the object as not allocated and schedule RCU freeing via put_object().
*/
{
struct kmemleak_object *object;
- object = find_and_remove_object(ptr, 0);
+ object = find_and_remove_object(ptr, 0, false);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Freeing unknown object at 0x%08lx\n",
* delete it. If the memory block is partially freed, the function may create
* additional metadata for the remaining parts of the block.
*/
-static void delete_object_part(unsigned long ptr, size_t size)
+static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
{
struct kmemleak_object *object;
unsigned long start, end;
- object = find_and_remove_object(ptr, 1);
+ object = find_and_remove_object(ptr, 1, is_phys);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
start = object->pointer;
end = object->pointer + object->size;
if (ptr > start)
- create_object(start, ptr - start, object->min_count,
- GFP_KERNEL);
+ __create_object(start, ptr - start, object->min_count,
+ GFP_KERNEL, is_phys);
if (ptr + size < end)
- create_object(ptr + size, end - ptr - size, object->min_count,
- GFP_KERNEL);
+ __create_object(ptr + size, end - ptr - size, object->min_count,
+ GFP_KERNEL, is_phys);
__delete_object(object);
}
raw_spin_unlock_irqrestore(&object->lock, flags);
}
-static void paint_ptr(unsigned long ptr, int color)
+static void paint_ptr(unsigned long ptr, int color, bool is_phys)
{
struct kmemleak_object *object;
- object = find_and_get_object(ptr, 0);
+ object = __find_and_get_object(ptr, 0, is_phys);
if (!object) {
kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
ptr,
*/
static void make_gray_object(unsigned long ptr)
{
- paint_ptr(ptr, KMEMLEAK_GREY);
+ paint_ptr(ptr, KMEMLEAK_GREY, false);
}
/*
* Mark the object as black-colored so that it is ignored from scans and
* reporting.
*/
-static void make_black_object(unsigned long ptr)
+static void make_black_object(unsigned long ptr, bool is_phys)
{
- paint_ptr(ptr, KMEMLEAK_BLACK);
+ paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
}
/*
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- delete_object_part((unsigned long)ptr, size);
+ delete_object_part((unsigned long)ptr, size, false);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- make_black_object((unsigned long)ptr);
+ make_black_object((unsigned long)ptr, false);
}
EXPORT_SYMBOL(kmemleak_ignore);
* address argument
* @phys: physical address of the object
* @size: size of the object
- * @min_count: minimum number of references to this object.
- * See kmemleak_alloc()
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*/
-void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
- gfp_t gfp)
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_alloc(__va(phys), size, min_count, gfp);
+ pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size);
+
+ if (kmemleak_enabled)
+ /*
+ * Create object with OBJECT_PHYS flag and
+ * assume min_count 0.
+ */
+ create_object_phys((unsigned long)phys, size, 0, gfp);
}
EXPORT_SYMBOL(kmemleak_alloc_phys);
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_free_part(__va(phys), size);
-}
-EXPORT_SYMBOL(kmemleak_free_part_phys);
+ pr_debug("%s(0x%pa)\n", __func__, &phys);
-/**
- * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
- * address argument
- * @phys: physical address of the object
- */
-void __ref kmemleak_not_leak_phys(phys_addr_t phys)
-{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_not_leak(__va(phys));
+ if (kmemleak_enabled)
+ delete_object_part((unsigned long)phys, size, true);
}
-EXPORT_SYMBOL(kmemleak_not_leak_phys);
+EXPORT_SYMBOL(kmemleak_free_part_phys);
/**
* kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_ignore(__va(phys));
+ pr_debug("%s(0x%pa)\n", __func__, &phys);
+
+ if (kmemleak_enabled)
+ make_black_object((unsigned long)phys, true);
}
EXPORT_SYMBOL(kmemleak_ignore_phys);
{
u32 old_csum = object->checksum;
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ return false;
+
kasan_disable_current();
kcsan_disable_current();
object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
{
struct kmemleak_scan_area *area;
unsigned long flags;
+ void *obj_ptr;
/*
* Once the object->lock is acquired, the corresponding memory block
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
+
+ obj_ptr = object->flags & OBJECT_PHYS ?
+ __va((phys_addr_t)object->pointer) :
+ (void *)object->pointer;
+
if (hlist_empty(&object->area_list) ||
object->flags & OBJECT_FULL_SCAN) {
- void *start = (void *)object->pointer;
- void *end = (void *)(object->pointer + object->size);
+ void *start = obj_ptr;
+ void *end = obj_ptr + object->size;
void *next;
do {
*/
static void kmemleak_scan(void)
{
- unsigned long flags;
struct kmemleak_object *object;
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
+ int loop1_cnt = 0;
jiffies_last_scan = jiffies;
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ bool obj_pinned = false;
+
+ loop1_cnt++;
+ raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
dump_object_info(object);
}
#endif
+
+ /* ignore objects outside lowmem (paint them black) */
+ if ((object->flags & OBJECT_PHYS) &&
+ !(object->flags & OBJECT_NO_SCAN)) {
+ unsigned long phys = object->pointer;
+
+ if (PHYS_PFN(phys) < min_low_pfn ||
+ PHYS_PFN(phys + object->size) >= max_low_pfn)
+ __paint_it(object, KMEMLEAK_BLACK);
+ }
+
/* reset the reference count (whiten the object) */
object->count = 0;
- if (color_gray(object) && get_object(object))
+ if (color_gray(object) && get_object(object)) {
list_add_tail(&object->gray_list, &gray_list);
+ obj_pinned = true;
+ }
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
+
+ /*
+ * Do a cond_resched() to avoid soft lockup every 64k objects.
+ * Make sure a reference has been taken so that the object
+ * won't go away without RCU read lock.
+ */
+ if (!(loop1_cnt & 0xffff)) {
+ if (!obj_pinned && !get_object(object)) {
+ /* Try the next object instead */
+ loop1_cnt--;
+ continue;
+ }
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+
+ if (!obj_pinned)
+ put_object(object);
+ }
}
rcu_read_unlock();
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ /*
+ * This is racy but we can save the overhead of lock/unlock
+ * calls. The missed objects, if any, should be caught in
+ * the next scan.
+ */
+ if (!color_white(object))
+ continue;
+ raw_spin_lock_irq(&object->lock);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ /*
+ * This is racy but we can save the overhead of lock/unlock
+ * calls. The missed objects, if any, should be caught in
+ * the next scan.
+ */
+ if (!color_white(object))
+ continue;
+ raw_spin_lock_irq(&object->lock);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
new_leaks++;
}
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
static void kmemleak_clear(void)
{
struct kmemleak_object *object;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irq(&object->lock);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
if (!list_lru_memcg_aware(lru))
goto out;
- memcg = mem_cgroup_from_obj(ptr);
+ memcg = mem_cgroup_from_slab_obj(ptr);
if (!memcg)
goto out;
* from the regions with mirroring enabled and then retried from any
* memory region.
*
- * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
- * allocated boot memory block, so that it is never reported as leaks.
+ * In addition, function using kmemleak_alloc_phys for allocated boot
+ * memory block, it is never reported as leaks.
*
* Return:
* Physical address of allocated memory block on success, %0 on failure.
*/
if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
- * The min_count is set to 0 so that memblock allocated
- * blocks are never reported as leaks. This is because many
- * of these blocks are only referred via the physical
- * address which is not looked up by kmemleak.
+ * Memblock allocated blocks are never reported as
+ * leaks. This is because many of these blocks are
+ * only referred via the physical address which is
+ * not looked up by kmemleak.
*/
- kmemleak_alloc_phys(found, size, 0, 0);
+ kmemleak_alloc_phys(found, size, 0);
return found;
}
struct lruvec *lruvec;
rcu_read_lock();
- memcg = mem_cgroup_from_obj(p);
+ memcg = mem_cgroup_from_slab_obj(p);
/*
* Untracked pages have no memcg, no lruvec. Update only the
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGFAULT,
+ PGMAJFAULT,
+ PGREFILL,
+ PGACTIVATE,
+ PGDEACTIVATE,
+ PGLAZYFREE,
+ PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ ZSWPIN,
+ ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+ THP_COLLAPSE_ALLOC,
+#endif
+};
+
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
}
/* Accumulated memory events */
-
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
- memcg_events(memcg, PGFAULT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
- memcg_events(memcg, PGMAJFAULT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
- memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
- memcg_events(memcg, PGACTIVATE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
- memcg_events(memcg, PGDEACTIVATE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
- memcg_events(memcg, PGLAZYFREE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
- memcg_events(memcg, PGLAZYFREED));
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
- memcg_events(memcg, ZSWPIN));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
- memcg_events(memcg, ZSWPOUT));
-#endif
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
- memcg_events(memcg, THP_FAULT_ALLOC));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
- memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
+ seq_buf_printf(&s, "%s %lu\n",
+ vm_event_name(memcg_vm_event_stat[i]),
+ memcg_events(memcg, memcg_vm_event_stat[i]));
/* The above should easily fit into one page */
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
return 0;
}
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object or a generic kernel page, so
- * different mechanisms for getting the memory cgroup pointer should be used.
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
{
- struct folio *folio;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- folio = virt_to_folio(p);
-
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
return page_memcg_check(folio_page(folio, 0));
}
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+ struct folio *folio;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ if (unlikely(is_vmalloc_addr(p)))
+ folio = page_folio(vmalloc_to_page(p));
+ else
+ folio = virt_to_folio(p);
+
+ return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
+
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg = NULL;
balance_dirty_pages_ratelimited(mapping);
if (fpin) {
fput(fpin);
- return VM_FAULT_RETRY;
+ return VM_FAULT_COMPLETED;
}
}
kfree(res);
}
-static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
- const char *reason)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages)
{
/*
* Disallow all operations smaller than a sub-section and only
min_align = PAGES_PER_SUBSECTION;
else
min_align = PAGES_PER_SECTION;
- if (!IS_ALIGNED(pfn, min_align)
- || !IS_ALIGNED(nr_pages, min_align)) {
- WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
- reason, pfn, pfn + nr_pages - 1);
+ if (!IS_ALIGNED(pfn | nr_pages, min_align))
return -EINVAL;
- }
return 0;
}
altmap->alloc = 0;
}
- err = check_pfn_span(pfn, nr_pages, "add");
- if (err)
- return err;
+ if (check_pfn_span(pfn, nr_pages)) {
+ WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
for (; pfn < end_pfn; pfn += cur_nr_pages) {
/* Select all remaining pages up to the next section boundary */
map_offset = vmem_altmap_offset(altmap);
- if (check_pfn_span(pfn, nr_pages, "remove"))
+ if (check_pfn_span(pfn, nr_pages)) {
+ WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
return;
+ }
for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
for (i = 0; i < pgmap->nr_range; i++)
percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
wait_for_completion(&pgmap->done);
- percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
pageunmap_range(pgmap, i);
+ percpu_ref_exit(&pgmap->ref);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
devmap_managed_enable_put(pgmap);
/*
- * Not device managed version of dev_memremap_pages, undone by
- * memunmap_pages(). Please use dev_memremap_pages if you have a struct
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages(). Please use devm_memremap_pages if you have a struct
* device available.
*/
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
*alloc_flags |= ALLOC_CPUSET;
}
- fs_reclaim_acquire(gfp_mask);
- fs_reclaim_release(gfp_mask);
-
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
if (should_fail_alloc_page(gfp_mask, order))
return false;
}
/*
- * Swap in the page pointed to by *pagep.
- * Caller has to make sure that *pagep contains a valid swapped page.
- * Returns 0 and the page in pagep if success. On failure, returns the
- * error code and NULL in *pagep.
+ * Swap in the folio pointed to by *foliop.
+ * Caller has to make sure that *foliop contains a valid swapped folio.
+ * Returns 0 and the folio in foliop if success. On failure, returns the
+ * error code and NULL in *foliop.
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
}
folio = page_folio(page);
- /* We have to do this with page locked to prevent races */
+ /* We have to do this with folio locked to prevent races */
folio_lock(folio);
if (!folio_test_swapcache(folio) ||
folio_swap_entry(folio).val != swap.val ||
return ac->entry[--ac->avail];
}
-static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
- gfp_t flags)
-{
- might_sleep_if(gfpflags_allow_blocking(flags));
-}
-
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
if (unlikely(ptr))
goto out_hooks;
- cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
if (nodeid == NUMA_NO_NODE)
if (unlikely(objp))
goto out;
- cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
if (!s)
return 0;
- cache_alloc_debugcheck_before(s, flags);
-
local_irq_disable();
for (i = 0; i < size; i++) {
void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
unsigned long next;
pgd_t *pgd;
- VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
- VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
pgd = pgd_offset_k(addr);
do {
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
for (addr = start; addr < end; addr += size) {
- unsigned long next = addr, last = addr + size;
+ unsigned long next, last = addr + size;
/* Populate the head page vmemmap page */
pte = vmemmap_populate_address(addr, node, NULL, NULL);
return atomic_long_read(&nr_vmalloc_pages);
}
+/* Look up the first VA which satisfies addr < va_end, NULL if none. */
static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
{
struct vmap_area *va = NULL;
* Trigger the BUG() if there are sides(left/right)
* or full overlaps.
*/
- if (va->va_start < tmp_va->va_end &&
- va->va_end <= tmp_va->va_start)
+ if (va->va_end <= tmp_va->va_start)
link = &(*link)->rb_left;
- else if (va->va_end > tmp_va->va_start &&
- va->va_start >= tmp_va->va_end)
+ else if (va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
* its current size before calling rb_insert_augmented().
- * It is because of we populate the tree from the bottom
+ * It is because we populate the tree from the bottom
* to parent levels when the node _is_ in the tree.
*
* Therefore we set subtree_max_size to zero after insertion,
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
- unsigned long nva_start_addr, unsigned long size,
- enum fit_type type)
+ unsigned long nva_start_addr, unsigned long size)
{
struct vmap_area *lva = NULL;
+ enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
if (type == FL_FIT_TYPE) {
/*
bool adjust_search_size = true;
unsigned long nva_start_addr;
struct vmap_area *va;
- enum fit_type type;
int ret;
/*
if (nva_start_addr + size > vend)
return vend;
- /* Classify what we have found. */
- type = classify_va_fit_type(va, nva_start_addr, size);
- if (WARN_ON_ONCE(type == NOTHING_FIT))
- return vend;
-
/* Update the free vmap_area. */
- ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
- if (ret)
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size);
+ if (WARN_ON_ONCE(ret))
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
/*
* Serialize vmap purging. There is no actual critical section protected
- * by this look, but we want to avoid concurrent calls for performance
+ * by this lock, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
static DEFINE_MUTEX(vmap_purge_lock);
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
- struct list_head local_pure_list;
+ struct list_head local_purge_list;
struct vmap_area *va, *n_va;
lockdep_assert_held(&vmap_purge_lock);
spin_lock(&purge_vmap_area_lock);
purge_vmap_area_root = RB_ROOT;
- list_replace_init(&purge_vmap_area_list, &local_pure_list);
+ list_replace_init(&purge_vmap_area_list, &local_purge_list);
spin_unlock(&purge_vmap_area_lock);
- if (unlikely(list_empty(&local_pure_list)))
+ if (unlikely(list_empty(&local_purge_list)))
return false;
start = min(start,
- list_first_entry(&local_pure_list,
+ list_first_entry(&local_purge_list,
struct vmap_area, list)->va_start);
end = max(end,
- list_last_entry(&local_pure_list,
+ list_last_entry(&local_purge_list,
struct vmap_area, list)->va_end);
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
spin_lock(&free_vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
+ list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
int area, area2, last_area, term_area;
unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
- enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- type = classify_va_fit_type(va, start, size);
- if (WARN_ON_ONCE(type == NOTHING_FIT))
+ ret = adjust_va_to_fit_type(va, start, size);
+ if (WARN_ON_ONCE(unlikely(ret)))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- ret = adjust_va_to_fit_type(va, start, size, type);
- if (unlikely(ret))
- goto recovery;
-
/* Allocated area. */
va = vas[area];
va->va_start = start;
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/uidgid.h>
#include <linux/cookie.h>
* setup_net() and cleanup_net() are not possible.
*/
for_each_net(net) {
+ struct mem_cgroup *old, *memcg;
+
+ memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net));
+ old = set_active_memcg(memcg);
error = ops_init(ops, net);
+ set_active_memcg(old);
+ mem_cgroup_put(memcg);
if (error)
goto out_undo;
list_add_tail(&net->exit_list, &net_exit_list);
}
static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
- int min_count, gfp_t gfp)
+ gfp_t gfp)
{
}
/*
* Be strict and immediately zap area_src, the whole area has
* been transferred already by the background treads. The
- * area_src could then be faulted in in a racy way by still
+ * area_src could then be faulted in a racy way by still
* running uffdio_threads reading zeropages after we zapped
* area_src (but they're guaranteed to get -EEXIST from
* UFFDIO_COPY without writing zero pages into area_dst
"-n|--numa Show NUMA information\n"
"-N|--lines=K Show the first K slabs\n"
"-o|--ops Show kmem_cache_ops\n"
- "-P|--partial Sort by number of partial slabs\n"
+ "-P|--partial Sort by number of partial slabs\n"
"-r|--report Detailed report on single slabs\n"
"-s|--shrink Shrink slabs\n"
"-S|--Size Sort by size\n"
for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
int result;
- if (sort_size)
- result = slab_size(s1) < slab_size(s2);
- else if (sort_active)
- result = slab_activity(s1) < slab_activity(s2);
- else if (sort_loss)
- result = slab_waste(s1) < slab_waste(s2);
- else if (sort_partial)
- result = s1->partial < s2->partial;
- else
+ if (sort_size) {
+ if (slab_size(s1) == slab_size(s2))
+ result = strcasecmp(s1->name, s2->name);
+ else
+ result = slab_size(s1) < slab_size(s2);
+ } else if (sort_active) {
+ if (slab_activity(s1) == slab_activity(s2))
+ result = strcasecmp(s1->name, s2->name);
+ else
+ result = slab_activity(s1) < slab_activity(s2);
+ } else if (sort_loss) {
+ if (slab_waste(s1) == slab_waste(s2))
+ result = strcasecmp(s1->name, s2->name);
+ else
+ result = slab_waste(s1) < slab_waste(s2);
+ } else if (sort_partial) {
+ if (s1->partial == s2->partial)
+ result = strcasecmp(s1->name, s2->name);
+ else
+ result = s1->partial < s2->partial;
+ } else
result = strcasecmp(s1->name, s2->name);
if (show_inverted)