Merge branch 'akpm' (patches from Andrew)

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 30 Jun 2021 00:29:11 +0000 (17:29 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 30 Jun 2021 00:29:11 +0000 (17:29 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 30 Jun 2021 00:29:11 +0000 (17:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 30 Jun 2021 00:29:11 +0000 (17:29 -0700)
diff --combined Documentation/admin-guide/kernel-parameters.txt

index fed6c30,566c4b9..2991f6e
--- 1/Documentation/admin-guide/kernel-parameters.txt
--- 2/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@@ -113,7 -113,7 +113,7 @@@
                         the GPE dispatcher.
                         This facility can be used to prevent such uncontrolled
                         GPE floodings.
- -                      Format: <byte>
+ +                      Format: <byte> or <bitmap-list>
   
         acpi_no_auto_serialize  [HW,ACPI]
                         Disable auto-serialization of AML methods
@@@ -581,28 -581,6 +581,28 @@@
                         loops can be debugged more effectively on production
                         systems.
   
+ +      clocksource.max_cswd_read_retries= [KNL]
+ +                      Number of clocksource_watchdog() retries due to
+ +                      external delays before the clock will be marked
+ +                      unstable.  Defaults to three retries, that is,
+ +                      four attempts to read the clock under test.
+ +
+ +      clocksource.verify_n_cpus= [KNL]
+ +                      Limit the number of CPUs checked for clocksources
+ +                      marked with CLOCK_SOURCE_VERIFY_PERCPU that
+ +                      are marked unstable due to excessive skew.
+ +                      A negative value says to check all CPUs, while
+ +                      zero says not to check any.  Values larger than
+ +                      nr_cpu_ids are silently truncated to nr_cpu_ids.
+ +                      The actual CPUs are chosen randomly, with
+ +                      no replacement if the same CPU is chosen twice.
+ +
+ +      clocksource-wdtest.holdoff= [KNL]
+ +                      Set the time in seconds that the clocksource
+ +                      watchdog test waits before commencing its tests.
+ +                      Defaults to zero when built as a module and to
+ +                      10 seconds when built into the kernel.
+ +
         clearcpuid=BITNUM[,BITNUM...] [X86]
                         Disable CPUID feature X for the kernel. See
                         arch/x86/include/asm/cpufeatures.h for the valid bit
@@@ -3266,7 -3244,7 +3266,7 @@@
   
         noclflush       [BUGS=X86] Don't use the CLFLUSH instruction
   
- -      nodelayacct     [KNL] Disable per-task delay accounting
+ +      delayacct       [KNL] Enable per-task delay accounting
   
         nodsp           [SH] Disable hardware DSP at boot time.
   
@@@ -3535,9 -3513,6 +3535,9 @@@
   
         nr_uarts=       [SERIAL] maximum number of UARTs to be registered.
   
+ +      numa=off        [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA. Only
+ +                      set up a single NUMA node spanning all memory.
+ +
         numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
                         NUMA balancing.
                         Allowed values are enable and disable
@@@ -3591,6 -3566,12 +3591,12 @@@
                         off: turn off poisoning (default)
                         on: turn on poisoning
   
+       page_reporting.page_reporting_order=
+                       [KNL] Minimal page reporting order
+                       Format: <integer>
+                       Adjust the minimal page reporting order. The page
+                       reporting is disabled when it exceeds (MAX_ORDER-1).
+ 
         panic=          [KNL] Kernel behaviour on panic: delay <timeout>
                         timeout > 0: seconds before rebooting
                         timeout = 0: wait forever
@@@ -4800,6 -4781,11 +4806,6 @@@
                         Reserves a hole at the top of the kernel virtual
                         address space.
   
- -      reservelow=     [X86]
- -                      Format: nn[K]
- -                      Set the amount of memory to reserve for BIOS at
- -                      the bottom of the address space.
- -
         reset_devices   [KNL] Force drivers to reset the underlying device
                         during initialization.
   
@@@ -5303,14 -5289,6 +5309,14 @@@
                                   exception. Default behavior is by #AC if
                                   both features are enabled in hardware.
   
+ +                      ratelimit:N -
+ +                                Set system wide rate limit to N bus locks
+ +                                per second for bus lock detection.
+ +                                0 < N <= 1000.
+ +
+ +                                N/A for split lock detection.
+ +
+ +
                         If an #AC exception is hit in the kernel or in
                         firmware (i.e. not while executing in user mode)
                         the kernel will oops in either "warn" or "fatal"
diff --combined Documentation/admin-guide/sysctl/kernel.rst

index 10dd4b1,04c7901..4261620
--- 1/Documentation/admin-guide/sysctl/kernel.rst
--- 2/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@@ -9,8 -9,7 +9,8 @@@ Copyright (c) 1998, 1999,  Rik van Rie
   
   Copyright (c) 2009,        Shen Feng<shen@cn.fujitsu.com>
   
- -For general info and legal blurb, please look in :doc:`index`.
+ +For general info and legal blurb, please look in
+ +Documentation/admin-guide/sysctl/index.rst.
   
   ------------------------------------------------------------------------------
   
@@@ -55,7 -54,7 +55,7 @@@ free space valid for 30 seconds
   acpi_video_flags
   ================
   
- -See :doc:`/power/video`. This allows the video resume mode to be set,
+ +See Documentation/power/video.rst. This allows the video resume mode to be set,
   in a similar fashion to the ``acpi_sleep`` kernel parameter, by
   combining the following values:
   
@@@ -90,7 -89,7 +90,7 @@@ is 0x15 and the full version number is 
   the value 340 = 0x154.
   
   See the ``type_of_loader`` and ``ext_loader_type`` fields in
- -:doc:`/x86/boot` for additional information.
+ +Documentation/x86/boot.rst for additional information.
   
   
   bootloader_version (x86 only)
@@@ -100,7 -99,7 +100,7 @@@ The complete bootloader version number
   file will contain the value 564 = 0x234.
   
   See the ``type_of_loader`` and ``ext_loader_ver`` fields in
- -:doc:`/x86/boot` for additional information.
+ +Documentation/x86/boot.rst for additional information.
   
   
   bpf_stats_enabled
@@@ -270,7 -269,7 +270,7 @@@ see the ``hostname(1)`` man page
   firmware_config
   ===============
   
- -See :doc:`/driver-api/firmware/fallback-mechanisms`.
+ +See Documentation/driver-api/firmware/fallback-mechanisms.rst.
   
   The entries in this directory allow the firmware loader helper
   fallback to be controlled:
@@@ -298,7 -297,7 +298,7 @@@ crashes and outputting them to a seria
   ftrace_enabled, stack_tracer_enabled
   ====================================
   
- -See :doc:`/trace/ftrace`.
+ +See Documentation/trace/ftrace.rst.
   
   
   hardlockup_all_cpu_backtrace
@@@ -326,7 -325,7 +326,7 @@@ when a hard lockup is detected
   1 Panic on hard lockup.
   = ===========================
   
- -See :doc:`/admin-guide/lockup-watchdogs` for more information.
+ +See Documentation/admin-guide/lockup-watchdogs.rst for more information.
   This can also be set using the nmi_watchdog kernel parameter.
   
   
@@@ -334,12 -333,7 +334,12 @@@ hotplu
   =======
   
   Path for the hotplug policy agent.
- -Default value is "``/sbin/hotplug``".
+ +Default value is ``CONFIG_UEVENT_HELPER_PATH``, which in turn defaults
+ +to the empty string.
+ +
+ +This file only exists when ``CONFIG_UEVENT_HELPER`` is enabled. Most
+ +modern systems rely exclusively on the netlink-based uevent source and
+ +don't need this.
   
   
   hung_task_all_cpu_backtrace
@@@ -588,8 -582,7 +588,8 @@@ in a KVM virtual machine. This default 
   
      nmi_watchdog=1
   
- -to the guest kernel command line (see :doc:`/admin-guide/kernel-parameters`).
+ +to the guest kernel command line (see
+ +Documentation/admin-guide/kernel-parameters.rst).
   
   
   numa_balancing
@@@ -1074,7 -1067,7 +1074,7 @@@ that support this feature
   real-root-dev
   =============
   
- -See :doc:`/admin-guide/initrd`.
+ +See Documentation/admin-guide/initrd.rst.
   
   
   reboot-cmd (SPARC only)
@@@ -1095,13 -1088,6 +1095,13 @@@ Model available). If your platform happ
   requirements for EAS but you do not want to use it, change
   this value to 0.
   
+ +task_delayacct
+ +===============
+ +
+ +Enables/disables task delay accounting (see
+ +Documentation/accounting/delay-accounting.rst). Enabling this feature incurs
+ +a small amount of overhead in the scheduler but is useful for debugging
+ +and performance tuning. It is required by some tools such as iotop.
   
   sched_schedstats
   ================
@@@ -1168,7 -1154,7 +1168,7 @@@ will take effect
   seccomp
   =======
   
- -See :doc:`/userspace-api/seccomp_filter`.
+ +See Documentation/userspace-api/seccomp_filter.rst.
   
   
   sg-big-buff
@@@ -1297,11 -1283,11 +1297,11 @@@ This parameter can be used to control t
   = =================================
   
   The soft lockup detector monitors CPUs for threads that are hogging the CPUs
- without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
- from running. The mechanism depends on the CPUs ability to respond to timer
- interrupts which are needed for the 'watchdog/N' threads to be woken up by
- the watchdog timer function, otherwise the NMI watchdog — if enabled — can
- detect a hard lockup condition.
+ without rescheduling voluntarily, and thus prevent the 'migration/N' threads
+ from running, causing the watchdog work to fail to execute. The mechanism depends
+ on the CPU's ability to respond to timer interrupts which are needed for the
+ watchdog work to be queued by the watchdog timer function, otherwise the NMI
+ watchdog — if enabled — can detect a hard lockup condition.
   
   
   stack_erasing
@@@ -1339,7 -1325,7 +1339,7 @@@ the boot PROM
   sysrq
   =====
   
- -See :doc:`/admin-guide/sysrq`.
+ +See Documentation/admin-guide/sysrq.rst.
   
   
   tainted
@@@ -1369,16 -1355,15 +1369,16 @@@ ORed together. The letters are seen in 
   131072  `(T)`  The kernel was built with the struct randomization plugin
   ======  =====  ==============================================================
   
- -See :doc:`/admin-guide/tainted-kernels` for more information.
+ +See Documentation/admin-guide/tainted-kernels.rst for more information.
   
   Note:
     writes to this sysctl interface will fail with ``EINVAL`` if the kernel is
     booted with the command line option ``panic_on_taint=<bitmask>,nousertaint``
     and any of the ORed together values being written to ``tainted`` match with
     the bitmask declared on panic_on_taint.
- -  See :doc:`/admin-guide/kernel-parameters` for more details on that particular
- -  kernel command line option and its optional ``nousertaint`` switch.
+ +  See Documentation/admin-guide/kernel-parameters.rst for more details on
+ +  that particular kernel command line option and its optional
+ +  ``nousertaint`` switch.
   
   threads-max
   ===========
@@@ -1402,7 -1387,7 +1402,7 @@@ If a value outside of this range is wri
   traceoff_on_warning
   ===================
   
- -When set, disables tracing (see :doc:`/trace/ftrace`) when a
+ +When set, disables tracing (see Documentation/trace/ftrace.rst) when a
   ``WARN()`` is hit.
   
   
@@@ -1422,8 -1407,8 +1422,8 @@@ will send them to printk() again
   
   This only works if the kernel was booted with ``tp_printk`` enabled.
   
- -See :doc:`/admin-guide/kernel-parameters` and
- -:doc:`/trace/boottime-trace`.
+ +See Documentation/admin-guide/kernel-parameters.rst and
+ +Documentation/trace/boottime-trace.rst.
   
   
   .. _unaligned-dump-stack:
diff --combined arch/arm64/Kconfig

index dabe9b8,d01a154..a6a09cb
--- 1/arch/arm64/Kconfig
--- 2/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@@ -1035,7 -1035,7 +1035,7 @@@ config NODES_SHIF
         int "Maximum NUMA Nodes (as a power of 2)"
         range 1 10
         default "4"
-       depends on NEED_MULTIPLE_NODES
+       depends on NUMA
         help
           Specify the maximum number of NUMA Nodes available on the target
           system.  Increases memory reserved to accommodate various tables.
@@@ -1481,6 -1481,12 +1481,6 @@@ menu "ARMv8.3 architectural features
   config ARM64_PTR_AUTH
         bool "Enable support for pointer authentication"
         default y
- -      depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
- -      # Modern compilers insert a .note.gnu.property section note for PAC
- -      # which is only understood by binutils starting with version 2.33.1.
- -      depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
- -      depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
- -      depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
         help
           Pointer authentication (part of the ARMv8.3 Extensions) provides
           instructions for signing and authenticating pointers against secret
@@@ -1492,6 -1498,13 +1492,6 @@@
           for each process at exec() time, with these keys being
           context-switched along with the process.
   
- -        If the compiler supports the -mbranch-protection or
- -        -msign-return-address flag (e.g. GCC 7 or later), then this option
- -        will also cause the kernel itself to be compiled with return address
- -        protection. In this case, and if the target hardware is known to
- -        support pointer authentication, then CONFIG_STACKPROTECTOR can be
- -        disabled with minimal loss of protection.
- -
           The feature is detected at runtime. If the feature is not present in
           hardware it will not be advertised to userspace/KVM guest nor will it
           be enabled.
@@@ -1502,24 -1515,6 +1502,24 @@@
           but with the feature disabled. On such a system, this option should
           not be selected.
   
+ +config ARM64_PTR_AUTH_KERNEL
+ +      bool "Use pointer authentication for kernel"
+ +      default y
+ +      depends on ARM64_PTR_AUTH
+ +      depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
+ +      # Modern compilers insert a .note.gnu.property section note for PAC
+ +      # which is only understood by binutils starting with version 2.33.1.
+ +      depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
+ +      depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
+ +      depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
+ +      help
+ +        If the compiler supports the -mbranch-protection or
+ +        -msign-return-address flag (e.g. GCC 7 or later), then this option
+ +        will cause the kernel itself to be compiled with return address
+ +        protection. In this case, and if the target hardware is known to
+ +        support pointer authentication, then CONFIG_STACKPROTECTOR can be
+ +        disabled with minimal loss of protection.
+ +
           This feature works with FUNCTION_GRAPH_TRACER option only if
           DYNAMIC_FTRACE_WITH_REGS is enabled.
   
@@@ -1611,7 -1606,7 +1611,7 @@@ config ARM64_BTI_KERNE
         bool "Use Branch Target Identification for kernel"
         default y
         depends on ARM64_BTI
- -      depends on ARM64_PTR_AUTH
+ +      depends on ARM64_PTR_AUTH_KERNEL
         depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI
         # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697
         depends on !CC_IS_GCC || GCC_VERSION >= 100100
diff --combined arch/arm64/kvm/mmu.c

index 57292dc,74b3c1a..f23dfa0
--- 1/arch/arm64/kvm/mmu.c
--- 2/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@@ -126,16 -126,6 +126,16 @@@ static void *kvm_host_va(phys_addr_t ph
         return __va(phys);
   }
   
+ +static void clean_dcache_guest_page(void *va, size_t size)
+ +{
+ +      __clean_dcache_guest_page(va, size);
+ +}
+ +
+ +static void invalidate_icache_guest_page(void *va, size_t size)
+ +{
+ +      __invalidate_icache_guest_page(va, size);
+ +}
+ +
   /*
    * Unmapping vs dcache management:
    *
@@@ -442,8 -432,6 +442,8 @@@ static struct kvm_pgtable_mm_ops kvm_s2
         .page_count             = kvm_host_page_count,
         .phys_to_virt           = kvm_host_va,
         .virt_to_phys           = kvm_host_pa,
+ +      .dcache_clean_inval_poc = clean_dcache_guest_page,
+ +      .icache_inval_pou       = invalidate_icache_guest_page,
   };
   
   /**
@@@ -705,6 -693,16 +705,6 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
         kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
   }
   
- -static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
- -{
- -      __clean_dcache_guest_page(pfn, size);
- -}
- -
- -static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
- -{
- -      __invalidate_icache_guest_page(pfn, size);
- -}
- -
   static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
   {
         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
@@@ -824,74 -822,6 +824,74 @@@ transparent_hugepage_adjust(struct kvm_
         return PAGE_SIZE;
   }
   
+ +static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
+ +{
+ +      unsigned long pa;
+ +
+ +      if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
+ +              return huge_page_shift(hstate_vma(vma));
+ +
+ +      if (!(vma->vm_flags & VM_PFNMAP))
+ +              return PAGE_SHIFT;
+ +
+ +      VM_BUG_ON(is_vm_hugetlb_page(vma));
+ +
+ +      pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
+ +
+ +#ifndef __PAGETABLE_PMD_FOLDED
+ +      if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
+ +          ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
+ +          ALIGN(hva, PUD_SIZE) <= vma->vm_end)
+ +              return PUD_SHIFT;
+ +#endif
+ +
+ +      if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
+ +          ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
+ +          ALIGN(hva, PMD_SIZE) <= vma->vm_end)
+ +              return PMD_SHIFT;
+ +
+ +      return PAGE_SHIFT;
+ +}
+ +
+ +/*
+ + * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
+ + * able to see the page's tags and therefore they must be initialised first. If
+ + * PG_mte_tagged is set, tags have already been initialised.
+ + *
+ + * The race in the test/set of the PG_mte_tagged flag is handled by:
+ + * - preventing VM_SHARED mappings in a memslot with MTE, which stops two VMs
+ + *   racing to sanitise the same page
+ + * - mmap_lock protects between a VM faulting a page in and the VMM performing
+ + *   an mprotect() to add VM_MTE
+ + */
+ +static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+ +                           unsigned long size)
+ +{
+ +      unsigned long i, nr_pages = size >> PAGE_SHIFT;
+ +      struct page *page;
+ +
+ +      if (!kvm_has_mte(kvm))
+ +              return 0;
+ +
+ +      /*
+ +       * pfn_to_online_page() is used to reject ZONE_DEVICE pages
+ +       * that may not support tags.
+ +       */
+ +      page = pfn_to_online_page(pfn);
+ +
+ +      if (!page)
+ +              return -EFAULT;
+ +
+ +      for (i = 0; i < nr_pages; i++, page++) {
+ +              if (!test_bit(PG_mte_tagged, &page->flags)) {
+ +                      mte_clear_page_tags(page_address(page));
+ +                      set_bit(PG_mte_tagged, &page->flags);
+ +              }
+ +      }
+ +
+ +      return 0;
+ +}
+ +
   static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                           struct kvm_memory_slot *memslot, unsigned long hva,
                           unsigned long fault_status)
@@@ -900,7 -830,6 +900,7 @@@
         bool write_fault, writable, force_pte = false;
         bool exec_fault;
         bool device = false;
+ +      bool shared;
         unsigned long mmu_seq;
         struct kvm *kvm = vcpu->kvm;
         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@@ -924,31 -853,26 +924,31 @@@
                 return -EFAULT;
         }
   
- -      /* Let's check if we will get back a huge page backed by hugetlbfs */
+ +      /*
+ +       * Let's check if we will get back a huge page backed by hugetlbfs, or
+ +       * get a block mapping for a device MMIO region.
+ +       */
         mmap_read_lock(current->mm);
-       vma = find_vma_intersection(current->mm, hva, hva + 1);
+       vma = vma_lookup(current->mm, hva);
         if (unlikely(!vma)) {
                 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
                 mmap_read_unlock(current->mm);
                 return -EFAULT;
         }
   
- -      if (is_vm_hugetlb_page(vma))
- -              vma_shift = huge_page_shift(hstate_vma(vma));
- -      else
- -              vma_shift = PAGE_SHIFT;
- -
- -      if (logging_active ||
- -          (vma->vm_flags & VM_PFNMAP)) {
+ +      /*
+ +       * logging_active is guaranteed to never be true for VM_PFNMAP
+ +       * memslots.
+ +       */
+ +      if (logging_active) {
                 force_pte = true;
                 vma_shift = PAGE_SHIFT;
+ +      } else {
+ +              vma_shift = get_vma_page_shift(vma, hva);
         }
   
+ +      shared = (vma->vm_flags & VM_PFNMAP);
+ +
         switch (vma_shift) {
   #ifndef __PAGETABLE_PMD_FOLDED
         case PUD_SHIFT:
@@@ -1019,17 -943,8 +1019,17 @@@
                 return -EFAULT;
   
         if (kvm_is_device_pfn(pfn)) {
+ +              /*
+ +               * If the page was identified as device early by looking at
+ +               * the VMA flags, vma_pagesize is already representing the
+ +               * largest quantity we can map.  If instead it was mapped
+ +               * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+ +               * and must not be upgraded.
+ +               *
+ +               * In both cases, we don't let transparent_hugepage_adjust()
+ +               * change things at the last minute.
+ +               */
                 device = true;
- -              force_pte = true;
         } else if (logging_active && !write_fault) {
                 /*
                  * Only actually map the page as writable if this was a write
@@@ -1050,25 -965,19 +1050,25 @@@
          * If we are not forced to use page mapping, check if we are
          * backed by a THP and thus use block mapping if possible.
          */
- -      if (vma_pagesize == PAGE_SIZE && !force_pte)
+ +      if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
                 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
                                                            &pfn, &fault_ipa);
+ +
+ +      if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
+ +              /* Check the VMM hasn't introduced a new VM_SHARED VMA */
+ +              if (!shared)
+ +                      ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ +              else
+ +                      ret = -EFAULT;
+ +              if (ret)
+ +                      goto out_unlock;
+ +      }
+ +
         if (writable)
                 prot |= KVM_PGTABLE_PROT_W;
   
- -      if (fault_status != FSC_PERM && !device)
- -              clean_dcache_guest_page(pfn, vma_pagesize);
- -
- -      if (exec_fault) {
+ +      if (exec_fault)
                 prot |= KVM_PGTABLE_PROT_X;
- -              invalidate_icache_guest_page(pfn, vma_pagesize);
- -      }
   
         if (device)
                 prot |= KVM_PGTABLE_PROT_DEVICE;
@@@ -1259,22 -1168,19 +1259,22 @@@ bool kvm_unmap_gfn_range(struct kvm *kv
   bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   {
         kvm_pfn_t pfn = pte_pfn(range->pte);
+ +      int ret;
   
         if (!kvm->arch.mmu.pgt)
                 return false;
   
         WARN_ON(range->end - range->start != 1);
   
- -      /*
- -       * We've moved a page around, probably through CoW, so let's treat it
- -       * just like a translation fault and clean the cache to the PoC.
- -       */
- -      clean_dcache_guest_page(pfn, PAGE_SIZE);
+ +      ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
+ +      if (ret)
+ +              return false;
   
         /*
+ +       * We've moved a page around, probably through CoW, so let's treat
+ +       * it just like a translation fault and the map handler will clean
+ +       * the cache to the PoC.
+ +       *
          * The MMU notifiers will have unmapped a huge PMD before calling
          * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
          * therefore we never need to clear out a huge PMD through this
@@@ -1440,6 -1346,7 +1440,6 @@@ int kvm_arch_prepare_memory_region(stru
   {
         hva_t hva = mem->userspace_addr;
         hva_t reg_end = hva + mem->memory_size;
- -      bool writable = !(mem->flags & KVM_MEM_READONLY);
         int ret = 0;
   
         if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
@@@ -1456,7 -1363,8 +1456,7 @@@
         mmap_read_lock(current->mm);
         /*
          * A memory region could potentially cover multiple VMAs, and any holes
- -       * between them, so iterate over all of them to find out if we can map
- -       * any of them right now.
+ +       * between them, so iterate over all of them.
          *
          *     +--------------------------------------------+
          * +---------------+----------------+   +----------------+
@@@ -1467,29 -1375,51 +1467,29 @@@
          */
         do {
                 struct vm_area_struct *vma;
- -              hva_t vm_start, vm_end;
   
                 vma = find_vma_intersection(current->mm, hva, reg_end);
                 if (!vma)
                         break;
   
                 /*
- -               * Take the intersection of this VMA with the memory region
+ +               * VM_SHARED mappings are not allowed with MTE to avoid races
+ +               * when updating the PG_mte_tagged page flag, see
+ +               * sanitise_mte_tags for more details.
                  */
- -              vm_start = max(hva, vma->vm_start);
- -              vm_end = min(reg_end, vma->vm_end);
+ +              if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED)
+ +                      return -EINVAL;
   
                 if (vma->vm_flags & VM_PFNMAP) {
- -                      gpa_t gpa = mem->guest_phys_addr +
- -                                  (vm_start - mem->userspace_addr);
- -                      phys_addr_t pa;
- -
- -                      pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
- -                      pa += vm_start - vma->vm_start;
- -
                         /* IO region dirty page logging not allowed */
                         if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                                 ret = -EINVAL;
- -                              goto out;
- -                      }
- -
- -                      ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
- -                                                  vm_end - vm_start,
- -                                                  writable);
- -                      if (ret)
                                 break;
+ +                      }
                 }
- -              hva = vm_end;
+ +              hva = min(reg_end, vma->vm_end);
         } while (hva < reg_end);
   
- -      if (change == KVM_MR_FLAGS_ONLY)
- -              goto out;
- -
- -      spin_lock(&kvm->mmu_lock);
- -      if (ret)
- -              unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
- -      else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
- -              stage2_flush_memslot(kvm, memslot);
- -      spin_unlock(&kvm->mmu_lock);
- -out:
         mmap_read_unlock(current->mm);
         return ret;
   }
diff --combined arch/powerpc/kernel/smp.c

index 6c6e4d9,a5209ea..7ddc2d3
--- 1/arch/powerpc/kernel/smp.c
--- 2/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@@ -1047,7 -1047,7 +1047,7 @@@ void __init smp_prepare_cpus(unsigned i
                         zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
                                                 GFP_KERNEL, cpu_to_node(cpu));
   
- #ifdef CONFIG_NEED_MULTIPLE_NODES
+ #ifdef CONFIG_NUMA
                 /*
                  * numa_node_id() works after this.
                  */
@@@ -1547,6 -1547,7 +1547,6 @@@ void start_secondary(void *unused
         smp_store_cpu_info(cpu);
         set_dec(tb_ticks_per_jiffy);
         rcu_cpu_starting(cpu);
- -      preempt_disable();
         cpu_callin_map[cpu] = 1;
   
         if (smp_ops->setup_cpu)
diff --combined arch/powerpc/kvm/book3s_hv.c

index cd544a4,fb83c84..260e860
--- 1/arch/powerpc/kvm/book3s_hv.c
--- 2/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@@ -76,7 -76,6 +76,7 @@@
   #include <asm/kvm_book3s_uvmem.h>
   #include <asm/ultravisor.h>
   #include <asm/dtl.h>
+ +#include <asm/plpar_wrappers.h>
   
   #include "book3s.h"
   
@@@ -104,9 -103,13 +104,9 @@@ static int target_smt_mode
   module_param(target_smt_mode, int, 0644);
   MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
   
- -static bool indep_threads_mode = true;
- -module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
- -MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
- -
   static bool one_vm_per_core;
   module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
- -MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+ +MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
   
   #ifdef CONFIG_KVM_XICS
   static const struct kernel_param_ops module_param_ops = {
@@@ -131,6 -134,9 +131,6 @@@ static inline bool nesting_enabled(stru
         return kvm->arch.nested_enable && kvm_is_radix(kvm);
   }
   
- -/* If set, the threads on each CPU core have to be in the same MMU mode */
- -static bool no_mixing_hpt_and_radix __read_mostly;
- -
   static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
   
   /*
@@@ -230,7 -236,7 +230,7 @@@ static void kvmppc_fast_vcpu_kick_hv(st
   
         waitp = kvm_arch_vcpu_get_wait(vcpu);
         if (rcuwait_wake_up(waitp))
- -              ++vcpu->stat.halt_wakeup;
+ +              ++vcpu->stat.generic.halt_wakeup;
   
         cpu = READ_ONCE(vcpu->arch.thread_cpu);
         if (cpu >= 0 && kvmppc_ipi_thread(cpu))
@@@ -801,8 -807,7 +801,8 @@@ static int kvmppc_h_set_mode(struct kvm
                  * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
                  * Keep this in synch with kvmppc_filter_guest_lpcr_hv.
                  */
- -              if (mflags != 0 && mflags != 3)
+ +              if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
+ +                              kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
                         return H_UNSUPPORTED_FLAG_START;
                 return H_TOO_HARD;
         default:
@@@ -894,10 -899,6 +894,10 @@@ static int kvm_arch_vcpu_yield_to(struc
          * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
          * have useful work to do and should not confer) so we don't
          * recheck that here.
+ +       *
+ +       * In the P9 single vcpu per vcore case, the real
+ +       * mode handler is not called but no other threads are in the
+ +       * source vcore.
          */
   
         spin_lock(&vcore->lock);
@@@ -923,71 -924,8 +923,71 @@@ static int kvmppc_get_yield_count(struc
         return yield_count;
   }
   
+ +/*
+ + * H_RPT_INVALIDATE hcall handler for nested guests.
+ + *
+ + * Handles only nested process-scoped invalidation requests in L0.
+ + */
+ +static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
+ +{
+ +      unsigned long type = kvmppc_get_gpr(vcpu, 6);
+ +      unsigned long pid, pg_sizes, start, end;
+ +
+ +      /*
+ +       * The partition-scoped invalidations aren't handled here in L0.
+ +       */
+ +      if (type & H_RPTI_TYPE_NESTED)
+ +              return RESUME_HOST;
+ +
+ +      pid = kvmppc_get_gpr(vcpu, 4);
+ +      pg_sizes = kvmppc_get_gpr(vcpu, 7);
+ +      start = kvmppc_get_gpr(vcpu, 8);
+ +      end = kvmppc_get_gpr(vcpu, 9);
+ +
+ +      do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
+ +                              type, pg_sizes, start, end);
+ +
+ +      kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+ +      return RESUME_GUEST;
+ +}
+ +
+ +static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
+ +                                  unsigned long id, unsigned long target,
+ +                                  unsigned long type, unsigned long pg_sizes,
+ +                                  unsigned long start, unsigned long end)
+ +{
+ +      if (!kvm_is_radix(vcpu->kvm))
+ +              return H_UNSUPPORTED;
+ +
+ +      if (end < start)
+ +              return H_P5;
+ +
+ +      /*
+ +       * Partition-scoped invalidation for nested guests.
+ +       */
+ +      if (type & H_RPTI_TYPE_NESTED) {
+ +              if (!nesting_enabled(vcpu->kvm))
+ +                      return H_FUNCTION;
+ +
+ +              /* Support only cores as target */
+ +              if (target != H_RPTI_TARGET_CMMU)
+ +                      return H_P2;
+ +
+ +              return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
+ +                                             start, end);
+ +      }
+ +
+ +      /*
+ +       * Process-scoped invalidation for L1 guests.
+ +       */
+ +      do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
+ +                              type, pg_sizes, start, end);
+ +      return H_SUCCESS;
+ +}
+ +
   int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
   {
+ +      struct kvm *kvm = vcpu->kvm;
         unsigned long req = kvmppc_get_gpr(vcpu, 3);
         unsigned long target, ret = H_SUCCESS;
         int yield_count;
@@@ -999,57 -937,11 +999,57 @@@
                 return RESUME_HOST;
   
         switch (req) {
+ +      case H_REMOVE:
+ +              ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                      kvmppc_get_gpr(vcpu, 5),
+ +                                      kvmppc_get_gpr(vcpu, 6));
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +      case H_ENTER:
+ +              ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                      kvmppc_get_gpr(vcpu, 5),
+ +                                      kvmppc_get_gpr(vcpu, 6),
+ +                                      kvmppc_get_gpr(vcpu, 7));
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +      case H_READ:
+ +              ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                      kvmppc_get_gpr(vcpu, 5));
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +      case H_CLEAR_MOD:
+ +              ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                      kvmppc_get_gpr(vcpu, 5));
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +      case H_CLEAR_REF:
+ +              ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                      kvmppc_get_gpr(vcpu, 5));
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +      case H_PROTECT:
+ +              ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                      kvmppc_get_gpr(vcpu, 5),
+ +                                      kvmppc_get_gpr(vcpu, 6));
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +      case H_BULK_REMOVE:
+ +              ret = kvmppc_h_bulk_remove(vcpu);
+ +              if (ret == H_TOO_HARD)
+ +                      return RESUME_HOST;
+ +              break;
+ +
         case H_CEDE:
                 break;
         case H_PROD:
                 target = kvmppc_get_gpr(vcpu, 4);
- -              tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+ +              tvcpu = kvmppc_find_vcpu(kvm, target);
                 if (!tvcpu) {
                         ret = H_PARAMETER;
                         break;
@@@ -1063,7 -955,7 +1063,7 @@@
                 target = kvmppc_get_gpr(vcpu, 4);
                 if (target == -1)
                         break;
- -              tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+ +              tvcpu = kvmppc_find_vcpu(kvm, target);
                 if (!tvcpu) {
                         ret = H_PARAMETER;
                         break;
@@@ -1079,12 -971,12 +1079,12 @@@
                                         kvmppc_get_gpr(vcpu, 6));
                 break;
         case H_RTAS:
- -              if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+ +              if (list_empty(&kvm->arch.rtas_tokens))
                         return RESUME_HOST;
   
- -              idx = srcu_read_lock(&vcpu->kvm->srcu);
+ +              idx = srcu_read_lock(&kvm->srcu);
                 rc = kvmppc_rtas_hcall(vcpu);
- -              srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ +              srcu_read_unlock(&kvm->srcu, idx);
   
                 if (rc == -ENOENT)
                         return RESUME_HOST;
@@@ -1168,23 -1060,15 +1168,23 @@@
                 if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
                         ret = H_HARDWARE;
                 break;
+ +      case H_RPT_INVALIDATE:
+ +              ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
+ +                                            kvmppc_get_gpr(vcpu, 5),
+ +                                            kvmppc_get_gpr(vcpu, 6),
+ +                                            kvmppc_get_gpr(vcpu, 7),
+ +                                            kvmppc_get_gpr(vcpu, 8),
+ +                                            kvmppc_get_gpr(vcpu, 9));
+ +              break;
   
         case H_SET_PARTITION_TABLE:
                 ret = H_FUNCTION;
- -              if (nesting_enabled(vcpu->kvm))
+ +              if (nesting_enabled(kvm))
                         ret = kvmhv_set_partition_table(vcpu);
                 break;
         case H_ENTER_NESTED:
                 ret = H_FUNCTION;
- -              if (!nesting_enabled(vcpu->kvm))
+ +              if (!nesting_enabled(kvm))
                         break;
                 ret = kvmhv_enter_nested_guest(vcpu);
                 if (ret == H_INTERRUPT) {
@@@ -1199,12 -1083,12 +1199,12 @@@
                 break;
         case H_TLB_INVALIDATE:
                 ret = H_FUNCTION;
- -              if (nesting_enabled(vcpu->kvm))
+ +              if (nesting_enabled(kvm))
                         ret = kvmhv_do_nested_tlbie(vcpu);
                 break;
         case H_COPY_TOFROM_GUEST:
                 ret = H_FUNCTION;
- -              if (nesting_enabled(vcpu->kvm))
+ +              if (nesting_enabled(kvm))
                         ret = kvmhv_copy_tofrom_guest_nested(vcpu);
                 break;
         case H_PAGE_INIT:
@@@ -1215,7 -1099,7 +1215,7 @@@
         case H_SVM_PAGE_IN:
                 ret = H_UNSUPPORTED;
                 if (kvmppc_get_srr1(vcpu) & MSR_S)
- -                      ret = kvmppc_h_svm_page_in(vcpu->kvm,
+ +                      ret = kvmppc_h_svm_page_in(kvm,
                                                    kvmppc_get_gpr(vcpu, 4),
                                                    kvmppc_get_gpr(vcpu, 5),
                                                    kvmppc_get_gpr(vcpu, 6));
@@@ -1223,7 -1107,7 +1223,7 @@@
         case H_SVM_PAGE_OUT:
                 ret = H_UNSUPPORTED;
                 if (kvmppc_get_srr1(vcpu) & MSR_S)
- -                      ret = kvmppc_h_svm_page_out(vcpu->kvm,
+ +                      ret = kvmppc_h_svm_page_out(kvm,
                                                     kvmppc_get_gpr(vcpu, 4),
                                                     kvmppc_get_gpr(vcpu, 5),
                                                     kvmppc_get_gpr(vcpu, 6));
@@@ -1231,12 -1115,12 +1231,12 @@@
         case H_SVM_INIT_START:
                 ret = H_UNSUPPORTED;
                 if (kvmppc_get_srr1(vcpu) & MSR_S)
- -                      ret = kvmppc_h_svm_init_start(vcpu->kvm);
+ +                      ret = kvmppc_h_svm_init_start(kvm);
                 break;
         case H_SVM_INIT_DONE:
                 ret = H_UNSUPPORTED;
                 if (kvmppc_get_srr1(vcpu) & MSR_S)
- -                      ret = kvmppc_h_svm_init_done(vcpu->kvm);
+ +                      ret = kvmppc_h_svm_init_done(kvm);
                 break;
         case H_SVM_INIT_ABORT:
                 /*
@@@ -1246,26 -1130,24 +1246,26 @@@
                  * Instead the kvm->arch.secure_guest flag is checked inside
                  * kvmppc_h_svm_init_abort().
                  */
- -              ret = kvmppc_h_svm_init_abort(vcpu->kvm);
+ +              ret = kvmppc_h_svm_init_abort(kvm);
                 break;
   
         default:
                 return RESUME_HOST;
         }
+ +      WARN_ON_ONCE(ret == H_TOO_HARD);
         kvmppc_set_gpr(vcpu, 3, ret);
         vcpu->arch.hcall_needed = 0;
         return RESUME_GUEST;
   }
   
   /*
- - * Handle H_CEDE in the nested virtualization case where we haven't
- - * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ + * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
+ + * handlers in book3s_hv_rmhandlers.S.
+ + *
    * This has to be done early, not in kvmppc_pseries_do_hcall(), so
    * that the cede logic in kvmppc_run_single_vcpu() works properly.
    */
- -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+ +static void kvmppc_cede(struct kvm_vcpu *vcpu)
   {
         vcpu->arch.shregs.msr |= MSR_EE;
         vcpu->arch.ceded = 1;
@@@ -1296,7 -1178,6 +1296,7 @@@ static int kvmppc_hcall_impl_hv(unsigne
         case H_XIRR_X:
   #endif
         case H_PAGE_INIT:
+ +      case H_RPT_INVALIDATE:
                 return 1;
         }
   
@@@ -1519,39 -1400,13 +1519,39 @@@ static int kvmppc_handle_exit_hv(struc
         }
         case BOOK3S_INTERRUPT_SYSCALL:
         {
- -              /* hcall - punt to userspace */
                 int i;
   
- -              /* hypercall with MSR_PR has already been handled in rmode,
- -               * and never reaches here.
- -               */
+ +              if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
+ +                      /*
+ +                       * Guest userspace executed sc 1. This can only be
+ +                       * reached by the P9 path because the old path
+ +                       * handles this case in realmode hcall handlers.
+ +                       */
+ +                      if (!kvmhv_vcpu_is_radix(vcpu)) {
+ +                              /*
+ +                               * A guest could be running PR KVM, so this
+ +                               * may be a PR KVM hcall. It must be reflected
+ +                               * to the guest kernel as a sc interrupt.
+ +                               */
+ +                              kvmppc_core_queue_syscall(vcpu);
+ +                      } else {
+ +                              /*
+ +                               * Radix guests can not run PR KVM or nested HV
+ +                               * hash guests which might run PR KVM, so this
+ +                               * is always a privilege fault. Send a program
+ +                               * check to guest kernel.
+ +                               */
+ +                              kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+ +                      }
+ +                      r = RESUME_GUEST;
+ +                      break;
+ +              }
   
+ +              /*
+ +               * hcall - gather args and set exit_reason. This will next be
+ +               * handled by kvmppc_pseries_do_hcall which may be able to deal
+ +               * with it and resume guest, or may punt to userspace.
+ +               */
                 run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
                 for (i = 0; i < 9; ++i)
                         run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
@@@ -1564,102 -1419,20 +1564,102 @@@
          * We get these next two if the guest accesses a page which it thinks
          * it has mapped but which is not actually present, either because
          * it is for an emulated I/O device or because the corresonding
- -       * host page has been paged out.  Any other HDSI/HISI interrupts
- -       * have been handled already.
+ +       * host page has been paged out.
+ +       *
+ +       * Any other HDSI/HISI interrupts have been handled already for P7/8
+ +       * guests. For POWER9 hash guests not using rmhandlers, basic hash
+ +       * fault handling is done here.
          */
- -      case BOOK3S_INTERRUPT_H_DATA_STORAGE:
- -              r = RESUME_PAGE_FAULT;
+ +      case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
+ +              unsigned long vsid;
+ +              long err;
+ +
+ +              if (vcpu->arch.fault_dsisr == HDSISR_CANARY) {
+ +                      r = RESUME_GUEST; /* Just retry if it's the canary */
+ +                      break;
+ +              }
+ +
+ +              if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+ +                      /*
+ +                       * Radix doesn't require anything, and pre-ISAv3.0 hash
+ +                       * already attempted to handle this in rmhandlers. The
+ +                       * hash fault handling below is v3 only (it uses ASDR
+ +                       * via fault_gpa).
+ +                       */
+ +                      r = RESUME_PAGE_FAULT;
+ +                      break;
+ +              }
+ +
+ +              if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
+ +                      kvmppc_core_queue_data_storage(vcpu,
+ +                              vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+ +                      r = RESUME_GUEST;
+ +                      break;
+ +              }
+ +
+ +              if (!(vcpu->arch.shregs.msr & MSR_DR))
+ +                      vsid = vcpu->kvm->arch.vrma_slb_v;
+ +              else
+ +                      vsid = vcpu->arch.fault_gpa;
+ +
+ +              err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+ +                              vsid, vcpu->arch.fault_dsisr, true);
+ +              if (err == 0) {
+ +                      r = RESUME_GUEST;
+ +              } else if (err == -1 || err == -2) {
+ +                      r = RESUME_PAGE_FAULT;
+ +              } else {
+ +                      kvmppc_core_queue_data_storage(vcpu,
+ +                              vcpu->arch.fault_dar, err);
+ +                      r = RESUME_GUEST;
+ +              }
                 break;
- -      case BOOK3S_INTERRUPT_H_INST_STORAGE:
+ +      }
+ +      case BOOK3S_INTERRUPT_H_INST_STORAGE: {
+ +              unsigned long vsid;
+ +              long err;
+ +
                 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
                 vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
                         DSISR_SRR1_MATCH_64S;
- -              if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
- -                      vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
- -              r = RESUME_PAGE_FAULT;
+ +              if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+ +                      /*
+ +                       * Radix doesn't require anything, and pre-ISAv3.0 hash
+ +                       * already attempted to handle this in rmhandlers. The
+ +                       * hash fault handling below is v3 only (it uses ASDR
+ +                       * via fault_gpa).
+ +                       */
+ +                      if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+ +                              vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+ +                      r = RESUME_PAGE_FAULT;
+ +                      break;
+ +              }
+ +
+ +              if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
+ +                      kvmppc_core_queue_inst_storage(vcpu,
+ +                              vcpu->arch.fault_dsisr);
+ +                      r = RESUME_GUEST;
+ +                      break;
+ +              }
+ +
+ +              if (!(vcpu->arch.shregs.msr & MSR_IR))
+ +                      vsid = vcpu->kvm->arch.vrma_slb_v;
+ +              else
+ +                      vsid = vcpu->arch.fault_gpa;
+ +
+ +              err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+ +                              vsid, vcpu->arch.fault_dsisr, false);
+ +              if (err == 0) {
+ +                      r = RESUME_GUEST;
+ +              } else if (err == -1) {
+ +                      r = RESUME_PAGE_FAULT;
+ +              } else {
+ +                      kvmppc_core_queue_inst_storage(vcpu, err);
+ +                      r = RESUME_GUEST;
+ +              }
                 break;
+ +      }
+ +
         /*
          * This occurs if the guest executes an illegal instruction.
          * If the guest debug is disabled, generate a program interrupt
@@@ -1820,23 -1593,6 +1820,23 @@@ static int kvmppc_handle_nested_exit(st
                 if (!xics_on_xive())
                         kvmppc_xics_rm_complete(vcpu, 0);
                 break;
+ +      case BOOK3S_INTERRUPT_SYSCALL:
+ +      {
+ +              unsigned long req = kvmppc_get_gpr(vcpu, 3);
+ +
+ +              /*
+ +               * The H_RPT_INVALIDATE hcalls issued by nested
+ +               * guests for process-scoped invalidations when
+ +               * GTSE=0, are handled here in L0.
+ +               */
+ +              if (req == H_RPT_INVALIDATE) {
+ +                      r = kvmppc_nested_h_rpt_invalidate(vcpu);
+ +                      break;
+ +              }
+ +
+ +              r = RESUME_HOST;
+ +              break;
+ +      }
         default:
                 r = RESUME_HOST;
                 break;
@@@ -1898,14 -1654,6 +1898,14 @@@ unsigned long kvmppc_filter_lpcr_hv(str
                 lpcr &= ~LPCR_AIL;
         if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
                 lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
+ +      /*
+ +       * On some POWER9s we force AIL off for radix guests to prevent
+ +       * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
+ +       * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
+ +       * be cached, which the host TLB management does not expect.
+ +       */
+ +      if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ +              lpcr &= ~LPCR_AIL;
   
         /*
          * On POWER9, allow userspace to enable large decrementer for the
@@@ -2485,7 -2233,7 +2485,7 @@@ static int kvmppc_set_one_reg_hv(struc
    */
   static int threads_per_vcore(struct kvm *kvm)
   {
- -      if (kvm->arch.threads_indep)
+ +      if (cpu_has_feature(CPU_FTR_ARCH_300))
                 return 1;
         return threads_per_subcore;
   }
@@@ -2909,7 -2657,7 +2909,7 @@@ static void radix_flush_cpu(struct kvm 
         cpumask_t *cpu_in_guest;
         int i;
   
- -      cpu = cpu_first_thread_sibling(cpu);
+ +      cpu = cpu_first_tlb_thread_sibling(cpu);
         if (nested) {
                 cpumask_set_cpu(cpu, &nested->need_tlb_flush);
                 cpu_in_guest = &nested->cpu_in_guest;
@@@ -2923,10 -2671,9 +2923,10 @@@
          * the other side is the first smp_mb() in kvmppc_run_core().
          */
         smp_mb();
- -      for (i = 0; i < threads_per_core; ++i)
- -              if (cpumask_test_cpu(cpu + i, cpu_in_guest))
- -                      smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+ +      for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
+ +                                      i += cpu_tlb_thread_sibling_step())
+ +              if (cpumask_test_cpu(i, cpu_in_guest))
+ +                      smp_call_function_single(i, do_nothing, NULL, 1);
   }
   
   static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
@@@ -2957,8 -2704,8 +2957,8 @@@
          */
         if (prev_cpu != pcpu) {
                 if (prev_cpu >= 0 &&
- -                  cpu_first_thread_sibling(prev_cpu) !=
- -                  cpu_first_thread_sibling(pcpu))
+ +                  cpu_first_tlb_thread_sibling(prev_cpu) !=
+ +                  cpu_first_tlb_thread_sibling(pcpu))
                         radix_flush_cpu(kvm, prev_cpu, vcpu);
                 if (nested)
                         nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
@@@ -3220,6 -2967,9 +3220,6 @@@ static void prepare_threads(struct kvmp
         for_each_runnable_thread(i, vcpu, vc) {
                 if (signal_pending(vcpu->arch.run_task))
                         vcpu->arch.ret = -EINTR;
- -              else if (no_mixing_hpt_and_radix &&
- -                       kvm_is_radix(vc->kvm) != radix_enabled())
- -                      vcpu->arch.ret = -EINVAL;
                 else if (vcpu->arch.vpa.update_pending ||
                          vcpu->arch.slb_shadow.update_pending ||
                          vcpu->arch.dtl.update_pending)
@@@ -3426,9 -3176,6 +3426,9 @@@ static noinline void kvmppc_run_core(st
         int trap;
         bool is_power8;
   
+ +      if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
+ +              return;
+ +
         /*
          * Remove from the list any threads that have a signal pending
          * or need a VPA update done
@@@ -3456,6 -3203,9 +3456,6 @@@
          * Make sure we are running on primary threads, and that secondary
          * threads are offline.  Also check if the number of threads in this
          * guest are greater than the current system threads per guest.
- -       * On POWER9, we need to be not in independent-threads mode if
- -       * this is a HPT guest on a radix host machine where the
- -       * CPU threads may not be in different MMU modes.
          */
         if ((controlled_threads > 1) &&
             ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
@@@ -3479,6 -3229,18 +3479,6 @@@
         if (vc->num_threads < target_threads)
                 collect_piggybacks(&core_info, target_threads);
   
- -      /*
- -       * On radix, arrange for TLB flushing if necessary.
- -       * This has to be done before disabling interrupts since
- -       * it uses smp_call_function().
- -       */
- -      pcpu = smp_processor_id();
- -      if (kvm_is_radix(vc->kvm)) {
- -              for (sub = 0; sub < core_info.n_subcores; ++sub)
- -                      for_each_runnable_thread(i, vcpu, core_info.vc[sub])
- -                              kvmppc_prepare_radix_vcpu(vcpu, pcpu);
- -      }
- -
         /*
          * Hard-disable interrupts, and check resched flag and signals.
          * If we need to reschedule or deliver a signal, clean up
@@@ -3511,7 -3273,8 +3511,7 @@@
         cmd_bit = stat_bit = 0;
         split = core_info.n_subcores;
         sip = NULL;
- -      is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
- -              && !cpu_has_feature(CPU_FTR_ARCH_300);
+ +      is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
   
         if (split > 1) {
                 sip = &split_info;
@@@ -3715,113 -3478,184 +3715,113 @@@
         trace_kvmppc_run_core(vc, 1);
   }
   
- -/*
- - * Load up hypervisor-mode registers on P9.
- - */
- -static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
- -                                   unsigned long lpcr)
+ +static void load_spr_state(struct kvm_vcpu *vcpu)
   {
- -      struct kvmppc_vcore *vc = vcpu->arch.vcore;
- -      s64 hdec;
- -      u64 tb, purr, spurr;
- -      int trap;
- -      unsigned long host_hfscr = mfspr(SPRN_HFSCR);
- -      unsigned long host_ciabr = mfspr(SPRN_CIABR);
- -      unsigned long host_dawr0 = mfspr(SPRN_DAWR0);
- -      unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0);
- -      unsigned long host_psscr = mfspr(SPRN_PSSCR);
- -      unsigned long host_pidr = mfspr(SPRN_PID);
- -      unsigned long host_dawr1 = 0;
- -      unsigned long host_dawrx1 = 0;
- -
- -      if (cpu_has_feature(CPU_FTR_DAWR1)) {
- -              host_dawr1 = mfspr(SPRN_DAWR1);
- -              host_dawrx1 = mfspr(SPRN_DAWRX1);
- -      }
+ +      mtspr(SPRN_DSCR, vcpu->arch.dscr);
+ +      mtspr(SPRN_IAMR, vcpu->arch.iamr);
+ +      mtspr(SPRN_PSPB, vcpu->arch.pspb);
+ +      mtspr(SPRN_FSCR, vcpu->arch.fscr);
+ +      mtspr(SPRN_TAR, vcpu->arch.tar);
+ +      mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+ +      mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+ +      mtspr(SPRN_BESCR, vcpu->arch.bescr);
+ +      mtspr(SPRN_WORT, vcpu->arch.wort);
+ +      mtspr(SPRN_TIDR, vcpu->arch.tid);
+ +      mtspr(SPRN_AMR, vcpu->arch.amr);
+ +      mtspr(SPRN_UAMOR, vcpu->arch.uamor);
   
         /*
- -       * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
- -       * so set HDICE before writing HDEC.
+ +       * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI]
+ +       * clear (or hstate set appropriately to catch those registers
+ +       * being clobbered if we take a MCE or SRESET), so those are done
+ +       * later.
          */
- -      mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
- -      isync();
- -
- -      hdec = time_limit - mftb();
- -      if (hdec < 0) {
- -              mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
- -              isync();
- -              return BOOK3S_INTERRUPT_HV_DECREMENTER;
- -      }
- -      mtspr(SPRN_HDEC, hdec);
- -
- -      if (vc->tb_offset) {
- -              u64 new_tb = mftb() + vc->tb_offset;
- -              mtspr(SPRN_TBU40, new_tb);
- -              tb = mftb();
- -              if ((tb & 0xffffff) < (new_tb & 0xffffff))
- -                      mtspr(SPRN_TBU40, new_tb + 0x1000000);
- -              vc->tb_offset_applied = vc->tb_offset;
- -      }
- -
- -      if (vc->pcr)
- -              mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
- -      mtspr(SPRN_DPDES, vc->dpdes);
- -      mtspr(SPRN_VTB, vc->vtb);
- -
- -      local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
- -      local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
- -      mtspr(SPRN_PURR, vcpu->arch.purr);
- -      mtspr(SPRN_SPURR, vcpu->arch.spurr);
- -
- -      if (dawr_enabled()) {
- -              mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
- -              mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
- -              if (cpu_has_feature(CPU_FTR_DAWR1)) {
- -                      mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
- -                      mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
- -              }
- -      }
- -      mtspr(SPRN_CIABR, vcpu->arch.ciabr);
- -      mtspr(SPRN_IC, vcpu->arch.ic);
- -      mtspr(SPRN_PID, vcpu->arch.pid);
   
- -      mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
- -            (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
- -
- -      mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
- -
- -      mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
- -      mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
- -      mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
- -      mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
- -
- -      mtspr(SPRN_AMOR, ~0UL);
- -
- -      mtspr(SPRN_LPCR, lpcr);
- -      isync();
- -
- -      kvmppc_xive_push_vcpu(vcpu);
- -
- -      mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
- -      mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
- -
- -      trap = __kvmhv_vcpu_entry_p9(vcpu);
- -
- -      /* Advance host PURR/SPURR by the amount used by guest */
- -      purr = mfspr(SPRN_PURR);
- -      spurr = mfspr(SPRN_SPURR);
- -      mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
- -            purr - vcpu->arch.purr);
- -      mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
- -            spurr - vcpu->arch.spurr);
- -      vcpu->arch.purr = purr;
- -      vcpu->arch.spurr = spurr;
+ +      if (!(vcpu->arch.ctrl & 1))
+ +              mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+ +}
   
- -      vcpu->arch.ic = mfspr(SPRN_IC);
- -      vcpu->arch.pid = mfspr(SPRN_PID);
- -      vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+ +static void store_spr_state(struct kvm_vcpu *vcpu)
+ +{
+ +      vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
   
- -      vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
- -      vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
- -      vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
- -      vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+ +      vcpu->arch.iamr = mfspr(SPRN_IAMR);
+ +      vcpu->arch.pspb = mfspr(SPRN_PSPB);
+ +      vcpu->arch.fscr = mfspr(SPRN_FSCR);
+ +      vcpu->arch.tar = mfspr(SPRN_TAR);
+ +      vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+ +      vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+ +      vcpu->arch.bescr = mfspr(SPRN_BESCR);
+ +      vcpu->arch.wort = mfspr(SPRN_WORT);
+ +      vcpu->arch.tid = mfspr(SPRN_TIDR);
+ +      vcpu->arch.amr = mfspr(SPRN_AMR);
+ +      vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+ +      vcpu->arch.dscr = mfspr(SPRN_DSCR);
+ +}
   
- -      /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
- -      mtspr(SPRN_PSSCR, host_psscr |
- -            (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
- -      mtspr(SPRN_HFSCR, host_hfscr);
- -      mtspr(SPRN_CIABR, host_ciabr);
- -      mtspr(SPRN_DAWR0, host_dawr0);
- -      mtspr(SPRN_DAWRX0, host_dawrx0);
- -      if (cpu_has_feature(CPU_FTR_DAWR1)) {
- -              mtspr(SPRN_DAWR1, host_dawr1);
- -              mtspr(SPRN_DAWRX1, host_dawrx1);
- -      }
- -      mtspr(SPRN_PID, host_pidr);
+ +/*
+ + * Privileged (non-hypervisor) host registers to save.
+ + */
+ +struct p9_host_os_sprs {
+ +      unsigned long dscr;
+ +      unsigned long tidr;
+ +      unsigned long iamr;
+ +      unsigned long amr;
+ +      unsigned long fscr;
+ +};
   
- -      /*
- -       * Since this is radix, do a eieio; tlbsync; ptesync sequence in
- -       * case we interrupted the guest between a tlbie and a ptesync.
- -       */
- -      asm volatile("eieio; tlbsync; ptesync");
+ +static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
+ +{
+ +      host_os_sprs->dscr = mfspr(SPRN_DSCR);
+ +      host_os_sprs->tidr = mfspr(SPRN_TIDR);
+ +      host_os_sprs->iamr = mfspr(SPRN_IAMR);
+ +      host_os_sprs->amr = mfspr(SPRN_AMR);
+ +      host_os_sprs->fscr = mfspr(SPRN_FSCR);
+ +}
   
- -      /*
- -       * cp_abort is required if the processor supports local copy-paste
- -       * to clear the copy buffer that was under control of the guest.
- -       */
- -      if (cpu_has_feature(CPU_FTR_ARCH_31))
- -              asm volatile(PPC_CP_ABORT);
+ +/* vcpu guest regs must already be saved */
+ +static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
+ +                                  struct p9_host_os_sprs *host_os_sprs)
+ +{
+ +      mtspr(SPRN_PSPB, 0);
+ +      mtspr(SPRN_WORT, 0);
+ +      mtspr(SPRN_UAMOR, 0);
   
- -      mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);    /* restore host LPID */
- -      isync();
+ +      mtspr(SPRN_DSCR, host_os_sprs->dscr);
+ +      mtspr(SPRN_TIDR, host_os_sprs->tidr);
+ +      mtspr(SPRN_IAMR, host_os_sprs->iamr);
   
- -      vc->dpdes = mfspr(SPRN_DPDES);
- -      vc->vtb = mfspr(SPRN_VTB);
- -      mtspr(SPRN_DPDES, 0);
- -      if (vc->pcr)
- -              mtspr(SPRN_PCR, PCR_MASK);
+ +      if (host_os_sprs->amr != vcpu->arch.amr)
+ +              mtspr(SPRN_AMR, host_os_sprs->amr);
   
- -      if (vc->tb_offset_applied) {
- -              u64 new_tb = mftb() - vc->tb_offset_applied;
- -              mtspr(SPRN_TBU40, new_tb);
- -              tb = mftb();
- -              if ((tb & 0xffffff) < (new_tb & 0xffffff))
- -                      mtspr(SPRN_TBU40, new_tb + 0x1000000);
- -              vc->tb_offset_applied = 0;
- -      }
+ +      if (host_os_sprs->fscr != vcpu->arch.fscr)
+ +              mtspr(SPRN_FSCR, host_os_sprs->fscr);
   
- -      mtspr(SPRN_HDEC, 0x7fffffff);
- -      mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+ +      /* Save guest CTRL register, set runlatch to 1 */
+ +      if (!(vcpu->arch.ctrl & 1))
+ +              mtspr(SPRN_CTRLT, 1);
+ +}
   
- -      return trap;
+ +static inline bool hcall_is_xics(unsigned long req)
+ +{
+ +      return req == H_EOI || req == H_CPPR || req == H_IPI ||
+ +              req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
   }
   
   /*
- - * Virtual-mode guest entry for POWER9 and later when the host and
- - * guest are both using the radix MMU.  The LPIDR has already been set.
+ + * Guest entry for POWER9 and later CPUs.
    */
   static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
                          unsigned long lpcr)
   {
         struct kvmppc_vcore *vc = vcpu->arch.vcore;
- -      unsigned long host_dscr = mfspr(SPRN_DSCR);
- -      unsigned long host_tidr = mfspr(SPRN_TIDR);
- -      unsigned long host_iamr = mfspr(SPRN_IAMR);
- -      unsigned long host_amr = mfspr(SPRN_AMR);
- -      unsigned long host_fscr = mfspr(SPRN_FSCR);
+ +      struct p9_host_os_sprs host_os_sprs;
         s64 dec;
         u64 tb;
         int trap, save_pmu;
   
+ +      WARN_ON_ONCE(vcpu->arch.ceded);
+ +
         dec = mfspr(SPRN_DEC);
         tb = mftb();
         if (dec < 0)
@@@ -3830,7 -3664,7 +3830,7 @@@
         if (local_paca->kvm_hstate.dec_expires < time_limit)
                 time_limit = local_paca->kvm_hstate.dec_expires;
   
- -      vcpu->arch.ceded = 0;
+ +      save_p9_host_os_sprs(&host_os_sprs);
   
         kvmhv_save_host_pmu();          /* saves it to PACA kvm_hstate */
   
@@@ -3859,20 -3693,24 +3859,20 @@@
   #endif
         mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
   
- -      mtspr(SPRN_DSCR, vcpu->arch.dscr);
- -      mtspr(SPRN_IAMR, vcpu->arch.iamr);
- -      mtspr(SPRN_PSPB, vcpu->arch.pspb);
- -      mtspr(SPRN_FSCR, vcpu->arch.fscr);
- -      mtspr(SPRN_TAR, vcpu->arch.tar);
- -      mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
- -      mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
- -      mtspr(SPRN_BESCR, vcpu->arch.bescr);
- -      mtspr(SPRN_WORT, vcpu->arch.wort);
- -      mtspr(SPRN_TIDR, vcpu->arch.tid);
- -      mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
- -      mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
- -      mtspr(SPRN_AMR, vcpu->arch.amr);
- -      mtspr(SPRN_UAMOR, vcpu->arch.uamor);
- -
- -      if (!(vcpu->arch.ctrl & 1))
- -              mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+ +      load_spr_state(vcpu);
   
+ +      /*
+ +       * When setting DEC, we must always deal with irq_work_raise via NMI vs
+ +       * setting DEC. The problem occurs right as we switch into guest mode
+ +       * if a NMI hits and sets pending work and sets DEC, then that will
+ +       * apply to the guest and not bring us back to the host.
+ +       *
+ +       * irq_work_raise could check a flag (or possibly LPCR[HDICE] for
+ +       * example) and set HDEC to 1? That wouldn't solve the nested hv
+ +       * case which needs to abort the hcall or zero the time limit.
+ +       *
+ +       * XXX: Another day's problem.
+ +       */
         mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
   
         if (kvmhv_on_pseries()) {
@@@ -3880,7 -3718,7 +3880,7 @@@
                  * We need to save and restore the guest visible part of the
                  * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
                  * doesn't do this for us. Note only required if pseries since
- -               * this is done in kvmhv_load_hv_regs_and_go() below otherwise.
+ +               * this is done in kvmhv_vcpu_entry_p9() below otherwise.
                  */
                 unsigned long host_psscr;
                 /* call our hypervisor to load up HV regs and go */
@@@ -3900,8 -3738,6 +3900,8 @@@
                         hvregs.vcpu_token = vcpu->vcpu_id;
                 }
                 hvregs.hdec_expiry = time_limit;
+ +              mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+ +              mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
                 trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
                                           __pa(&vcpu->arch.regs));
                 kvmhv_restore_hv_return_state(vcpu, &hvregs);
@@@ -3914,41 -3750,15 +3914,41 @@@
                 /* H_CEDE has to be handled now, not later */
                 if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
                     kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
- -                      kvmppc_nested_cede(vcpu);
+ +                      kvmppc_cede(vcpu);
                         kvmppc_set_gpr(vcpu, 3, 0);
                         trap = 0;
                 }
         } else {
- -              trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+ +              kvmppc_xive_push_vcpu(vcpu);
+ +              trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr);
+ +              if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+ +                  !(vcpu->arch.shregs.msr & MSR_PR)) {
+ +                      unsigned long req = kvmppc_get_gpr(vcpu, 3);
+ +
+ +                      /* H_CEDE has to be handled now, not later */
+ +                      if (req == H_CEDE) {
+ +                              kvmppc_cede(vcpu);
+ +                              kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
+ +                              kvmppc_set_gpr(vcpu, 3, 0);
+ +                              trap = 0;
+ +
+ +                      /* XICS hcalls must be handled before xive is pulled */
+ +                      } else if (hcall_is_xics(req)) {
+ +                              int ret;
+ +
+ +                              ret = kvmppc_xive_xics_hcall(vcpu, req);
+ +                              if (ret != H_TOO_HARD) {
+ +                                      kvmppc_set_gpr(vcpu, 3, ret);
+ +                                      trap = 0;
+ +                              }
+ +                      }
+ +              }
+ +              kvmppc_xive_pull_vcpu(vcpu);
+ +
+ +              if (kvm_is_radix(vcpu->kvm))
+ +                      vcpu->arch.slb_max = 0;
         }
   
- -      vcpu->arch.slb_max = 0;
         dec = mfspr(SPRN_DEC);
         if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
                 dec = (s32) dec;
@@@ -3956,10 -3766,36 +3956,10 @@@
         vcpu->arch.dec_expires = dec + tb;
         vcpu->cpu = -1;
         vcpu->arch.thread_cpu = -1;
- -      /* Save guest CTRL register, set runlatch to 1 */
- -      vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
- -      if (!(vcpu->arch.ctrl & 1))
- -              mtspr(SPRN_CTRLT, vcpu->arch.ctrl | 1);
- -
- -      vcpu->arch.iamr = mfspr(SPRN_IAMR);
- -      vcpu->arch.pspb = mfspr(SPRN_PSPB);
- -      vcpu->arch.fscr = mfspr(SPRN_FSCR);
- -      vcpu->arch.tar = mfspr(SPRN_TAR);
- -      vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
- -      vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
- -      vcpu->arch.bescr = mfspr(SPRN_BESCR);
- -      vcpu->arch.wort = mfspr(SPRN_WORT);
- -      vcpu->arch.tid = mfspr(SPRN_TIDR);
- -      vcpu->arch.amr = mfspr(SPRN_AMR);
- -      vcpu->arch.uamor = mfspr(SPRN_UAMOR);
- -      vcpu->arch.dscr = mfspr(SPRN_DSCR);
   
- -      mtspr(SPRN_PSPB, 0);
- -      mtspr(SPRN_WORT, 0);
- -      mtspr(SPRN_UAMOR, 0);
- -      mtspr(SPRN_DSCR, host_dscr);
- -      mtspr(SPRN_TIDR, host_tidr);
- -      mtspr(SPRN_IAMR, host_iamr);
+ +      store_spr_state(vcpu);
   
- -      if (host_amr != vcpu->arch.amr)
- -              mtspr(SPRN_AMR, host_amr);
- -
- -      if (host_fscr != vcpu->arch.fscr)
- -              mtspr(SPRN_FSCR, host_fscr);
+ +      restore_p9_host_os_sprs(vcpu, &host_os_sprs);
   
         msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
         store_fp_state(&vcpu->arch.fp);
@@@ -3989,9 -3825,6 +3989,9 @@@
         vc->in_guest = 0;
   
         mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+ +      /* We may have raced with new irq work */
+ +      if (test_irq_work_pending())
+ +              set_dec(1);
         mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
   
         kvmhv_load_host_pmu();
@@@ -4092,7 -3925,7 +4092,7 @@@ static void kvmppc_vcore_blocked(struc
         cur = start_poll = ktime_get();
         if (vc->halt_poll_ns) {
                 ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
- -              ++vc->runner->stat.halt_attempted_poll;
+ +              ++vc->runner->stat.generic.halt_attempted_poll;
   
                 vc->vcore_state = VCORE_POLLING;
                 spin_unlock(&vc->lock);
@@@ -4109,7 -3942,7 +4109,7 @@@
                 vc->vcore_state = VCORE_INACTIVE;
   
                 if (!do_sleep) {
- -                      ++vc->runner->stat.halt_successful_poll;
+ +                      ++vc->runner->stat.generic.halt_successful_poll;
                         goto out;
                 }
         }
@@@ -4121,7 -3954,7 +4121,7 @@@
                 do_sleep = 0;
                 /* If we polled, count this as a successful poll */
                 if (vc->halt_poll_ns)
- -                      ++vc->runner->stat.halt_successful_poll;
+ +                      ++vc->runner->stat.generic.halt_successful_poll;
                 goto out;
         }
   
@@@ -4148,13 -3981,13 +4148,13 @@@ out
                         ktime_to_ns(cur) - ktime_to_ns(start_wait);
                 /* Attribute failed poll time */
                 if (vc->halt_poll_ns)
- -                      vc->runner->stat.halt_poll_fail_ns +=
+ +                      vc->runner->stat.generic.halt_poll_fail_ns +=
                                 ktime_to_ns(start_wait) -
                                 ktime_to_ns(start_poll);
         } else {
                 /* Attribute successful poll time */
                 if (vc->halt_poll_ns)
- -                      vc->runner->stat.halt_poll_success_ns +=
+ +                      vc->runner->stat.generic.halt_poll_success_ns +=
                                 ktime_to_ns(cur) -
                                 ktime_to_ns(start_poll);
         }
@@@ -4181,6 -4014,7 +4181,6 @@@
   /*
    * This never fails for a radix guest, as none of the operations it does
    * for a radix guest can fail or have a way to report failure.
- - * kvmhv_run_single_vcpu() relies on this fact.
    */
   static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
   {
@@@ -4336,7 -4170,7 +4336,7 @@@ int kvmhv_run_single_vcpu(struct kvm_vc
   {
         struct kvm_run *run = vcpu->run;
         int trap, r, pcpu;
- -      int srcu_idx, lpid;
+ +      int srcu_idx;
         struct kvmppc_vcore *vc;
         struct kvm *kvm = vcpu->kvm;
         struct kvm_nested_guest *nested = vcpu->arch.nested;
@@@ -4359,15 -4193,8 +4359,15 @@@
         vc->runner = vcpu;
   
         /* See if the MMU is ready to go */
- -      if (!kvm->arch.mmu_ready)
- -              kvmhv_setup_mmu(vcpu);
+ +      if (!kvm->arch.mmu_ready) {
+ +              r = kvmhv_setup_mmu(vcpu);
+ +              if (r) {
+ +                      run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ +                      run->fail_entry.hardware_entry_failure_reason = 0;
+ +                      vcpu->arch.ret = r;
+ +                      return r;
+ +              }
+ +      }
   
         if (need_resched())
                 cond_resched();
@@@ -4380,8 -4207,7 +4380,8 @@@
         preempt_disable();
         pcpu = smp_processor_id();
         vc->pcpu = pcpu;
- -      kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+ +      if (kvm_is_radix(kvm))
+ +              kvmppc_prepare_radix_vcpu(vcpu, pcpu);
   
         local_irq_disable();
         hard_irq_disable();
@@@ -4418,6 -4244,13 +4418,6 @@@
         vc->vcore_state = VCORE_RUNNING;
         trace_kvmppc_run_core(vc, 0);
   
- -      if (cpu_has_feature(CPU_FTR_HVMODE)) {
- -              lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
- -              mtspr(SPRN_LPID, lpid);
- -              isync();
- -              kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
- -      }
- -
         guest_enter_irqoff();
   
         srcu_idx = srcu_read_lock(&kvm->srcu);
@@@ -4436,6 -4269,11 +4436,6 @@@
   
         srcu_read_unlock(&kvm->srcu, srcu_idx);
   
- -      if (cpu_has_feature(CPU_FTR_HVMODE)) {
- -              mtspr(SPRN_LPID, kvm->arch.host_lpid);
- -              isync();
- -      }
- -
         set_irq_happened(trap);
   
         kvmppc_set_host_core(pcpu);
@@@ -4581,23 -4419,19 +4581,23 @@@ static int kvmppc_vcpu_run_hv(struct kv
         vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
   
         do {
- -              /*
- -               * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
- -               * path, which also handles hash and dependent threads mode.
- -               */
- -              if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
- -                  !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ +              if (cpu_has_feature(CPU_FTR_ARCH_300))
                         r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
                                                   vcpu->arch.vcore->lpcr);
                 else
                         r = kvmppc_run_vcpu(vcpu);
   
- -              if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
- -                  !(vcpu->arch.shregs.msr & MSR_PR)) {
+ +              if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
+ +                      if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
+ +                              /*
+ +                               * These should have been caught reflected
+ +                               * into the guest by now. Final sanity check:
+ +                               * don't allow userspace to execute hcalls in
+ +                               * the hypervisor.
+ +                               */
+ +                              r = RESUME_GUEST;
+ +                              continue;
+ +                      }
                         trace_kvm_hcall_enter(vcpu);
                         r = kvmppc_pseries_do_hcall(vcpu);
                         trace_kvm_hcall_exit(vcpu, r);
@@@ -4924,8 -4758,8 +4924,8 @@@ static int kvmppc_hv_setup_htab_rma(str
         /* Look up the VMA for the start of this memory slot */
         hva = memslot->userspace_addr;
         mmap_read_lock(kvm->mm);
-       vma = find_vma(kvm->mm, hva);
-       if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
+       vma = vma_lookup(kvm->mm, hva);
+       if (!vma || (vma->vm_flags & VM_IO))
                 goto up_out;
   
         psize = vma_kernel_pagesize(vma);
@@@ -5204,8 -5038,18 +5204,8 @@@ static int kvmppc_core_init_vm_hv(struc
         /*
          * Track that we now have a HV mode VM active. This blocks secondary
          * CPU threads from coming online.
- -       * On POWER9, we only need to do this if the "indep_threads_mode"
- -       * module parameter has been set to N.
          */
- -      if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- -              if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
- -                      pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
- -                      kvm->arch.threads_indep = true;
- -              } else {
- -                      kvm->arch.threads_indep = indep_threads_mode;
- -              }
- -      }
- -      if (!kvm->arch.threads_indep)
+ +      if (!cpu_has_feature(CPU_FTR_ARCH_300))
                 kvm_hv_vm_activated();
   
         /*
@@@ -5246,7 -5090,7 +5246,7 @@@ static void kvmppc_core_destroy_vm_hv(s
   {
         debugfs_remove_recursive(kvm->arch.debugfs_dir);
   
- -      if (!kvm->arch.threads_indep)
+ +      if (!cpu_has_feature(CPU_FTR_ARCH_300))
                 kvm_hv_vm_deactivated();
   
         kvmppc_free_vcores(kvm);
@@@ -5667,9 -5511,7 +5667,9 @@@ static int kvmhv_enable_nested(struct k
   {
         if (!nested)
                 return -EPERM;
- -      if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
+ +      if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ +              return -ENODEV;
+ +      if (!radix_enabled())
                 return -ENODEV;
   
         /* kvm == NULL means the caller is testing if the capability exists */
@@@ -5832,25 -5674,11 +5832,25 @@@ static int kvmhv_enable_dawr1(struct kv
   
   static bool kvmppc_hash_v3_possible(void)
   {
- -      if (radix_enabled() && no_mixing_hpt_and_radix)
+ +      if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ +              return false;
+ +
+ +      if (!cpu_has_feature(CPU_FTR_HVMODE))
                 return false;
   
- -      return cpu_has_feature(CPU_FTR_ARCH_300) &&
- -              cpu_has_feature(CPU_FTR_HVMODE);
+ +      /*
+ +       * POWER9 chips before version 2.02 can't have some threads in
+ +       * HPT mode and some in radix mode on the same core.
+ +       */
+ +      if (radix_enabled()) {
+ +              unsigned int pvr = mfspr(SPRN_PVR);
+ +              if ((pvr >> 16) == PVR_POWER9 &&
+ +                  (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
+ +                   ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
+ +                      return false;
+ +      }
+ +
+ +      return true;
   }
   
   static struct kvmppc_ops kvm_ops_hv = {
@@@ -5994,6 -5822,18 +5994,6 @@@ static int kvmppc_book3s_init_hv(void
         if (kvmppc_radix_possible())
                 r = kvmppc_radix_init();
   
- -      /*
- -       * POWER9 chips before version 2.02 can't have some threads in
- -       * HPT mode and some in radix mode on the same core.
- -       */
- -      if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- -              unsigned int pvr = mfspr(SPRN_PVR);
- -              if ((pvr >> 16) == PVR_POWER9 &&
- -                  (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
- -                   ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
- -                      no_mixing_hpt_and_radix = true;
- -      }
- -
         r = kvmppc_uvmem_init();
         if (r < 0)
                 pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
diff --combined arch/powerpc/kvm/book3s_hv_uvmem.c

index b898a59,34720b7..a7061ee
--- 1/arch/powerpc/kvm/book3s_hv_uvmem.c
--- 2/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@@ -90,7 -90,6 +90,7 @@@
   #include <linux/migrate.h>
   #include <linux/kvm_host.h>
   #include <linux/ksm.h>
+ +#include <linux/of.h>
   #include <asm/ultravisor.h>
   #include <asm/mman.h>
   #include <asm/kvm_ppc.h>
@@@ -615,7 -614,7 +615,7 @@@ void kvmppc_uvmem_drop_pages(const stru
   
                 /* Fetch the VMA if addr is not in the latest fetched one */
                 if (!vma || addr >= vma->vm_end) {
-                       vma = find_vma_intersection(kvm->mm, addr, addr+1);
+                       vma = vma_lookup(kvm->mm, addr);
                         if (!vma) {
                                 pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
                                 break;
diff --combined arch/sparc/kernel/smp_64.c

index ae5faa1,c89a597..0224d8f
--- 1/arch/sparc/kernel/smp_64.c
--- 2/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@@ -138,6 -138,9 +138,6 @@@ void smp_callin(void
   
         set_cpu_online(cpuid, true);
   
- -      /* idle thread is expected to have preempt disabled */
- -      preempt_disable();
- -
         local_irq_enable();
   
         cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
@@@ -1543,7 -1546,7 +1543,7 @@@ static void * __init pcpu_alloc_bootmem
                                         size_t align)
   {
         const unsigned long goal = __pa(MAX_DMA_ADDRESS);
- #ifdef CONFIG_NEED_MULTIPLE_NODES
+ #ifdef CONFIG_NUMA
         int node = cpu_to_node(cpu);
         void *ptr;
   
diff --combined arch/x86/Kconfig

index 86dae42,5d523ff..49ffb69
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -1597,7 -1597,7 +1597,7 @@@ config NODES_SHIF
         default "10" if MAXSMP
         default "6" if X86_64
         default "3"
-       depends on NEED_MULTIPLE_NODES
+       depends on NUMA
         help
           Specify the maximum number of NUMA Nodes available on the target
           system.  Increases memory reserved to accommodate various tables.
@@@ -1693,6 -1693,35 +1693,6 @@@ config X86_BOOTPARAM_MEMORY_CORRUPTION_
           Set whether the default state of memory_corruption_check is
           on or off.
   
- -config X86_RESERVE_LOW
- -      int "Amount of low memory, in kilobytes, to reserve for the BIOS"
- -      default 64
- -      range 4 640
- -      help
- -        Specify the amount of low memory to reserve for the BIOS.
- -
- -        The first page contains BIOS data structures that the kernel
- -        must not use, so that page must always be reserved.
- -
- -        By default we reserve the first 64K of physical RAM, as a
- -        number of BIOSes are known to corrupt that memory range
- -        during events such as suspend/resume or monitor cable
- -        insertion, so it must not be used by the kernel.
- -
- -        You can set this to 4 if you are absolutely sure that you
- -        trust the BIOS to get all its memory reservations and usages
- -        right.  If you know your BIOS have problems beyond the
- -        default 64K area, you can set this to 640 to avoid using the
- -        entire low memory range.
- -
- -        If you have doubts about the BIOS (e.g. suspend/resume does
- -        not work or there's kernel crashes after certain hardware
- -        hotplug events) then you might want to enable
- -        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
- -        typical corruption patterns.
- -
- -        Leave this to the default value of 64 if you are unsure.
- -
   config MATH_EMULATION
         bool
         depends on MODIFY_LDT_SYSCALL
diff --combined fs/binfmt_elf.c

index 3d73cbb,baf8f91..439ed81
--- 1/fs/binfmt_elf.c
--- 2/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@@ -1070,7 -1070,7 +1070,7 @@@ out_free_interp
                 elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
                                      !!interpreter, false);
   
-               elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+               elf_flags = MAP_PRIVATE | MAP_DENYWRITE;
   
                 vaddr = elf_ppnt->p_vaddr;
                 /*
@@@ -1537,8 -1537,7 +1537,8 @@@ static int fill_psinfo(struct elf_prpsi
   {
         const struct cred *cred;
         unsigned int i, len;
- -      
+ +      unsigned int state;
+ +
         /* first copy the parameters from user space */
         memset(psinfo, 0, sizeof(struct elf_prpsinfo));
   
@@@ -1560,8 -1559,7 +1560,8 @@@
         psinfo->pr_pgrp = task_pgrp_vnr(p);
         psinfo->pr_sid = task_session_vnr(p);
   
- -      i = p->state ? ffz(~p->state) + 1 : 0;
+ +      state = READ_ONCE(p->__state);
+ +      i = state ? ffz(~state) + 1 : 0;
         psinfo->pr_state = i;
         psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
         psinfo->pr_zomb = psinfo->pr_sname == 'Z';
@@@ -1573,7 -1571,7 +1573,7 @@@
         SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
         rcu_read_unlock();
         strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
- -      
+ +
         return 0;
   }
   
diff --combined fs/binfmt_elf_fdpic.c

index ab9c31d,39fa1b0..cf40284
--- 1/fs/binfmt_elf_fdpic.c
--- 2/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@@ -928,7 -928,7 +928,7 @@@ static int elf_fdpic_map_file_constdisp
   {
         struct elf32_fdpic_loadseg *seg;
         struct elf32_phdr *phdr;
-       unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
+       unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0;
         int loop, ret;
   
         load_addr = params->load_addr;
@@@ -948,12 -948,8 +948,8 @@@
         }
   
         /* allocate one big anon block for everything */
-       mflags = MAP_PRIVATE;
-       if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
-               mflags |= MAP_EXECUTABLE;
- 
         maddr = vm_mmap(NULL, load_addr, top - base,
-                       PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
+                       PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0);
         if (IS_ERR_VALUE(maddr))
                 return (int) maddr;
   
@@@ -1046,9 -1042,6 +1042,6 @@@ static int elf_fdpic_map_file_by_direct
                 if (phdr->p_flags & PF_X) prot |= PROT_EXEC;
   
                 flags = MAP_PRIVATE | MAP_DENYWRITE;
-               if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
-                       flags |= MAP_EXECUTABLE;
- 
                 maddr = 0;
   
                 switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) {
@@@ -1331,7 -1324,6 +1324,7 @@@ static int fill_psinfo(struct elf_prpsi
   {
         const struct cred *cred;
         unsigned int i, len;
+ +      unsigned int state;
   
         /* first copy the parameters from user space */
         memset(psinfo, 0, sizeof(struct elf_prpsinfo));
@@@ -1354,8 -1346,7 +1347,8 @@@
         psinfo->pr_pgrp = task_pgrp_vnr(p);
         psinfo->pr_sid = task_session_vnr(p);
   
- -      i = p->state ? ffz(~p->state) + 1 : 0;
+ +      state = READ_ONCE(p->__state);
+ +      i = state ? ffz(~state) + 1 : 0;
         psinfo->pr_state = i;
         psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
         psinfo->pr_zomb = psinfo->pr_sname == 'Z';
diff --combined include/linux/gfp.h

index e6102df,0bec15b..55b2ec1
--- 1/include/linux/gfp.h
--- 2/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@@ -53,10 -53,8 +53,10 @@@ struct vm_area_struct
   #define ___GFP_HARDWALL               0x100000u
   #define ___GFP_THISNODE               0x200000u
   #define ___GFP_ACCOUNT                0x400000u
+ +#define ___GFP_ZEROTAGS               0x800000u
+ +#define ___GFP_SKIP_KASAN_POISON      0x1000000u
   #ifdef CONFIG_LOCKDEP
- -#define ___GFP_NOLOCKDEP      0x800000u
+ +#define ___GFP_NOLOCKDEP      0x2000000u
   #else
   #define ___GFP_NOLOCKDEP      0
   #endif
@@@ -231,25 -229,16 +231,25 @@@
    * %__GFP_COMP address compound page metadata.
    *
    * %__GFP_ZERO returns a zeroed page on success.
+ + *
+ + * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
+ + * __GFP_ZERO is set.
+ + *
+ + * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned
+ + * on deallocation. Typically used for userspace pages. Currently only has an
+ + * effect in HW tags mode.
    */
   #define __GFP_NOWARN  ((__force gfp_t)___GFP_NOWARN)
   #define __GFP_COMP    ((__force gfp_t)___GFP_COMP)
   #define __GFP_ZERO    ((__force gfp_t)___GFP_ZERO)
+ +#define __GFP_ZEROTAGS        ((__force gfp_t)___GFP_ZEROTAGS)
+ +#define __GFP_SKIP_KASAN_POISON       ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
   
   /* Disable lockdep for GFP context tracking */
   #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
   
   /* Room for N __GFP_FOO bits */
- -#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
+ +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
   #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
   
   /**
@@@ -330,8 -319,7 +330,8 @@@
   #define GFP_DMA               __GFP_DMA
   #define GFP_DMA32     __GFP_DMA32
   #define GFP_HIGHUSER  (GFP_USER | __GFP_HIGHMEM)
- -#define GFP_HIGHUSER_MOVABLE  (GFP_HIGHUSER | __GFP_MOVABLE)
+ +#define GFP_HIGHUSER_MOVABLE  (GFP_HIGHUSER | __GFP_MOVABLE | \
+ +                       __GFP_SKIP_KASAN_POISON)
   #define GFP_TRANSHUGE_LIGHT   ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
                          __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
   #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
@@@ -506,8 -494,8 +506,8 @@@ static inline int gfp_zonelist(gfp_t fl
    * There are two zonelists per node, one for all zones with memory and
    * one containing just zones from the node the zonelist belongs to.
    *
-  * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
-  * optimized to &contig_page_data at compile-time.
+  * For the case of non-NUMA systems the NODE_DATA() gets optimized to
+  * &contig_page_data at compile-time.
    */
   static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
   {
@@@ -548,6 -536,15 +548,15 @@@ alloc_pages_bulk_array(gfp_t gfp, unsig
         return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
   }
   
+ static inline unsigned long
+ alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array)
+ {
+       if (nid == NUMA_NO_NODE)
+               nid = numa_mem_id();
+ 
+       return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
+ }
+ 
   /*
    * Allocate pages, preferring the node given as nid. The node must be valid and
    * online. For more general interface, see alloc_pages_node().
diff --combined include/linux/kasan.h

index a1c7ce5,8d83bbf..5310e21
--- 1/include/linux/kasan.h
--- 2/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@@ -2,7 -2,6 +2,7 @@@
   #ifndef _LINUX_KASAN_H
   #define _LINUX_KASAN_H
   
+ +#include <linux/bug.h>
   #include <linux/static_key.h>
   #include <linux/types.h>
   
@@@ -18,7 -17,6 +18,6 @@@ struct task_struct
   
   /* kasan_data struct is used in KUnit tests for KASAN expected failures */
   struct kunit_kasan_expectation {
-       bool report_expected;
         bool report_found;
   };
   
@@@ -42,9 -40,9 +41,9 @@@
   #endif
   
   extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
- extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS];
- extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
- extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
+ extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS];
+ extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
+ extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
   extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
   
   int kasan_populate_early_shadow(const void *shadow_start,
@@@ -80,6 -78,14 +79,6 @@@ static inline void kasan_disable_curren
   
   #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
   
- -#ifdef CONFIG_KASAN
- -
- -struct kasan_cache {
- -      int alloc_meta_offset;
- -      int free_meta_offset;
- -      bool is_kmalloc;
- -};
- -
   #ifdef CONFIG_KASAN_HW_TAGS
   
   DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);
@@@ -94,14 -100,11 +93,14 @@@ static inline bool kasan_has_integrated
         return kasan_enabled();
   }
   
+ +void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags);
+ +void kasan_free_pages(struct page *page, unsigned int order);
+ +
   #else /* CONFIG_KASAN_HW_TAGS */
   
   static inline bool kasan_enabled(void)
   {
- -      return true;
+ +      return IS_ENABLED(CONFIG_KASAN);
   }
   
   static inline bool kasan_has_integrated_init(void)
@@@ -109,30 -112,8 +108,30 @@@
         return false;
   }
   
+ +static __always_inline void kasan_alloc_pages(struct page *page,
+ +                                            unsigned int order, gfp_t flags)
+ +{
+ +      /* Only available for integrated init. */
+ +      BUILD_BUG();
+ +}
+ +
+ +static __always_inline void kasan_free_pages(struct page *page,
+ +                                           unsigned int order)
+ +{
+ +      /* Only available for integrated init. */
+ +      BUILD_BUG();
+ +}
+ +
   #endif /* CONFIG_KASAN_HW_TAGS */
   
+ +#ifdef CONFIG_KASAN
+ +
+ +struct kasan_cache {
+ +      int alloc_meta_offset;
+ +      int free_meta_offset;
+ +      bool is_kmalloc;
+ +};
+ +
   slab_flags_t __kasan_never_merge(void);
   static __always_inline slab_flags_t kasan_never_merge(void)
   {
@@@ -148,20 -129,20 +147,20 @@@ static __always_inline void kasan_unpoi
                 __kasan_unpoison_range(addr, size);
   }
   
- -void __kasan_alloc_pages(struct page *page, unsigned int order, bool init);
- -static __always_inline void kasan_alloc_pages(struct page *page,
+ +void __kasan_poison_pages(struct page *page, unsigned int order, bool init);
+ +static __always_inline void kasan_poison_pages(struct page *page,
                                                 unsigned int order, bool init)
   {
         if (kasan_enabled())
- -              __kasan_alloc_pages(page, order, init);
+ +              __kasan_poison_pages(page, order, init);
   }
   
- -void __kasan_free_pages(struct page *page, unsigned int order, bool init);
- -static __always_inline void kasan_free_pages(struct page *page,
- -                                              unsigned int order, bool init)
+ +void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init);
+ +static __always_inline void kasan_unpoison_pages(struct page *page,
+ +                                               unsigned int order, bool init)
   {
         if (kasan_enabled())
- -              __kasan_free_pages(page, order, init);
+ +              __kasan_unpoison_pages(page, order, init);
   }
   
   void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
@@@ -303,15 -284,21 +302,15 @@@ void kasan_restore_multi_shot(bool enab
   
   #else /* CONFIG_KASAN */
   
- -static inline bool kasan_enabled(void)
- -{
- -      return false;
- -}
- -static inline bool kasan_has_integrated_init(void)
- -{
- -      return false;
- -}
   static inline slab_flags_t kasan_never_merge(void)
   {
         return 0;
   }
   static inline void kasan_unpoison_range(const void *address, size_t size) {}
- -static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {}
- -static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {}
+ +static inline void kasan_poison_pages(struct page *page, unsigned int order,
+ +                                    bool init) {}
+ +static inline void kasan_unpoison_pages(struct page *page, unsigned int order,
+ +                                      bool init) {}
   static inline void kasan_cache_create(struct kmem_cache *cache,
                                       unsigned int *size,
                                       slab_flags_t *flags) {}
diff --combined include/linux/kthread.h

index d9133d6,db3eafe..346b0f2
--- 1/include/linux/kthread.h
--- 2/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@@ -18,7 -18,7 +18,7 @@@ struct task_struct *kthread_create_on_n
    * @threadfn: the function to run in the thread
    * @data: data pointer for @threadfn()
    * @namefmt: printf-style format string for the thread name
-  * @arg...: arguments for @namefmt.
+  * @arg: arguments for @namefmt.
    *
    * This macro will create a kthread on the current node, leaving it in
    * the stopped state.  This is just a helper for kthread_create_on_node();
@@@ -33,8 -33,6 +33,8 @@@ struct task_struct *kthread_create_on_c
                                           unsigned int cpu,
                                           const char *namefmt);
   
+ +void set_kthread_struct(struct task_struct *p);
+ +
   void kthread_set_per_cpu(struct task_struct *k, int cpu);
   bool kthread_is_per_cpu(struct task_struct *k);
   
diff --combined include/linux/mm.h

index 01ecf9e,07922ee..6d0f827
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -46,7 -46,7 +46,7 @@@ extern int sysctl_page_lock_unfairness
   
   void init_mm_internals(void);
   
- #ifndef CONFIG_NEED_MULTIPLE_NODES    /* Don't use mapnrs, do it properly */
+ #ifndef CONFIG_NUMA           /* Don't use mapnrs, do it properly */
   extern unsigned long max_mapnr;
   
   static inline void set_max_mapnr(unsigned long limit)
@@@ -234,7 -234,11 +234,11 @@@ int overcommit_policy_handler(struct ct
   int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                 pgoff_t index, gfp_t gfp, void **shadowp);
   
+ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
   #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
+ #else
+ #define nth_page(page,n) ((page) + (n))
+ #endif
   
   /* to align the pointer to the (next) page boundary */
   #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
@@@ -1341,7 -1345,7 +1345,7 @@@ static inline bool page_needs_cow_for_d
         if (!is_cow_mapping(vma->vm_flags))
                 return false;
   
-       if (!atomic_read(&vma->vm_mm->has_pinned))
+       if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
                 return false;
   
         return page_maybe_dma_pinned(page);
@@@ -1709,8 -1713,8 +1713,8 @@@ extern bool can_do_mlock(void)
   #else
   static inline bool can_do_mlock(void) { return false; }
   #endif
- -extern int user_shm_lock(size_t, struct user_struct *);
- -extern void user_shm_unlock(size_t, struct user_struct *);
+ +extern int user_shm_lock(size_t, struct ucounts *);
+ +extern void user_shm_unlock(size_t, struct ucounts *);
   
   /*
    * Parameter block passed down to zap_pte_range in exceptional cases.
@@@ -1850,12 -1854,8 +1854,8 @@@ extern int try_to_release_page(struct p
   extern void do_invalidatepage(struct page *page, unsigned int offset,
                               unsigned int length);
   
- void __set_page_dirty(struct page *, struct address_space *, int warn);
- int __set_page_dirty_nobuffers(struct page *page);
- int __set_page_dirty_no_writeback(struct page *page);
   int redirty_page_for_writepage(struct writeback_control *wbc,
                                 struct page *page);
- void account_page_dirtied(struct page *page, struct address_space *mapping);
   void account_page_cleaned(struct page *page, struct address_space *mapping,
                           struct bdi_writeback *wb);
   int set_page_dirty(struct page *page);
@@@ -2420,7 -2420,7 +2420,7 @@@ static inline unsigned long free_initme
         extern char __init_begin[], __init_end[];
   
         return free_reserved_area(&__init_begin, &__init_end,
-                                 poison, "unused kernel");
+                                 poison, "unused kernel image (initmem)");
   }
   
   static inline unsigned long get_num_physpages(void)
@@@ -2460,7 -2460,7 +2460,7 @@@ extern void get_pfn_range_for_nid(unsig
                         unsigned long *start_pfn, unsigned long *end_pfn);
   extern unsigned long find_min_pfn_with_active_regions(void);
   
- #ifndef CONFIG_NEED_MULTIPLE_NODES
+ #ifndef CONFIG_NUMA
   static inline int early_pfn_to_nid(unsigned long pfn)
   {
         return 0;
@@@ -2474,7 -2474,6 +2474,6 @@@ extern void set_dma_reserve(unsigned lo
   extern void memmap_init_range(unsigned long, int, unsigned long,
                 unsigned long, unsigned long, enum meminit_context,
                 struct vmem_altmap *, int migratetype);
- extern void memmap_init_zone(struct zone *zone);
   extern void setup_per_zone_wmarks(void);
   extern int __meminit init_per_zone_wmark_min(void);
   extern void mem_init(void);
@@@ -2681,17 -2680,45 +2680,45 @@@ extern struct vm_area_struct * find_vma
   extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                              struct vm_area_struct **pprev);
   
- /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
-    NULL if none.  Assume start_addr < end_addr. */
- static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+ /**
+  * find_vma_intersection() - Look up the first VMA which intersects the interval
+  * @mm: The process address space.
+  * @start_addr: The inclusive start user address.
+  * @end_addr: The exclusive end user address.
+  *
+  * Returns: The first VMA within the provided range, %NULL otherwise.  Assumes
+  * start_addr < end_addr.
+  */
+ static inline
+ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+                                            unsigned long start_addr,
+                                            unsigned long end_addr)
   {
-       struct vm_area_struct * vma = find_vma(mm,start_addr);
+       struct vm_area_struct *vma = find_vma(mm, start_addr);
   
         if (vma && end_addr <= vma->vm_start)
                 vma = NULL;
         return vma;
   }
   
+ /**
+  * vma_lookup() - Find a VMA at a specific address
+  * @mm: The process address space.
+  * @addr: The user address.
+  *
+  * Return: The vm_area_struct at the given address, %NULL otherwise.
+  */
+ static inline
+ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
+ {
+       struct vm_area_struct *vma = find_vma(mm, addr);
+ 
+       if (vma && addr < vma->vm_start)
+               vma = NULL;
+ 
+       return vma;
+ }
+ 
   static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
   {
         unsigned long vm_start = vma->vm_start;
diff --combined include/linux/page-flags.h

index 40e2c50,d8e2624..4586965
--- 1/include/linux/page-flags.h
--- 2/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@@ -137,9 -137,6 +137,9 @@@ enum pageflags 
   #endif
   #ifdef CONFIG_64BIT
         PG_arch_2,
+ +#endif
+ +#ifdef CONFIG_KASAN_HW_TAGS
+ +      PG_skip_kasan_poison,
   #endif
         __NR_PAGEFLAGS,
   
@@@ -180,17 -177,17 +180,17 @@@
   
   #ifndef __GENERATING_BOUNDS_H
   
- struct page;  /* forward declaration */
- 
- static inline struct page *compound_head(struct page *page)
+ static inline unsigned long _compound_head(const struct page *page)
   {
         unsigned long head = READ_ONCE(page->compound_head);
   
         if (unlikely(head & 1))
-               return (struct page *) (head - 1);
-       return page;
+               return head - 1;
+       return (unsigned long)page;
   }
   
+ #define compound_head(page)   ((typeof(page))_compound_head(page))
+ 
   static __always_inline int PageTail(struct page *page)
   {
         return READ_ONCE(page->compound_head) & 1;
@@@ -446,12 -443,6 +446,12 @@@ TESTCLEARFLAG(Young, young, PF_ANY
   PAGEFLAG(Idle, idle, PF_ANY)
   #endif
   
+ +#ifdef CONFIG_KASAN_HW_TAGS
+ +PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
+ +#else
+ +PAGEFLAG_FALSE(SkipKASanPoison)
+ +#endif
+ +
   /*
    * PageReported() is used to track reported free pages within the Buddy
    * allocator. We can use the non-atomic version of the test and set
diff --combined include/linux/printk.h

index 1790a55,f589b8b..d796183
--- 1/include/linux/printk.h
--- 2/include/linux/printk.h
+++ b/include/linux/printk.h
@@@ -206,6 -206,7 +206,7 @@@ void __init setup_log_buf(int early)
   __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
   void dump_stack_print_info(const char *log_lvl);
   void show_regs_print_info(const char *log_lvl);
+ extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
   extern asmlinkage void dump_stack(void) __cold;
   extern void printk_safe_flush(void);
   extern void printk_safe_flush_on_panic(void);
@@@ -269,6 -270,10 +270,10 @@@ static inline void show_regs_print_info
   {
   }
   
+ static inline void dump_stack_lvl(const char *log_lvl)
+ {
+ }
+ 
   static inline void dump_stack(void)
   {
   }
@@@ -282,47 -287,6 +287,47 @@@ static inline void printk_safe_flush_on
   }
   #endif
   
+ +#ifdef CONFIG_SMP
+ +extern int __printk_cpu_trylock(void);
+ +extern void __printk_wait_on_cpu_lock(void);
+ +extern void __printk_cpu_unlock(void);
+ +
+ +/**
+ + * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning
+ + *                             lock and disable interrupts.
+ + * @flags: Stack-allocated storage for saving local interrupt state,
+ + *         to be passed to printk_cpu_unlock_irqrestore().
+ + *
+ + * If the lock is owned by another CPU, spin until it becomes available.
+ + * Interrupts are restored while spinning.
+ + */
+ +#define printk_cpu_lock_irqsave(flags)                \
+ +      for (;;) {                              \
+ +              local_irq_save(flags);          \
+ +              if (__printk_cpu_trylock())     \
+ +                      break;                  \
+ +              local_irq_restore(flags);       \
+ +              __printk_wait_on_cpu_lock();    \
+ +      }
+ +
+ +/**
+ + * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning
+ + *                                  lock and restore interrupts.
+ + * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave().
+ + */
+ +#define printk_cpu_unlock_irqrestore(flags)   \
+ +      do {                                    \
+ +              __printk_cpu_unlock();          \
+ +              local_irq_restore(flags);       \
+ +      } while (0)
+ +
+ +#else
+ +
+ +#define printk_cpu_lock_irqsave(flags) ((void)flags)
+ +#define printk_cpu_unlock_irqrestore(flags) ((void)flags)
+ +
+ +#endif /* CONFIG_SMP */
+ +
   extern int kptr_restrict;
   
   /**
diff --combined kernel/events/core.c

index 4576413,1c5e324..4649170
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -132,7 -132,6 +132,7 @@@ task_function_call(struct task_struct *
   
   /**
    * cpu_function_call - call a function on the cpu
+ + * @cpu:      target cpu to queue this function
    * @func:     the function to be called
    * @info:     the function call argument
    *
@@@ -3822,16 -3821,9 +3822,16 @@@ static void perf_event_context_sched_in
                                         struct task_struct *task)
   {
         struct perf_cpu_context *cpuctx;
- -      struct pmu *pmu = ctx->pmu;
+ +      struct pmu *pmu;
   
         cpuctx = __get_cpu_context(ctx);
+ +
+ +      /*
+ +       * HACK: for HETEROGENEOUS the task context might have switched to a
+ +       * different PMU, force (re)set the context,
+ +       */
+ +      pmu = ctx->pmu = cpuctx->ctx.pmu;
+ +
         if (cpuctx->task_ctx == ctx) {
                 if (cpuctx->sched_cb_usage)
                         __perf_pmu_sched_task(cpuctx, true);
@@@ -6677,10 -6669,10 +6677,10 @@@ out
         return data->aux_size;
   }
   
- -long perf_pmu_snapshot_aux(struct perf_buffer *rb,
- -                         struct perf_event *event,
- -                         struct perf_output_handle *handle,
- -                         unsigned long size)
+ +static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ +                                 struct perf_event *event,
+ +                                 struct perf_output_handle *handle,
+ +                                 unsigned long size)
   {
         unsigned long flags;
         long ret;
@@@ -8309,8 -8301,6 +8309,6 @@@ static void perf_event_mmap_event(struc
   
         if (vma->vm_flags & VM_DENYWRITE)
                 flags |= MAP_DENYWRITE;
-       if (vma->vm_flags & VM_MAYEXEC)
-               flags |= MAP_EXECUTABLE;
         if (vma->vm_flags & VM_LOCKED)
                 flags |= MAP_LOCKED;
         if (is_vm_hugetlb_page(vma))
@@@ -8690,12 -8680,13 +8688,12 @@@ static void perf_event_switch(struct ta
                 },
         };
   
- -      if (!sched_in && task->state == TASK_RUNNING)
+ +      if (!sched_in && task->on_rq) {
                 switch_event.event_id.header.misc |=
                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+ +      }
   
- -      perf_iterate_sb(perf_event_switch_output,
- -                     &switch_event,
- -                     NULL);
+ +      perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
   }
   
   /*
@@@ -11926,7 -11917,6 +11924,7 @@@ again
    * @pid:              target pid
    * @cpu:              target cpu
    * @group_fd:         group leader event fd
+ + * @flags:            perf event open flags
    */
   SYSCALL_DEFINE5(perf_event_open,
                 struct perf_event_attr __user *, attr_uptr,
@@@ -12383,8 -12373,6 +12381,8 @@@ err_fd
    * @attr: attributes of the counter to create
    * @cpu: cpu in which the counter is bound
    * @task: task to profile (NULL for percpu)
+ + * @overflow_handler: callback to trigger when we hit the event
+ + * @context: context data could be used in overflow_handler callback
    */
   struct perf_event *
   perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
diff --combined kernel/events/uprobes.c

index a481ef6,907d4ee..af24dc3
--- 1/kernel/events/uprobes.c
--- 2/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@@ -453,7 -453,6 +453,7 @@@ static int update_ref_ctr(struct uprob
    * that have fixed length instructions.
    *
    * uprobe_write_opcode - write the opcode at a given virtual address.
+ + * @auprobe: arch specific probepoint information.
    * @mm: the probed process address space.
    * @vaddr: the virtual address to store the opcode.
    * @opcode: opcode to be written at @vaddr.
@@@ -2047,8 -2046,8 +2047,8 @@@ static struct uprobe *find_active_uprob
         struct vm_area_struct *vma;
   
         mmap_read_lock(mm);
-       vma = find_vma(mm, bp_vaddr);
-       if (vma && vma->vm_start <= bp_vaddr) {
+       vma = vma_lookup(mm, bp_vaddr);
+       if (vma) {
                 if (valid_vma(vma, false)) {
                         struct inode *inode = file_inode(vma->vm_file);
                         loff_t offset = vaddr_to_offset(vma, bp_vaddr);
diff --combined kernel/fork.c

index b4386ff,c6747d5..bc94b2c
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -425,7 -425,7 +425,7 @@@ static int memcg_charge_kernel_stack(st
   
   static void release_task_stack(struct task_struct *tsk)
   {
- -      if (WARN_ON(tsk->state != TASK_DEAD))
+ +      if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
                 return;  /* Better to leak the stack than to free prematurely */
   
         account_kernel_stack(tsk, -1);
@@@ -742,7 -742,6 +742,7 @@@ void __put_task_struct(struct task_stru
         exit_creds(tsk);
         delayacct_tsk_free(tsk);
         put_signal_struct(tsk->signal);
+ +      sched_core_free(tsk);
   
         if (!profile_handoff_task(tsk))
                 free_task(tsk);
@@@ -825,14 -824,9 +825,14 @@@ void __init fork_init(void
         init_task.signal->rlim[RLIMIT_SIGPENDING] =
                 init_task.signal->rlim[RLIMIT_NPROC];
   
- -      for (i = 0; i < UCOUNT_COUNTS; i++)
+ +      for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
                 init_user_ns.ucount_max[i] = max_threads/2;
   
+ +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
+ +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
+ +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
+ +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
+ +
   #ifdef CONFIG_VMAP_STACK
         cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                           NULL, free_vm_stack_cache);
@@@ -1035,7 -1029,6 +1035,6 @@@ static struct mm_struct *mm_init(struc
         mm_pgtables_bytes_init(mm);
         mm->map_count = 0;
         mm->locked_vm = 0;
-       atomic_set(&mm->has_pinned, 0);
         atomic64_set(&mm->pinned_vm, 0);
         memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
         spin_lock_init(&mm->page_table_lock);
@@@ -1983,7 -1976,8 +1982,7 @@@ static __latent_entropy struct task_str
         DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
   #endif
         retval = -EAGAIN;
- -      if (atomic_read(&p->real_cred->user->processes) >=
- -                      task_rlimit(p, RLIMIT_NPROC)) {
+ +      if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                 if (p->real_cred->user != INIT_USER &&
                     !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                         goto bad_fork_free;
@@@ -2004,7 -1998,7 +2003,7 @@@
                 goto bad_fork_cleanup_count;
   
         delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
- -      p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
+ +      p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
         p->flags |= PF_FORKNOEXEC;
         INIT_LIST_HEAD(&p->children);
         INIT_LIST_HEAD(&p->sibling);
@@@ -2254,8 -2248,6 +2253,8 @@@
   
         klp_copy_process(p);
   
+ +      sched_core_fork(p);
+ +
         spin_lock(&current->sighand->siglock);
   
         /*
@@@ -2343,7 -2335,6 +2342,7 @@@
         return p;
   
   bad_fork_cancel_cgroup:
+ +      sched_core_free(p);
         spin_unlock(&current->sighand->siglock);
         write_unlock_irq(&tasklist_lock);
         cgroup_cancel_fork(p, args);
@@@ -2392,10 -2383,10 +2391,10 @@@ bad_fork_cleanup_threadgroup_lock
   #endif
         delayacct_tsk_free(p);
   bad_fork_cleanup_count:
- -      atomic_dec(&p->cred->user->processes);
+ +      dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
         exit_creds(p);
   bad_fork_free:
- -      p->state = TASK_DEAD;
+ +      WRITE_ONCE(p->__state, TASK_DEAD);
         put_task_stack(p);
         delayed_free_task(p);
   fork_out:
@@@ -2415,7 -2406,7 +2414,7 @@@ static inline void init_idle_pids(struc
         }
   }
   
- -struct task_struct *fork_idle(int cpu)
+ +struct task_struct * __init fork_idle(int cpu)
   {
         struct task_struct *task;
         struct kernel_clone_args args = {
@@@ -3005,12 -2996,6 +3004,12 @@@ int ksys_unshare(unsigned long unshare_
         if (err)
                 goto bad_unshare_cleanup_cred;
   
+ +      if (new_cred) {
+ +              err = set_cred_ucounts(new_cred);
+ +              if (err)
+ +                      goto bad_unshare_cleanup_cred;
+ +      }
+ +
         if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                 if (do_sysvsem) {
                         /*
diff --combined kernel/kthread.c

index 6b0a30a,86ae5f2..5b37a85
--- 1/kernel/kthread.c
--- 2/kernel/kthread.c
+++ b/kernel/kthread.c
@@@ -68,6 -68,16 +68,6 @@@ enum KTHREAD_BITS 
         KTHREAD_SHOULD_PARK,
   };
   
- -static inline void set_kthread_struct(void *kthread)
- -{
- -      /*
- -       * We abuse ->set_child_tid to avoid the new member and because it
- -       * can't be wrongly copied by copy_process(). We also rely on fact
- -       * that the caller can't exec, so PF_KTHREAD can't be cleared.
- -       */
- -      current->set_child_tid = (__force void __user *)kthread;
- -}
- -
   static inline struct kthread *to_kthread(struct task_struct *k)
   {
         WARN_ON(!(k->flags & PF_KTHREAD));
@@@ -93,22 -103,6 +93,22 @@@ static inline struct kthread *__to_kthr
         return kthread;
   }
   
+ +void set_kthread_struct(struct task_struct *p)
+ +{
+ +      struct kthread *kthread;
+ +
+ +      if (__to_kthread(p))
+ +              return;
+ +
+ +      kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
+ +      /*
+ +       * We abuse ->set_child_tid to avoid the new member and because it
+ +       * can't be wrongly copied by copy_process(). We also rely on the fact
+ +       * that the caller can't exec, so PF_KTHREAD can't be cleared.
+ +       */
+ +      p->set_child_tid = (__force void __user *)kthread;
+ +}
+ +
   void free_kthread_struct(struct task_struct *k)
   {
         struct kthread *kthread;
@@@ -278,8 -272,8 +278,8 @@@ static int kthread(void *_create
         struct kthread *self;
         int ret;
   
- -      self = kzalloc(sizeof(*self), GFP_KERNEL);
- -      set_kthread_struct(self);
+ +      set_kthread_struct(current);
+ +      self = to_kthread(current);
   
         /* If user was SIGKILLed, I release the structure. */
         done = xchg(&create->done, NULL);
@@@ -457,7 -451,7 +457,7 @@@ struct task_struct *kthread_create_on_n
   }
   EXPORT_SYMBOL(kthread_create_on_node);
   
- -static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
+ +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
   {
         unsigned long flags;
   
@@@ -473,7 -467,7 +473,7 @@@
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   }
   
- -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+ +static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
   {
         __kthread_bind_mask(p, cpumask_of(cpu), state);
   }
@@@ -1162,14 -1156,14 +1162,14 @@@ static bool __kthread_cancel_work(struc
    * modify @dwork's timer so that it expires after @delay. If @delay is zero,
    * @work is guaranteed to be queued immediately.
    *
-  * Return: %true if @dwork was pending and its timer was modified,
-  * %false otherwise.
+  * Return: %false if @dwork was idle and queued, %true otherwise.
    *
    * A special case is when the work is being canceled in parallel.
    * It might be caused either by the real kthread_cancel_delayed_work_sync()
    * or yet another kthread_mod_delayed_work() call. We let the other command
-  * win and return %false here. The caller is supposed to synchronize these
-  * operations a reasonable way.
+  * win and return %true here. The return value can be used for reference
+  * counting and the number of queued works stays the same. Anyway, the caller
+  * is supposed to synchronize these operations in a reasonable way.
    *
    * This function is safe to call from any context including IRQ handler.
    * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
@@@ -1181,13 -1175,15 +1181,15 @@@ bool kthread_mod_delayed_work(struct kt
   {
         struct kthread_work *work = &dwork->work;
         unsigned long flags;
-       int ret = false;
+       int ret;
   
         raw_spin_lock_irqsave(&worker->lock, flags);
   
         /* Do not bother with canceling when never queued. */
-       if (!work->worker)
+       if (!work->worker) {
+               ret = false;
                 goto fast_queue;
+       }
   
         /* Work must not be used with >1 worker, see kthread_queue_work() */
         WARN_ON_ONCE(work->worker != worker);
@@@ -1205,8 -1201,11 +1207,11 @@@
          * be used for reference counting.
          */
         kthread_cancel_delayed_work_timer(work, &flags);
-       if (work->canceling)
+       if (work->canceling) {
+               /* The number of works in the queue does not change. */
+               ret = true;
                 goto out;
+       }
         ret = __kthread_cancel_work(work);
   
   fast_queue:
diff --combined kernel/sysctl.c

index 8c8c220,69d925f..bade842
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -71,7 -71,6 +71,7 @@@
   #include <linux/coredump.h>
   #include <linux/latencytop.h>
   #include <linux/pid.h>
+ +#include <linux/delayacct.h>
   
   #include "../lib/kstrtox.h"
   
@@@ -1748,17 -1747,6 +1748,17 @@@ static struct ctl_table kern_table[] = 
                 .extra2         = SYSCTL_ONE,
         },
   #endif /* CONFIG_SCHEDSTATS */
+ +#ifdef CONFIG_TASK_DELAY_ACCT
+ +      {
+ +              .procname       = "task_delayacct",
+ +              .data           = NULL,
+ +              .maxlen         = sizeof(unsigned int),
+ +              .mode           = 0644,
+ +              .proc_handler   = sysctl_delayacct,
+ +              .extra1         = SYSCTL_ZERO,
+ +              .extra2         = SYSCTL_ONE,
+ +      },
+ +#endif /* CONFIG_TASK_DELAY_ACCT */
   #ifdef CONFIG_NUMA_BALANCING
         {
                 .procname       = "numa_balancing",
@@@ -2921,11 -2909,11 +2921,11 @@@ static struct ctl_table vm_table[] = 
                 .extra2         = &one_thousand,
         },
         {
-               .procname       = "percpu_pagelist_fraction",
-               .data           = &percpu_pagelist_fraction,
-               .maxlen         = sizeof(percpu_pagelist_fraction),
+               .procname       = "percpu_pagelist_high_fraction",
+               .data           = &percpu_pagelist_high_fraction,
+               .maxlen         = sizeof(percpu_pagelist_high_fraction),
                 .mode           = 0644,
-               .proc_handler   = percpu_pagelist_fraction_sysctl_handler,
+               .proc_handler   = percpu_pagelist_high_fraction_sysctl_handler,
                 .extra1         = SYSCTL_ZERO,
         },
         {
diff --combined lib/Kconfig.debug

index b9e223b,deca67d..1c9857f
--- 1/lib/Kconfig.debug
--- 2/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@@ -313,6 -313,9 +313,9 @@@ config DEBUG_INFO_BT
   config PAHOLE_HAS_SPLIT_BTF
         def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119")
   
+ config PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT
+       def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "122")
+ 
   config DEBUG_INFO_BTF_MODULES
         def_bool y
         depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF
@@@ -1372,6 -1375,7 +1375,6 @@@ config LOCKDE
         bool
         depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
         select STACKTRACE
- -      depends on FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86
         select KALLSYMS
         select KALLSYMS_ALL
   
@@@ -2180,9 -2184,6 +2183,9 @@@ config TEST_KSTRTO
   config TEST_PRINTF
         tristate "Test printf() family of functions at runtime"
   
+ +config TEST_SCANF
+ +      tristate "Test scanf() family of functions at runtime"
+ +
   config TEST_BITMAP
         tristate "Test bitmap_*() family of functions at runtime"
         help
@@@ -2431,6 -2432,18 +2434,18 @@@ config BITS_TES
   
           If unsure, say N.
   
+ config SLUB_KUNIT_TEST
+       tristate "KUnit test for SLUB cache error detection" if !KUNIT_ALL_TESTS
+       depends on SLUB_DEBUG && KUNIT
+       default KUNIT_ALL_TESTS
+       help
+         This builds SLUB allocator unit test.
+         Tests SLUB cache debugging functionality.
+         For more information on KUnit and unit tests in general please refer
+         to the KUnit documentation in Documentation/dev-tools/kunit/.
+ 
+         If unsure, say N.
+ 
   config TEST_UDELAY
         tristate "udelay test driver"
         help
@@@ -2573,18 -2586,6 +2588,18 @@@ config TEST_FP
   
           If unsure, say N.
   
+ +config TEST_CLOCKSOURCE_WATCHDOG
+ +      tristate "Test clocksource watchdog in kernel space"
+ +      depends on CLOCKSOURCE_WATCHDOG
+ +      help
+ +        Enable this option to create a kernel module that will trigger
+ +        a test of the clocksource watchdog.  This module may be loaded
+ +        via modprobe or insmod in which case it will run upon being
+ +        loaded, or it may be built in, in which case it will run
+ +        shortly after boot.
+ +
+ +        If unsure, say N.
+ +
   endif # RUNTIME_TESTING_MENU
   
   config ARCH_USE_MEMTEST
diff --combined lib/Makefile

index a93f080,6d5ea8f..6d765d5
--- 1/lib/Makefile
--- 2/lib/Makefile
+++ b/lib/Makefile
@@@ -83,7 -83,6 +83,7 @@@ obj-$(CONFIG_TEST_USER_COPY) += test_us
   obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
   obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
   obj-$(CONFIG_TEST_PRINTF) += test_printf.o
+ +obj-$(CONFIG_TEST_SCANF) += test_scanf.o
   obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
   obj-$(CONFIG_TEST_STRSCPY) += test_strscpy.o
   obj-$(CONFIG_TEST_UUID) += test_uuid.o
@@@ -355,5 -354,6 +355,6 @@@ obj-$(CONFIG_LIST_KUNIT_TEST) += list-t
   obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o
   obj-$(CONFIG_BITS_TEST) += test_bits.o
   obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
+ obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o
   
   obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
diff --combined lib/dump_stack.c

index 5ebf437,586e3f2..27f1687
--- 1/lib/dump_stack.c
--- 2/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@@ -73,10 -73,10 +73,10 @@@ void show_regs_print_info(const char *l
         dump_stack_print_info(log_lvl);
   }
   
- static void __dump_stack(void)
+ static void __dump_stack(const char *log_lvl)
   {
-       dump_stack_print_info(KERN_DEFAULT);
-       show_stack(NULL, NULL, KERN_DEFAULT);
+       dump_stack_print_info(log_lvl);
+       show_stack(NULL, NULL, log_lvl);
   }
   
   /**
@@@ -84,16 -84,56 +84,22 @@@
    *
    * Architectures can override this implementation by implementing its own.
    */
- asmlinkage __visible void dump_stack(void)
- -#ifdef CONFIG_SMP
- -static atomic_t dump_lock = ATOMIC_INIT(-1);
- -
+ asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
   {
         unsigned long flags;
- -      int was_locked;
- -      int old;
- -      int cpu;
   
         /*
          * Permit this cpu to perform nested stack dumps while serialising
          * against other CPUs
          */
- -retry:
- -      local_irq_save(flags);
- -      cpu = smp_processor_id();
- -      old = atomic_cmpxchg(&dump_lock, -1, cpu);
- -      if (old == -1) {
- -              was_locked = 0;
- -      } else if (old == cpu) {
- -              was_locked = 1;
- -      } else {
- -              local_irq_restore(flags);
- -              /*
- -               * Wait for the lock to release before jumping to
- -               * atomic_cmpxchg() in order to mitigate the thundering herd
- -               * problem.
- -               */
- -              do { cpu_relax(); } while (atomic_read(&dump_lock) != -1);
- -              goto retry;
- -      }
- -
- -      __dump_stack(log_lvl);
- -
- -      if (!was_locked)
- -              atomic_set(&dump_lock, -1);
- -
- -      local_irq_restore(flags);
- -}
- -#else
- -asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
- -{
+ +      printk_cpu_lock_irqsave(flags);
-       __dump_stack();
+       __dump_stack(log_lvl);
+ +      printk_cpu_unlock_irqrestore(flags);
   }
- -#endif
+ EXPORT_SYMBOL(dump_stack_lvl);
+ 
+ asmlinkage __visible void dump_stack(void)
+ {
+       dump_stack_lvl(KERN_DEFAULT);
+ }
   EXPORT_SYMBOL(dump_stack);
diff --combined lib/vsprintf.c

index ea65ec5,cc281f5..e5c7afb
--- 1/lib/vsprintf.c
--- 2/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@@ -53,31 -53,6 +53,31 @@@
   #include <linux/string_helpers.h>
   #include "kstrtox.h"
   
+ +static unsigned long long simple_strntoull(const char *startp, size_t max_chars,
+ +                                         char **endp, unsigned int base)
+ +{
+ +      const char *cp;
+ +      unsigned long long result = 0ULL;
+ +      size_t prefix_chars;
+ +      unsigned int rv;
+ +
+ +      cp = _parse_integer_fixup_radix(startp, &base);
+ +      prefix_chars = cp - startp;
+ +      if (prefix_chars < max_chars) {
+ +              rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars);
+ +              /* FIXME */
+ +              cp += (rv & ~KSTRTOX_OVERFLOW);
+ +      } else {
+ +              /* Field too short for prefix + digit, skip over without converting */
+ +              cp = startp + max_chars;
+ +      }
+ +
+ +      if (endp)
+ +              *endp = (char *)cp;
+ +
+ +      return result;
+ +}
+ +
   /**
    * simple_strtoull - convert a string to an unsigned long long
    * @cp: The start of the string
@@@ -88,7 -63,18 +88,7 @@@
    */
   unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
   {
- -      unsigned long long result;
- -      unsigned int rv;
- -
- -      cp = _parse_integer_fixup_radix(cp, &base);
- -      rv = _parse_integer(cp, base, &result);
- -      /* FIXME */
- -      cp += (rv & ~KSTRTOX_OVERFLOW);
- -
- -      if (endp)
- -              *endp = (char *)cp;
- -
- -      return result;
+ +      return simple_strntoull(cp, INT_MAX, endp, base);
   }
   EXPORT_SYMBOL(simple_strtoull);
   
@@@ -123,21 -109,6 +123,21 @@@ long simple_strtol(const char *cp, cha
   }
   EXPORT_SYMBOL(simple_strtol);
   
+ +static long long simple_strntoll(const char *cp, size_t max_chars, char **endp,
+ +                               unsigned int base)
+ +{
+ +      /*
+ +       * simple_strntoull() safely handles receiving max_chars==0 in the
+ +       * case cp[0] == '-' && max_chars == 1.
+ +       * If max_chars == 0 we can drop through and pass it to simple_strntoull()
+ +       * and the content of *cp is irrelevant.
+ +       */
+ +      if (*cp == '-' && max_chars > 0)
+ +              return -simple_strntoull(cp + 1, max_chars - 1, endp, base);
+ +
+ +      return simple_strntoull(cp, max_chars, endp, base);
+ +}
+ +
   /**
    * simple_strtoll - convert a string to a signed long long
    * @cp: The start of the string
@@@ -148,7 -119,10 +148,7 @@@
    */
   long long simple_strtoll(const char *cp, char **endp, unsigned int base)
   {
- -      if (*cp == '-')
- -              return -simple_strtoull(cp + 1, endp, base);
- -
- -      return simple_strtoull(cp, endp, base);
+ +      return simple_strntoll(cp, INT_MAX, endp, base);
   }
   EXPORT_SYMBOL(simple_strtoll);
   
@@@ -1860,8 -1834,7 +1860,8 @@@ char *rtc_str(char *buf, char *end, con
               struct printf_spec spec, const char *fmt)
   {
         bool have_t = true, have_d = true;
- -      bool raw = false;
+ +      bool raw = false, iso8601_separator = true;
+ +      bool found = true;
         int count = 2;
   
         if (check_pointer(&buf, end, tm, spec))
@@@ -1878,25 -1851,14 +1878,25 @@@
                 break;
         }
   
- -      raw = fmt[count] == 'r';
+ +      do {
+ +              switch (fmt[count++]) {
+ +              case 'r':
+ +                      raw = true;
+ +                      break;
+ +              case 's':
+ +                      iso8601_separator = false;
+ +                      break;
+ +              default:
+ +                      found = false;
+ +                      break;
+ +              }
+ +      } while (found);
   
         if (have_d)
                 buf = date_str(buf, end, tm, raw);
         if (have_d && have_t) {
- -              /* Respect ISO 8601 */
                 if (buf < end)
- -                      *buf = 'T';
+ +                      *buf = iso8601_separator ? 'T' : ' ';
                 buf++;
         }
         if (have_t)
@@@ -2224,7 -2186,7 +2224,7 @@@ char *fwnode_string(char *buf, char *en
   bool no_hash_pointers __ro_after_init;
   EXPORT_SYMBOL_GPL(no_hash_pointers);
   
- static int __init no_hash_pointers_enable(char *str)
+ int __init no_hash_pointers_enable(char *str)
   {
         if (no_hash_pointers)
                 return 0;
@@@ -2336,7 -2298,7 +2336,7 @@@ early_param("no_hash_pointers", no_hash
    * - 'd[234]' For a dentry name (optionally 2-4 last components)
    * - 'D[234]' Same as 'd' but for a struct file
    * - 'g' For block_device name (gendisk + partition number)
- - * - 't[RT][dt][r]' For time and date as represented by:
+ + * - 't[RT][dt][r][s]' For time and date as represented by:
    *      R    struct rtc_time
    *      T    time64_t
    * - 'C' For a clock, it prints the name (Common Clock Framework) or address
@@@ -3603,12 -3565,8 +3603,12 @@@ int vsscanf(const char *buf, const cha
                 str = skip_spaces(str);
   
                 digit = *str;
- -              if (is_sign && digit == '-')
+ +              if (is_sign && digit == '-') {
+ +                      if (field_width == 1)
+ +                              break;
+ +
                         digit = *(str + 1);
+ +              }
   
                 if (!digit
                     || (base == 16 && !isxdigit(digit))
@@@ -3618,13 -3576,25 +3618,13 @@@
                         break;
   
                 if (is_sign)
- -                      val.s = qualifier != 'L' ?
- -                              simple_strtol(str, &next, base) :
- -                              simple_strtoll(str, &next, base);
+ +                      val.s = simple_strntoll(str,
+ +                                              field_width >= 0 ? field_width : INT_MAX,
+ +                                              &next, base);
                 else
- -                      val.u = qualifier != 'L' ?
- -                              simple_strtoul(str, &next, base) :
- -                              simple_strtoull(str, &next, base);
- -
- -              if (field_width > 0 && next - str > field_width) {
- -                      if (base == 0)
- -                              _parse_integer_fixup_radix(str, &base);
- -                      while (next - str > field_width) {
- -                              if (is_sign)
- -                                      val.s = div_s64(val.s, base);
- -                              else
- -                                      val.u = div_u64(val.u, base);
- -                              --next;
- -                      }
- -              }
+ +                      val.u = simple_strntoull(str,
+ +                                               field_width >= 0 ? field_width : INT_MAX,
+ +                                               &next, base);
   
                 switch (qualifier) {
                 case 'H':       /* that's 'hh' in format */
diff --combined mm/compaction.c

index 725f564,7d41b58..3a509fb
--- 1/mm/compaction.c
--- 2/mm/compaction.c
+++ b/mm/compaction.c
@@@ -1028,7 -1028,7 +1028,7 @@@ isolate_migratepages_block(struct compa
                 if (!TestClearPageLRU(page))
                         goto isolate_fail_put;
   
-               lruvec = mem_cgroup_page_lruvec(page, pgdat);
+               lruvec = mem_cgroup_page_lruvec(page);
   
                 /* If we already hold the lock, we can skip some rechecking */
                 if (lruvec != locked) {
@@@ -1955,7 -1955,7 +1955,7 @@@ static inline bool is_via_compact_memor
   
   static bool kswapd_is_running(pg_data_t *pgdat)
   {
- -      return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+ +      return pgdat->kswapd && task_is_running(pgdat->kswapd);
   }
   
   /*
diff --combined mm/kasan/common.c

index 0ecd293,2675008..2baf121
--- 1/mm/kasan/common.c
--- 2/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@@ -51,11 -51,14 +51,14 @@@ void kasan_enable_current(void
   {
         current->kasan_depth++;
   }
+ EXPORT_SYMBOL(kasan_enable_current);
   
   void kasan_disable_current(void)
   {
         current->kasan_depth--;
   }
+ EXPORT_SYMBOL(kasan_disable_current);
+ 
   #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
   
   void __kasan_unpoison_range(const void *address, size_t size)
@@@ -97,7 -100,7 +100,7 @@@ slab_flags_t __kasan_never_merge(void
         return 0;
   }
   
- -void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
+ +void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
   {
         u8 tag;
         unsigned long i;
@@@ -111,7 -114,7 +114,7 @@@
         kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
   }
   
- -void __kasan_free_pages(struct page *page, unsigned int order, bool init)
+ +void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
   {
         if (likely(!PageHighMem(page)))
                 kasan_poison(page_address(page), PAGE_SIZE << order,
@@@ -328,6 -331,9 +331,9 @@@ static inline bool ____kasan_slab_free(
         u8 tag;
         void *tagged_object;
   
+       if (!kasan_arch_is_ready())
+               return false;
+ 
         tag = get_tag(object);
         tagged_object = object;
         object = kasan_reset_tag(object);
diff --combined mm/kasan/hw_tags.c

index ed5e5b8,d867b22..4ea8c36
--- 1/mm/kasan/hw_tags.c
--- 2/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@@ -216,60 -216,6 +216,38 @@@ void __init kasan_init_hw_tags(void
         pr_info("KernelAddressSanitizer initialized\n");
   }
   
- void kasan_set_free_info(struct kmem_cache *cache,
-                               void *object, u8 tag)
- {
-       struct kasan_alloc_meta *alloc_meta;
- 
-       alloc_meta = kasan_get_alloc_meta(cache, object);
-       if (alloc_meta)
-               kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT);
- }
- 
- struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
-                               void *object, u8 tag)
- {
-       struct kasan_alloc_meta *alloc_meta;
- 
-       alloc_meta = kasan_get_alloc_meta(cache, object);
-       if (!alloc_meta)
-               return NULL;
- 
-       return &alloc_meta->free_track[0];
- }
- 
+ +void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+ +{
+ +      /*
+ +       * This condition should match the one in post_alloc_hook() in
+ +       * page_alloc.c.
+ +       */
+ +      bool init = !want_init_on_free() && want_init_on_alloc(flags);
+ +
+ +      if (flags & __GFP_SKIP_KASAN_POISON)
+ +              SetPageSkipKASanPoison(page);
+ +
+ +      if (flags & __GFP_ZEROTAGS) {
+ +              int i;
+ +
+ +              for (i = 0; i != 1 << order; ++i)
+ +                      tag_clear_highpage(page + i);
+ +      } else {
+ +              kasan_unpoison_pages(page, order, init);
+ +      }
+ +}
+ +
+ +void kasan_free_pages(struct page *page, unsigned int order)
+ +{
+ +      /*
+ +       * This condition should match the one in free_pages_prepare() in
+ +       * page_alloc.c.
+ +       */
+ +      bool init = want_init_on_free();
+ +
+ +      kasan_poison_pages(page, order, init);
+ +}
+ +
   #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
   
   void kasan_set_tagging_report_once(bool state)
diff --combined mm/kasan/sw_tags.c

index 9362938,675e673..bd3f540
--- 1/mm/kasan/sw_tags.c
--- 2/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@@ -166,51 -166,3 +166,10 @@@ void __hwasan_tag_memory(unsigned long 
         kasan_poison((void *)addr, size, tag, false);
   }
   EXPORT_SYMBOL(__hwasan_tag_memory);
- void kasan_set_free_info(struct kmem_cache *cache,
-                               void *object, u8 tag)
- {
-       struct kasan_alloc_meta *alloc_meta;
-       u8 idx = 0;
- 
-       alloc_meta = kasan_get_alloc_meta(cache, object);
-       if (!alloc_meta)
-               return;
- 
- #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
-       idx = alloc_meta->free_track_idx;
-       alloc_meta->free_pointer_tag[idx] = tag;
-       alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
- #endif
- 
-       kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
- }
- 
- struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
-                               void *object, u8 tag)
- {
-       struct kasan_alloc_meta *alloc_meta;
-       int i = 0;
- 
-       alloc_meta = kasan_get_alloc_meta(cache, object);
-       if (!alloc_meta)
-               return NULL;
- 
- #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
-       for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
-               if (alloc_meta->free_pointer_tag[i] == tag)
-                       break;
-       }
-       if (i == KASAN_NR_FREE_STACKS)
-               i = alloc_meta->free_track_idx;
- #endif
- 
-       return &alloc_meta->free_track[i];
- }
- 
+ +
+ +void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+ +                      unsigned long ret_ip)
+ +{
+ +      kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
+ +                   ret_ip);
+ +}
diff --combined mm/mmap.c

index bc88d16,d8c92ae..aa9de98
--- 1/mm/mmap.c
--- 2/mm/mmap.c
+++ b/mm/mmap.c
@@@ -1457,9 -1457,7 +1457,7 @@@ unsigned long do_mmap(struct file *file
                 return addr;
   
         if (flags & MAP_FIXED_NOREPLACE) {
-               struct vm_area_struct *vma = find_vma(mm, addr);
- 
-               if (vma && vma->vm_start < addr + len)
+               if (find_vma_intersection(mm, addr, addr + len))
                         return -EEXIST;
         }
   
@@@ -1611,7 -1609,7 +1609,7 @@@ unsigned long ksys_mmap_pgoff(unsigned 
                         goto out_fput;
                 }
         } else if (flags & MAP_HUGETLB) {
- -              struct user_struct *user = NULL;
+ +              struct ucounts *ucounts = NULL;
                 struct hstate *hs;
   
                 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@@ -1627,13 -1625,13 +1625,13 @@@
                  */
                 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                 VM_NORESERVE,
- -                              &user, HUGETLB_ANONHUGE_INODE,
+ +                              &ucounts, HUGETLB_ANONHUGE_INODE,
                                 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                 if (IS_ERR(file))
                         return PTR_ERR(file);
         }
   
-       flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+       flags &= ~MAP_DENYWRITE;
   
         retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
   out_fput:
@@@ -2802,6 -2800,22 +2800,22 @@@ int split_vma(struct mm_struct *mm, str
         return __split_vma(mm, vma, addr, new_below);
   }
   
+ static inline void
+ unlock_range(struct vm_area_struct *start, unsigned long limit)
+ {
+       struct mm_struct *mm = start->vm_mm;
+       struct vm_area_struct *tmp = start;
+ 
+       while (tmp && tmp->vm_start < limit) {
+               if (tmp->vm_flags & VM_LOCKED) {
+                       mm->locked_vm -= vma_pages(tmp);
+                       munlock_vma_pages_all(tmp);
+               }
+ 
+               tmp = tmp->vm_next;
+       }
+ }
+ 
   /* Munmap is split into 2 main parts -- this part which finds
    * what needs doing, and the areas themselves, which do the
    * work.  This now handles partial unmappings.
@@@ -2828,16 -2842,11 +2842,11 @@@ int __do_munmap(struct mm_struct *mm, u
          */
         arch_unmap(mm, start, end);
   
-       /* Find the first overlapping VMA */
-       vma = find_vma(mm, start);
+       /* Find the first overlapping VMA where start < vma->vm_end */
+       vma = find_vma_intersection(mm, start, end);
         if (!vma)
                 return 0;
         prev = vma->vm_prev;
-       /* we have  start < vma->vm_end  */
- 
-       /* if it doesn't overlap, we have nothing.. */
-       if (vma->vm_start >= end)
-               return 0;
   
         /*
          * If we need to split any vma, do it now to save pain later.
@@@ -2890,17 -2899,8 +2899,8 @@@
         /*
          * unlock any mlock()ed ranges before detaching vmas
          */
-       if (mm->locked_vm) {
-               struct vm_area_struct *tmp = vma;
-               while (tmp && tmp->vm_start < end) {
-                       if (tmp->vm_flags & VM_LOCKED) {
-                               mm->locked_vm -= vma_pages(tmp);
-                               munlock_vma_pages_all(tmp);
-                       }
- 
-                       tmp = tmp->vm_next;
-               }
-       }
+       if (mm->locked_vm)
+               unlock_range(vma, end);
   
         /* Detach vmas from rbtree */
         if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
@@@ -3185,14 -3185,8 +3185,8 @@@ void exit_mmap(struct mm_struct *mm
                 mmap_write_unlock(mm);
         }
   
-       if (mm->locked_vm) {
-               vma = mm->mmap;
-               while (vma) {
-                       if (vma->vm_flags & VM_LOCKED)
-                               munlock_vma_pages_all(vma);
-                       vma = vma->vm_next;
-               }
-       }
+       if (mm->locked_vm)
+               unlock_range(mm->mmap, ULONG_MAX);
   
         arch_exit_mmap(mm);
   
diff --combined mm/page_alloc.c

index e7af86e,db00ee8..0817d88
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -120,7 -120,25 +120,25 @@@ typedef int __bitwise fpi_t
   
   /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
   static DEFINE_MUTEX(pcp_batch_high_lock);
- #define MIN_PERCPU_PAGELIST_FRACTION  (8)
+ #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
+ 
+ struct pagesets {
+       local_lock_t lock;
+ #if defined(CONFIG_DEBUG_INFO_BTF) &&                         \
+       !defined(CONFIG_DEBUG_LOCK_ALLOC) &&                    \
+       !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT)
+       /*
+        * pahole 1.21 and earlier gets confused by zero-sized per-CPU
+        * variables and produces invalid BTF. Ensure that
+        * sizeof(struct pagesets) != 0 for older versions of pahole.
+        */
+       char __pahole_hack;
+       #warning "pahole too old to support zero-sized struct pagesets"
+ #endif
+ };
+ static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+       .lock = INIT_LOCAL_LOCK(lock),
+ };
   
   #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
   DEFINE_PER_CPU(int, numa_node);
@@@ -175,7 -193,7 +193,7 @@@ EXPORT_SYMBOL(_totalram_pages)
   unsigned long totalreserve_pages __read_mostly;
   unsigned long totalcma_pages __read_mostly;
   
- int percpu_pagelist_fraction;
+ int percpu_pagelist_high_fraction;
   gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
   DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
   EXPORT_SYMBOL(init_on_alloc);
@@@ -331,20 -349,7 +349,7 @@@ compound_page_dtor * const compound_pag
   
   int min_free_kbytes = 1024;
   int user_min_free_kbytes = -1;
- #ifdef CONFIG_DISCONTIGMEM
- /*
-  * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
-  * are not on separate NUMA nodes. Functionally this works but with
-  * watermark_boost_factor, it can reclaim prematurely as the ranges can be
-  * quite small. By default, do not boost watermarks on discontigmem as in
-  * many cases very high-order allocations like THP are likely to be
-  * unsupported and the premature reclaim offsets the advantage of long-term
-  * fragmentation avoidance.
-  */
- int watermark_boost_factor __read_mostly;
- #else
   int watermark_boost_factor __read_mostly = 15000;
- #endif
   int watermark_scale_factor = 10;
   
   static unsigned long nr_kernel_pages __initdata;
@@@ -382,7 -387,7 +387,7 @@@ int page_group_by_mobility_disabled __r
   static DEFINE_STATIC_KEY_TRUE(deferred_pages);
   
   /*
- - * Calling kasan_free_pages() only after deferred memory initialization
+ + * Calling kasan_poison_pages() only after deferred memory initialization
    * has completed. Poisoning pages during deferred memory init will greatly
    * lengthen the process and cause problem in large memory systems as the
    * deferred pages initialization is done with interrupt disabled.
@@@ -394,12 -399,15 +399,12 @@@
    * on-demand allocation and then freed again before the deferred pages
    * initialization is done, but this is not likely to happen.
    */
- -static inline void kasan_free_nondeferred_pages(struct page *page, int order,
- -                                              bool init, fpi_t fpi_flags)
+ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
   {
- -      if (static_branch_unlikely(&deferred_pages))
- -              return;
- -      if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- -                      (fpi_flags & FPI_SKIP_KASAN_POISON))
- -              return;
- -      kasan_free_pages(page, order, init);
+ +      return static_branch_unlikely(&deferred_pages) ||
+ +             (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ +              (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ +             PageSkipKASanPoison(page);
   }
   
   /* Returns true if the struct page for the pfn is uninitialised */
@@@ -450,11 -458,13 +455,11 @@@ defer_init(int nid, unsigned long pfn, 
         return false;
   }
   #else
- -static inline void kasan_free_nondeferred_pages(struct page *page, int order,
- -                                              bool init, fpi_t fpi_flags)
+ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
   {
- -      if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- -                      (fpi_flags & FPI_SKIP_KASAN_POISON))
- -              return;
- -      kasan_free_pages(page, order, init);
+ +      return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ +              (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ +             PageSkipKASanPoison(page);
   }
   
   static inline bool early_page_uninitialised(unsigned long pfn)
@@@ -469,7 -479,7 +474,7 @@@ static inline bool defer_init(int nid, 
   #endif
   
   /* Return a pointer to the bitmap storing bits affecting a block of pages */
- static inline unsigned long *get_pageblock_bitmap(struct page *page,
+ static inline unsigned long *get_pageblock_bitmap(const struct page *page,
                                                         unsigned long pfn)
   {
   #ifdef CONFIG_SPARSEMEM
@@@ -479,7 -489,7 +484,7 @@@
   #endif /* CONFIG_SPARSEMEM */
   }
   
- static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
   {
   #ifdef CONFIG_SPARSEMEM
         pfn &= (PAGES_PER_SECTION-1);
@@@ -490,7 -500,7 +495,7 @@@
   }
   
   static __always_inline
- unsigned long __get_pfnblock_flags_mask(struct page *page,
+ unsigned long __get_pfnblock_flags_mask(const struct page *page,
                                         unsigned long pfn,
                                         unsigned long mask)
   {
@@@ -515,13 -525,14 +520,14 @@@
    *
    * Return: pageblock_bits flags
    */
- unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-                                       unsigned long mask)
+ unsigned long get_pfnblock_flags_mask(const struct page *page,
+                                       unsigned long pfn, unsigned long mask)
   {
         return __get_pfnblock_flags_mask(page, pfn, mask);
   }
   
- static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+ static __always_inline int get_pfnblock_migratetype(const struct page *page,
+                                       unsigned long pfn)
   {
         return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
   }
@@@ -653,8 -664,7 +659,7 @@@ static void bad_page(struct page *page
   
         pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
                 current->comm, page_to_pfn(page));
-       __dump_page(page, reason);
-       dump_page_owner(page);
+       dump_page(page, reason);
   
         print_modules();
         dump_stack();
@@@ -664,6 -674,57 +669,57 @@@ out
         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
   }
   
+ static inline unsigned int order_to_pindex(int migratetype, int order)
+ {
+       int base = order;
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (order > PAGE_ALLOC_COSTLY_ORDER) {
+               VM_BUG_ON(order != pageblock_order);
+               base = PAGE_ALLOC_COSTLY_ORDER + 1;
+       }
+ #else
+       VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+ #endif
+ 
+       return (MIGRATE_PCPTYPES * base) + migratetype;
+ }
+ 
+ static inline int pindex_to_order(unsigned int pindex)
+ {
+       int order = pindex / MIGRATE_PCPTYPES;
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (order > PAGE_ALLOC_COSTLY_ORDER) {
+               order = pageblock_order;
+               VM_BUG_ON(order != pageblock_order);
+       }
+ #else
+       VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+ #endif
+ 
+       return order;
+ }
+ 
+ static inline bool pcp_allowed_order(unsigned int order)
+ {
+       if (order <= PAGE_ALLOC_COSTLY_ORDER)
+               return true;
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (order == pageblock_order)
+               return true;
+ #endif
+       return false;
+ }
+ 
+ static inline void free_the_page(struct page *page, unsigned int order)
+ {
+       if (pcp_allowed_order(order))           /* Via pcp? */
+               free_unref_page(page, order);
+       else
+               __free_pages_ok(page, order, FPI_NONE);
+ }
+ 
   /*
    * Higher-order pages are called "compound pages".  They are structured thusly:
    *
@@@ -682,7 -743,7 +738,7 @@@
   void free_compound_page(struct page *page)
   {
         mem_cgroup_uncharge(page);
-       __free_pages_ok(page, compound_order(page), FPI_NONE);
+       free_the_page(page, compound_order(page));
   }
   
   void prep_compound_page(struct page *page, unsigned int order)
@@@ -1221,16 -1282,10 +1277,16 @@@ out
         return ret;
   }
   
- -static void kernel_init_free_pages(struct page *page, int numpages)
+ +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
   {
         int i;
   
+ +      if (zero_tags) {
+ +              for (i = 0; i < numpages; i++)
+ +                      tag_clear_highpage(page + i);
+ +              return;
+ +      }
+ +
         /* s390's use of memset() could override KASAN redzones. */
         kasan_disable_current();
         for (i = 0; i < numpages; i++) {
@@@ -1246,7 -1301,7 +1302,7 @@@ static __always_inline bool free_pages_
                         unsigned int order, bool check_free, fpi_t fpi_flags)
   {
         int bad = 0;
- -      bool init;
+ +      bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
   
         VM_BUG_ON_PAGE(PageTail(page), page);
   
@@@ -1315,17 -1370,10 +1371,17 @@@
          * With hardware tag-based KASAN, memory tags must be set before the
          * page becomes unavailable via debug_pagealloc or arch_free_page.
          */
- -      init = want_init_on_free();
- -      if (init && !kasan_has_integrated_init())
- -              kernel_init_free_pages(page, 1 << order);
- -      kasan_free_nondeferred_pages(page, order, init, fpi_flags);
+ +      if (kasan_has_integrated_init()) {
+ +              if (!skip_kasan_poison)
+ +                      kasan_free_pages(page, order);
+ +      } else {
+ +              bool init = want_init_on_free();
+ +
+ +              if (init)
+ +                      kernel_init_free_pages(page, 1 << order, false);
+ +              if (!skip_kasan_poison)
+ +                      kasan_poison_pages(page, order, init);
+ +      }
   
         /*
          * arch_free_page() can make the page's contents inaccessible.  s390
@@@ -1345,9 -1393,9 +1401,9 @@@
    * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
    * moved from pcp lists to free lists.
    */
- static bool free_pcp_prepare(struct page *page)
+ static bool free_pcp_prepare(struct page *page, unsigned int order)
   {
-       return free_pages_prepare(page, 0, true, FPI_NONE);
+       return free_pages_prepare(page, order, true, FPI_NONE);
   }
   
   static bool bulkfree_pcp_prepare(struct page *page)
@@@ -1364,12 -1412,12 +1420,12 @@@
    * debug_pagealloc enabled, they are checked also immediately when being freed
    * to the pcp lists.
    */
- static bool free_pcp_prepare(struct page *page)
+ static bool free_pcp_prepare(struct page *page, unsigned int order)
   {
         if (debug_pagealloc_enabled_static())
-               return free_pages_prepare(page, 0, true, FPI_NONE);
+               return free_pages_prepare(page, order, true, FPI_NONE);
         else
-               return free_pages_prepare(page, 0, false, FPI_NONE);
+               return free_pages_prepare(page, order, false, FPI_NONE);
   }
   
   static bool bulkfree_pcp_prepare(struct page *page)
@@@ -1401,8 -1449,10 +1457,10 @@@ static inline void prefetch_buddy(struc
   static void free_pcppages_bulk(struct zone *zone, int count,
                                         struct per_cpu_pages *pcp)
   {
-       int migratetype = 0;
+       int pindex = 0;
         int batch_free = 0;
+       int nr_freed = 0;
+       unsigned int order;
         int prefetch_nr = READ_ONCE(pcp->batch);
         bool isolated_pageblocks;
         struct page *page, *tmp;
@@@ -1413,7 -1463,7 +1471,7 @@@
          * below while (list_empty(list)) loop.
          */
         count = min(pcp->count, count);
-       while (count) {
+       while (count > 0) {
                 struct list_head *list;
   
                 /*
@@@ -1425,24 -1475,31 +1483,31 @@@
                  */
                 do {
                         batch_free++;
-                       if (++migratetype == MIGRATE_PCPTYPES)
-                               migratetype = 0;
-                       list = &pcp->lists[migratetype];
+                       if (++pindex == NR_PCP_LISTS)
+                               pindex = 0;
+                       list = &pcp->lists[pindex];
                 } while (list_empty(list));
   
                 /* This is the only non-empty list. Free them all. */
-               if (batch_free == MIGRATE_PCPTYPES)
+               if (batch_free == NR_PCP_LISTS)
                         batch_free = count;
   
+               order = pindex_to_order(pindex);
+               BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
                 do {
                         page = list_last_entry(list, struct page, lru);
                         /* must delete to avoid corrupting pcp list */
                         list_del(&page->lru);
-                       pcp->count--;
+                       nr_freed += 1 << order;
+                       count -= 1 << order;
   
                         if (bulkfree_pcp_prepare(page))
                                 continue;
   
+                       /* Encode order with the migratetype */
+                       page->index <<= NR_PCP_ORDER_WIDTH;
+                       page->index |= order;
+ 
                         list_add_tail(&page->lru, &head);
   
                         /*
@@@ -1458,9 -1515,14 +1523,14 @@@
                                 prefetch_buddy(page);
                                 prefetch_nr--;
                         }
-               } while (--count && --batch_free && !list_empty(list));
+               } while (count > 0 && --batch_free && !list_empty(list));
         }
+       pcp->count -= nr_freed;
   
+       /*
+        * local_lock_irq held so equivalent to spin_lock_irqsave for
+        * both PREEMPT_RT and non-PREEMPT_RT configurations.
+        */
         spin_lock(&zone->lock);
         isolated_pageblocks = has_isolate_pageblock(zone);
   
@@@ -1470,14 -1532,19 +1540,19 @@@
          */
         list_for_each_entry_safe(page, tmp, &head, lru) {
                 int mt = get_pcppage_migratetype(page);
+ 
+               /* mt has been encoded with the order (see above) */
+               order = mt & NR_PCP_ORDER_MASK;
+               mt >>= NR_PCP_ORDER_WIDTH;
+ 
                 /* MIGRATE_ISOLATE page should not go to pcplists */
                 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
                 /* Pageblock could have been isolated meanwhile */
                 if (unlikely(isolated_pageblocks))
                         mt = get_pageblock_migratetype(page);
   
-               __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
-               trace_mm_page_pcpu_drain(page, 0, mt);
+               __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+               trace_mm_page_pcpu_drain(page, order, mt);
         }
         spin_unlock(&zone->lock);
   }
@@@ -1487,13 -1554,15 +1562,15 @@@ static void free_one_page(struct zone *
                                 unsigned int order,
                                 int migratetype, fpi_t fpi_flags)
   {
-       spin_lock(&zone->lock);
+       unsigned long flags;
+ 
+       spin_lock_irqsave(&zone->lock, flags);
         if (unlikely(has_isolate_pageblock(zone) ||
                 is_migrate_isolate(migratetype))) {
                 migratetype = get_pfnblock_migratetype(page, pfn);
         }
         __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
-       spin_unlock(&zone->lock);
+       spin_unlock_irqrestore(&zone->lock, flags);
   }
   
   static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@@ -1576,16 -1645,22 +1653,22 @@@ static void __free_pages_ok(struct pag
         unsigned long flags;
         int migratetype;
         unsigned long pfn = page_to_pfn(page);
+       struct zone *zone = page_zone(page);
   
         if (!free_pages_prepare(page, order, true, fpi_flags))
                 return;
   
         migratetype = get_pfnblock_migratetype(page, pfn);
-       local_irq_save(flags);
+ 
+       spin_lock_irqsave(&zone->lock, flags);
+       if (unlikely(has_isolate_pageblock(zone) ||
+               is_migrate_isolate(migratetype))) {
+               migratetype = get_pfnblock_migratetype(page, pfn);
+       }
+       __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+       spin_unlock_irqrestore(&zone->lock, flags);
+ 
         __count_vm_events(PGFREE, 1 << order);
-       free_one_page(page_zone(page), page, pfn, order, migratetype,
-                     fpi_flags);
-       local_irq_restore(flags);
   }
   
   void __free_pages_core(struct page *page, unsigned int order)
@@@ -1617,7 -1692,7 +1700,7 @@@
         __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
   }
   
- #ifdef CONFIG_NEED_MULTIPLE_NODES
+ #ifdef CONFIG_NUMA
   
   /*
    * During memory init memblocks map pfns to nids. The search is expensive and
@@@ -1667,7 -1742,7 +1750,7 @@@ int __meminit early_pfn_to_nid(unsigne
   
         return nid;
   }
- #endif /* CONFIG_NEED_MULTIPLE_NODES */
+ #endif /* CONFIG_NUMA */
   
   void __init memblock_free_pages(struct page *page, unsigned long pfn,
                                                         unsigned int order)
@@@ -2154,14 -2229,6 +2237,6 @@@ void __init page_alloc_init_late(void
         /* Block until all are initialised */
         wait_for_completion(&pgdat_init_all_done_comp);
   
-       /*
-        * The number of managed pages has changed due to the initialisation
-        * so the pcpu batch and high limits needs to be updated or the limits
-        * will be artificially small.
-        */
-       for_each_populated_zone(zone)
-               zone_pcp_update(zone);
- 
         /*
          * We initialized the rest of the deferred pages.  Permanently disable
          * on-demand struct page initialization.
@@@ -2332,6 -2399,8 +2407,6 @@@ static bool check_new_pages(struct pag
   inline void post_alloc_hook(struct page *page, unsigned int order,
                                 gfp_t gfp_flags)
   {
- -      bool init;
- -
         set_page_private(page, 0);
         set_page_refcounted(page);
   
@@@ -2350,16 -2419,10 +2425,16 @@@
          * kasan_alloc_pages and kernel_init_free_pages must be
          * kept together to avoid discrepancies in behavior.
          */
- -      init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
- -      kasan_alloc_pages(page, order, init);
- -      if (init && !kasan_has_integrated_init())
- -              kernel_init_free_pages(page, 1 << order);
+ +      if (kasan_has_integrated_init()) {
+ +              kasan_alloc_pages(page, order, gfp_flags);
+ +      } else {
+ +              bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+ +
+ +              kasan_unpoison_pages(page, order, init);
+ +              if (init)
+ +                      kernel_init_free_pages(page, 1 << order,
+ +                                             gfp_flags & __GFP_ZEROTAGS);
+ +      }
   
         set_page_owner(page, order, gfp_flags);
   }
@@@ -2967,6 -3030,10 +3042,10 @@@ static int rmqueue_bulk(struct zone *zo
   {
         int i, allocated = 0;
   
+       /*
+        * local_lock_irq held so equivalent to spin_lock_irqsave for
+        * both PREEMPT_RT and non-PREEMPT_RT configurations.
+        */
         spin_lock(&zone->lock);
         for (i = 0; i < count; ++i) {
                 struct page *page = __rmqueue(zone, order, migratetype,
@@@ -3019,12 -3086,12 +3098,12 @@@ void drain_zone_pages(struct zone *zone
         unsigned long flags;
         int to_drain, batch;
   
-       local_irq_save(flags);
+       local_lock_irqsave(&pagesets.lock, flags);
         batch = READ_ONCE(pcp->batch);
         to_drain = min(pcp->count, batch);
         if (to_drain > 0)
                 free_pcppages_bulk(zone, to_drain, pcp);
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&pagesets.lock, flags);
   }
   #endif
   
@@@ -3038,16 -3105,15 +3117,15 @@@
   static void drain_pages_zone(unsigned int cpu, struct zone *zone)
   {
         unsigned long flags;
-       struct per_cpu_pageset *pset;
         struct per_cpu_pages *pcp;
   
-       local_irq_save(flags);
-       pset = per_cpu_ptr(zone->pageset, cpu);
+       local_lock_irqsave(&pagesets.lock, flags);
   
-       pcp = &pset->pcp;
+       pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
         if (pcp->count)
                 free_pcppages_bulk(zone, pcp->count, pcp);
-       local_irq_restore(flags);
+ 
+       local_unlock_irqrestore(&pagesets.lock, flags);
   }
   
   /*
@@@ -3145,7 -3211,7 +3223,7 @@@ static void __drain_all_pages(struct zo
          * disables preemption as part of its processing
          */
         for_each_online_cpu(cpu) {
-               struct per_cpu_pageset *pcp;
+               struct per_cpu_pages *pcp;
                 struct zone *z;
                 bool has_pcps = false;
   
@@@ -3156,13 -3222,13 +3234,13 @@@
                          */
                         has_pcps = true;
                 } else if (zone) {
-                       pcp = per_cpu_ptr(zone->pageset, cpu);
-                       if (pcp->pcp.count)
+                       pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+                       if (pcp->count)
                                 has_pcps = true;
                 } else {
                         for_each_populated_zone(z) {
-                               pcp = per_cpu_ptr(z->pageset, cpu);
-                               if (pcp->pcp.count) {
+                               pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+                               if (pcp->count) {
                                         has_pcps = true;
                                         break;
                                 }
@@@ -3255,11 -3321,12 +3333,12 @@@ void mark_free_pages(struct zone *zone
   }
   #endif /* CONFIG_PM */
   
- static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+                                                       unsigned int order)
   {
         int migratetype;
   
-       if (!free_pcp_prepare(page))
+       if (!free_pcp_prepare(page, order))
                 return false;
   
         migratetype = get_pfnblock_migratetype(page, pfn);
@@@ -3267,52 -3334,99 +3346,99 @@@
         return true;
   }
   
- static void free_unref_page_commit(struct page *page, unsigned long pfn)
+ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
+ {
+       int min_nr_free, max_nr_free;
+ 
+       /* Check for PCP disabled or boot pageset */
+       if (unlikely(high < batch))
+               return 1;
+ 
+       /* Leave at least pcp->batch pages on the list */
+       min_nr_free = batch;
+       max_nr_free = high - batch;
+ 
+       /*
+        * Double the number of pages freed each time there is subsequent
+        * freeing of pages without any allocation.
+        */
+       batch <<= pcp->free_factor;
+       if (batch < max_nr_free)
+               pcp->free_factor++;
+       batch = clamp(batch, min_nr_free, max_nr_free);
+ 
+       return batch;
+ }
+ 
+ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+ {
+       int high = READ_ONCE(pcp->high);
+ 
+       if (unlikely(!high))
+               return 0;
+ 
+       if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+               return high;
+ 
+       /*
+        * If reclaim is active, limit the number of pages that can be
+        * stored on pcp lists
+        */
+       return min(READ_ONCE(pcp->batch) << 2, high);
+ }
+ 
+ static void free_unref_page_commit(struct page *page, unsigned long pfn,
+                                  int migratetype, unsigned int order)
   {
         struct zone *zone = page_zone(page);
         struct per_cpu_pages *pcp;
-       int migratetype;
+       int high;
+       int pindex;
   
-       migratetype = get_pcppage_migratetype(page);
         __count_vm_event(PGFREE);
+       pcp = this_cpu_ptr(zone->per_cpu_pageset);
+       pindex = order_to_pindex(migratetype, order);
+       list_add(&page->lru, &pcp->lists[pindex]);
+       pcp->count += 1 << order;
+       high = nr_pcp_high(pcp, zone);
+       if (pcp->count >= high) {
+               int batch = READ_ONCE(pcp->batch);
+ 
+               free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+       }
+ }
+ 
+ /*
+  * Free a pcp page
+  */
+ void free_unref_page(struct page *page, unsigned int order)
+ {
+       unsigned long flags;
+       unsigned long pfn = page_to_pfn(page);
+       int migratetype;
+ 
+       if (!free_unref_page_prepare(page, pfn, order))
+               return;
   
         /*
          * We only track unmovable, reclaimable and movable on pcp lists.
-        * Free ISOLATE pages back to the allocator because they are being
+        * Place ISOLATE pages on the isolated list because they are being
          * offlined but treat HIGHATOMIC as movable pages so we can get those
          * areas back if necessary. Otherwise, we may have to free
          * excessively into the page allocator
          */
-       if (migratetype >= MIGRATE_PCPTYPES) {
+       migratetype = get_pcppage_migratetype(page);
+       if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
                 if (unlikely(is_migrate_isolate(migratetype))) {
-                       free_one_page(zone, page, pfn, 0, migratetype,
-                                     FPI_NONE);
+                       free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
                         return;
                 }
                 migratetype = MIGRATE_MOVABLE;
         }
   
-       pcp = &this_cpu_ptr(zone->pageset)->pcp;
-       list_add(&page->lru, &pcp->lists[migratetype]);
-       pcp->count++;
-       if (pcp->count >= READ_ONCE(pcp->high))
-               free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
- }
- 
- /*
-  * Free a 0-order page
-  */
- void free_unref_page(struct page *page)
- {
-       unsigned long flags;
-       unsigned long pfn = page_to_pfn(page);
- 
-       if (!free_unref_page_prepare(page, pfn))
-               return;
- 
-       local_irq_save(flags);
-       free_unref_page_commit(page, pfn);
-       local_irq_restore(flags);
+       local_lock_irqsave(&pagesets.lock, flags);
+       free_unref_page_commit(page, pfn, migratetype, order);
+       local_unlock_irqrestore(&pagesets.lock, flags);
   }
   
   /*
@@@ -3323,34 -3437,56 +3449,56 @@@ void free_unref_page_list(struct list_h
         struct page *page, *next;
         unsigned long flags, pfn;
         int batch_count = 0;
+       int migratetype;
   
         /* Prepare pages for freeing */
         list_for_each_entry_safe(page, next, list, lru) {
                 pfn = page_to_pfn(page);
-               if (!free_unref_page_prepare(page, pfn))
+               if (!free_unref_page_prepare(page, pfn, 0))
                         list_del(&page->lru);
+ 
+               /*
+                * Free isolated pages directly to the allocator, see
+                * comment in free_unref_page.
+                */
+               migratetype = get_pcppage_migratetype(page);
+               if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+                       if (unlikely(is_migrate_isolate(migratetype))) {
+                               list_del(&page->lru);
+                               free_one_page(page_zone(page), page, pfn, 0,
+                                                       migratetype, FPI_NONE);
+                               continue;
+                       }
+ 
+                       /*
+                        * Non-isolated types over MIGRATE_PCPTYPES get added
+                        * to the MIGRATE_MOVABLE pcp list.
+                        */
+                       set_pcppage_migratetype(page, MIGRATE_MOVABLE);
+               }
+ 
                 set_page_private(page, pfn);
         }
   
-       local_irq_save(flags);
+       local_lock_irqsave(&pagesets.lock, flags);
         list_for_each_entry_safe(page, next, list, lru) {
-               unsigned long pfn = page_private(page);
- 
+               pfn = page_private(page);
                 set_page_private(page, 0);
+               migratetype = get_pcppage_migratetype(page);
                 trace_mm_page_free_batched(page);
-               free_unref_page_commit(page, pfn);
+               free_unref_page_commit(page, pfn, migratetype, 0);
   
                 /*
                  * Guard against excessive IRQ disabled times when we get
                  * a large list of pages to free.
                  */
                 if (++batch_count == SWAP_CLUSTER_MAX) {
-                       local_irq_restore(flags);
+                       local_unlock_irqrestore(&pagesets.lock, flags);
                         batch_count = 0;
-                       local_irq_save(flags);
+                       local_lock_irqsave(&pagesets.lock, flags);
                 }
         }
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&pagesets.lock, flags);
   }
   
   /*
@@@ -3449,7 -3585,8 +3597,8 @@@ void __putback_isolated_page(struct pag
    *
    * Must be called with interrupts disabled.
    */
- static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+                                  long nr_account)
   {
   #ifdef CONFIG_NUMA
         enum numa_stat_item local_stat = NUMA_LOCAL;
@@@ -3462,18 -3599,19 +3611,19 @@@
                 local_stat = NUMA_OTHER;
   
         if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-               __inc_numa_state(z, NUMA_HIT);
+               __count_numa_events(z, NUMA_HIT, nr_account);
         else {
-               __inc_numa_state(z, NUMA_MISS);
-               __inc_numa_state(preferred_zone, NUMA_FOREIGN);
+               __count_numa_events(z, NUMA_MISS, nr_account);
+               __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
         }
-       __inc_numa_state(z, local_stat);
+       __count_numa_events(z, local_stat, nr_account);
   #endif
   }
   
   /* Remove page from the per-cpu list, caller must protect the list */
   static inline
- struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+                       int migratetype,
                         unsigned int alloc_flags,
                         struct per_cpu_pages *pcp,
                         struct list_head *list)
@@@ -3482,16 -3620,30 +3632,30 @@@
   
         do {
                 if (list_empty(list)) {
-                       pcp->count += rmqueue_bulk(zone, 0,
-                                       READ_ONCE(pcp->batch), list,
+                       int batch = READ_ONCE(pcp->batch);
+                       int alloced;
+ 
+                       /*
+                        * Scale batch relative to order if batch implies
+                        * free pages can be stored on the PCP. Batch can
+                        * be 1 for small zones or for boot pagesets which
+                        * should never store free pages as the pages may
+                        * belong to arbitrary zones.
+                        */
+                       if (batch > 1)
+                               batch = max(batch >> order, 2);
+                       alloced = rmqueue_bulk(zone, order,
+                                       batch, list,
                                         migratetype, alloc_flags);
+ 
+                       pcp->count += alloced << order;
                         if (unlikely(list_empty(list)))
                                 return NULL;
                 }
   
                 page = list_first_entry(list, struct page, lru);
                 list_del(&page->lru);
-               pcp->count--;
+               pcp->count -= 1 << order;
         } while (check_new_pcp(page));
   
         return page;
@@@ -3499,23 -3651,31 +3663,31 @@@
   
   /* Lock and remove page from the per-cpu list */
   static struct page *rmqueue_pcplist(struct zone *preferred_zone,
-                       struct zone *zone, gfp_t gfp_flags,
-                       int migratetype, unsigned int alloc_flags)
+                       struct zone *zone, unsigned int order,
+                       gfp_t gfp_flags, int migratetype,
+                       unsigned int alloc_flags)
   {
         struct per_cpu_pages *pcp;
         struct list_head *list;
         struct page *page;
         unsigned long flags;
   
-       local_irq_save(flags);
-       pcp = &this_cpu_ptr(zone->pageset)->pcp;
-       list = &pcp->lists[migratetype];
-       page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+       local_lock_irqsave(&pagesets.lock, flags);
+ 
+       /*
+        * On allocation, reduce the number of pages that are batch freed.
+        * See nr_pcp_free() where free_factor is increased for subsequent
+        * frees.
+        */
+       pcp = this_cpu_ptr(zone->per_cpu_pageset);
+       pcp->free_factor >>= 1;
+       list = &pcp->lists[order_to_pindex(migratetype, order)];
+       page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+       local_unlock_irqrestore(&pagesets.lock, flags);
         if (page) {
                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
-               zone_statistics(preferred_zone, zone);
+               zone_statistics(preferred_zone, zone, 1);
         }
-       local_irq_restore(flags);
         return page;
   }
   
@@@ -3531,15 -3691,15 +3703,15 @@@ struct page *rmqueue(struct zone *prefe
         unsigned long flags;
         struct page *page;
   
-       if (likely(order == 0)) {
+       if (likely(pcp_allowed_order(order))) {
                 /*
                  * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
                  * we need to skip it when CMA area isn't allowed.
                  */
                 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
                                 migratetype != MIGRATE_MOVABLE) {
-                       page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
-                                       migratetype, alloc_flags);
+                       page = rmqueue_pcplist(preferred_zone, zone, order,
+                                       gfp_flags, migratetype, alloc_flags);
                         goto out;
                 }
         }
@@@ -3567,15 -3727,15 +3739,15 @@@
                 if (!page)
                         page = __rmqueue(zone, order, migratetype, alloc_flags);
         } while (page && check_new_pages(page, order));
-       spin_unlock(&zone->lock);
         if (!page)
                 goto failed;
+ 
         __mod_zone_freepage_state(zone, -(1 << order),
                                   get_pcppage_migratetype(page));
+       spin_unlock_irqrestore(&zone->lock, flags);
   
         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-       zone_statistics(preferred_zone, zone);
-       local_irq_restore(flags);
+       zone_statistics(preferred_zone, zone, 1);
   
   out:
         /* Separate test+clear to avoid unnecessary atomics */
@@@ -3588,7 -3748,7 +3760,7 @@@
         return page;
   
   failed:
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&zone->lock, flags);
         return NULL;
   }
   
@@@ -4264,6 -4424,9 +4436,9 @@@ should_compact_retry(struct alloc_conte
         if (!order)
                 return false;
   
+       if (fatal_signal_pending(current))
+               return false;
+ 
         if (compaction_made_progress(compact_result))
                 (*compaction_retries)++;
   
@@@ -5056,7 -5219,7 +5231,7 @@@ unsigned long __alloc_pages_bulk(gfp_t 
         struct alloc_context ac;
         gfp_t alloc_gfp;
         unsigned int alloc_flags = ALLOC_WMARK_LOW;
-       int nr_populated = 0;
+       int nr_populated = 0, nr_account = 0;
   
         if (unlikely(nr_pages <= 0))
                 return 0;
@@@ -5113,9 -5276,9 +5288,9 @@@
                 goto failed;
   
         /* Attempt the batch allocation */
-       local_irq_save(flags);
-       pcp = &this_cpu_ptr(zone->pageset)->pcp;
-       pcp_list = &pcp->lists[ac.migratetype];
+       local_lock_irqsave(&pagesets.lock, flags);
+       pcp = this_cpu_ptr(zone->per_cpu_pageset);
+       pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
   
         while (nr_populated < nr_pages) {
   
@@@ -5125,7 -5288,7 +5300,7 @@@
                         continue;
                 }
   
-               page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+               page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
                                                                 pcp, pcp_list);
                 if (unlikely(!page)) {
                         /* Try and get at least one page */
@@@ -5133,15 -5296,7 +5308,7 @@@
                                 goto failed_irq;
                         break;
                 }
- 
-               /*
-                * Ideally this would be batched but the best way to do
-                * that cheaply is to first convert zone_statistics to
-                * be inaccurate per-cpu counter like vm_events to avoid
-                * a RMW cycle then do the accounting with IRQs enabled.
-                */
-               __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
-               zone_statistics(ac.preferred_zoneref->zone, zone);
+               nr_account++;
   
                 prep_new_page(page, 0, gfp, 0);
                 if (page_list)
@@@ -5151,12 -5306,15 +5318,15 @@@
                 nr_populated++;
         }
   
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&pagesets.lock, flags);
+ 
+       __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+       zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
   
         return nr_populated;
   
   failed_irq:
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&pagesets.lock, flags);
   
   failed:
         page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@@ -5263,14 -5421,6 +5433,6 @@@ unsigned long get_zeroed_page(gfp_t gfp
   }
   EXPORT_SYMBOL(get_zeroed_page);
   
- static inline void free_the_page(struct page *page, unsigned int order)
- {
-       if (order == 0)         /* Via pcp? */
-               free_unref_page(page);
-       else
-               __free_pages_ok(page, order, FPI_NONE);
- }
- 
   /**
    * __free_pages - Free pages allocated with alloc_pages().
    * @page: The page pointer returned from alloc_pages().
@@@ -5729,7 -5879,7 +5891,7 @@@ void show_free_areas(unsigned int filte
                         continue;
   
                 for_each_online_cpu(cpu)
-                       free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+                       free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
         }
   
         printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
@@@ -5821,7 -5971,7 +5983,7 @@@
   
                 free_pcp = 0;
                 for_each_online_cpu(cpu)
-                       free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+                       free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
   
                 show_node(zone);
                 printk(KERN_CONT
@@@ -5862,7 -6012,7 +6024,7 @@@
                         K(zone_page_state(zone, NR_MLOCK)),
                         K(zone_page_state(zone, NR_BOUNCE)),
                         K(free_pcp),
-                       K(this_cpu_read(zone->pageset->pcp.count)),
+                       K(this_cpu_read(zone->per_cpu_pageset->count)),
                         K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
                 printk("lowmem_reserve[]:");
                 for (i = 0; i < MAX_NR_ZONES; i++)
@@@ -6189,11 -6339,12 +6351,12 @@@ static void build_zonelists(pg_data_t *
    * not check if the processor is online before following the pageset pointer.
    * Other parts of the kernel may not check if the zone is available.
    */
- static void pageset_init(struct per_cpu_pageset *p);
+ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
   /* These effectively disable the pcplists in the boot pageset completely */
   #define BOOT_PAGESET_HIGH     0
   #define BOOT_PAGESET_BATCH    1
- static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+ static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
+ static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
   static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
   
   static void __build_all_zonelists(void *data)
@@@ -6260,7 -6411,7 +6423,7 @@@ build_all_zonelists_init(void
          * (a chicken-egg dilemma).
          */
         for_each_possible_cpu(cpu)
-               pageset_init(&per_cpu(boot_pageset, cpu));
+               per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
   
         mminit_verify_zonelist();
         cpuset_init_current_mems_allowed();
@@@ -6412,7 -6563,7 +6575,7 @@@ void __ref memmap_init_zone_device(stru
                 return;
   
         /*
-        * The call to memmap_init_zone should have already taken care
+        * The call to memmap_init should have already taken care
          * of the pages reserved for the memmap, so we can just jump to
          * the end of that region and start processing the device pages.
          */
@@@ -6473,11 -6624,11 +6636,11 @@@ static void __meminit zone_init_free_li
         }
   }
   
- #if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+ #if !defined(CONFIG_FLATMEM)
   /*
    * Only struct pages that correspond to ranges defined by memblock.memory
    * are zeroed and initialized by going through __init_single_page() during
-  * memmap_init_zone().
+  * memmap_init_zone_range().
    *
    * But, there could be struct pages that correspond to holes in
    * memblock.memory. This can happen because of the following reasons:
@@@ -6496,9 -6647,9 +6659,9 @@@
    *   zone/node above the hole except for the trailing pages in the last
    *   section that will be appended to the zone/node below.
    */
- static u64 __meminit init_unavailable_range(unsigned long spfn,
-                                           unsigned long epfn,
-                                           int zone, int node)
+ static void __init init_unavailable_range(unsigned long spfn,
+                                         unsigned long epfn,
+                                         int zone, int node)
   {
         unsigned long pfn;
         u64 pgcnt = 0;
@@@ -6514,56 -6665,77 +6677,77 @@@
                 pgcnt++;
         }
   
-       return pgcnt;
+       if (pgcnt)
+               pr_info("On node %d, zone %s: %llu pages in unavailable ranges\n",
+                       node, zone_names[zone], pgcnt);
   }
   #else
- static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
-                                        int zone, int node)
+ static inline void init_unavailable_range(unsigned long spfn,
+                                         unsigned long epfn,
+                                         int zone, int node)
   {
-       return 0;
   }
   #endif
   
- void __meminit __weak memmap_init_zone(struct zone *zone)
+ static void __init memmap_init_zone_range(struct zone *zone,
+                                         unsigned long start_pfn,
+                                         unsigned long end_pfn,
+                                         unsigned long *hole_pfn)
   {
         unsigned long zone_start_pfn = zone->zone_start_pfn;
         unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-       int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
-       static unsigned long hole_pfn;
+       int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+ 
+       start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+       end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+ 
+       if (start_pfn >= end_pfn)
+               return;
+ 
+       memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
+                         zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+ 
+       if (*hole_pfn < start_pfn)
+               init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+ 
+       *hole_pfn = end_pfn;
+ }
+ 
+ static void __init memmap_init(void)
+ {
         unsigned long start_pfn, end_pfn;
-       u64 pgcnt = 0;
+       unsigned long hole_pfn = 0;
+       int i, j, zone_id = 0, nid;
   
-       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
-               start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
-               end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+               struct pglist_data *node = NODE_DATA(nid);
   
-               if (end_pfn > start_pfn)
-                       memmap_init_range(end_pfn - start_pfn, nid,
-                                       zone_id, start_pfn, zone_end_pfn,
-                                       MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+               for (j = 0; j < MAX_NR_ZONES; j++) {
+                       struct zone *zone = node->node_zones + j;
   
-               if (hole_pfn < start_pfn)
-                       pgcnt += init_unavailable_range(hole_pfn, start_pfn,
-                                                       zone_id, nid);
-               hole_pfn = end_pfn;
+                       if (!populated_zone(zone))
+                               continue;
+ 
+                       memmap_init_zone_range(zone, start_pfn, end_pfn,
+                                              &hole_pfn);
+                       zone_id = j;
+               }
         }
   
   #ifdef CONFIG_SPARSEMEM
         /*
-        * Initialize the hole in the range [zone_end_pfn, section_end].
-        * If zone boundary falls in the middle of a section, this hole
-        * will be re-initialized during the call to this function for the
-        * higher zone.
+        * Initialize the memory map for the hole in the range [memory_end,
+        * section_end].
+        * Append the pages in this hole to the highest zone in the last
+        * node.
+        * The call to init_unavailable_range() is outside the ifdef to
+        * silence the compiler warning about zone_id set but not used;
+        * for FLATMEM it is a nop anyway
          */
-       end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
+       end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
         if (hole_pfn < end_pfn)
-               pgcnt += init_unavailable_range(hole_pfn, end_pfn,
-                                               zone_id, nid);
   #endif
- 
-       if (pgcnt)
-               pr_info("  %s zone: %llu pages in unavailable ranges\n",
-                       zone->name, pgcnt);
+               init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
   }
   
   static int zone_batchsize(struct zone *zone)
@@@ -6572,13 -6744,12 +6756,12 @@@
         int batch;
   
         /*
-        * The per-cpu-pages pools are set to around 1000th of the
-        * size of the zone.
+        * The number of pages to batch allocate is either ~0.1%
+        * of the zone or 1MB, whichever is smaller. The batch
+        * size is striking a balance between allocation latency
+        * and zone lock contention.
          */
-       batch = zone_managed_pages(zone) / 1024;
-       /* But no more than a meg. */
-       if (batch * PAGE_SIZE > 1024 * 1024)
-               batch = (1024 * 1024) / PAGE_SIZE;
+       batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
         batch /= 4;             /* We effectively *= 4 below */
         if (batch < 1)
                 batch = 1;
@@@ -6615,6 -6786,54 +6798,54 @@@
   #endif
   }
   
+ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+ {
+ #ifdef CONFIG_MMU
+       int high;
+       int nr_split_cpus;
+       unsigned long total_pages;
+ 
+       if (!percpu_pagelist_high_fraction) {
+               /*
+                * By default, the high value of the pcp is based on the zone
+                * low watermark so that if they are full then background
+                * reclaim will not be started prematurely.
+                */
+               total_pages = low_wmark_pages(zone);
+       } else {
+               /*
+                * If percpu_pagelist_high_fraction is configured, the high
+                * value is based on a fraction of the managed pages in the
+                * zone.
+                */
+               total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+       }
+ 
+       /*
+        * Split the high value across all online CPUs local to the zone. Note
+        * that early in boot the CPUs may not be online yet and that during
+        * CPU hotplug the cpumask is not yet updated when a CPU is being
+        * onlined. For memory nodes that have no CPUs, split pcp->high across
+        * all online CPUs to mitigate the risk that reclaim is triggered
+        * prematurely due to pages stored on pcp lists.
+        */
+       nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
+       if (!nr_split_cpus)
+               nr_split_cpus = num_online_cpus();
+       high = total_pages / nr_split_cpus;
+ 
+       /*
+        * Ensure high is at least batch*4. The multiple is based on the
+        * historical relationship between high and batch.
+        */
+       high = max(high, batch << 2);
+ 
+       return high;
+ #else
+       return 0;
+ #endif
+ }
+ 
   /*
    * pcp->high and pcp->batch values are related and generally batch is lower
    * than high. They are also related to pcp->count such that count is lower
@@@ -6638,16 -6857,15 +6869,15 @@@ static void pageset_update(struct per_c
         WRITE_ONCE(pcp->high, high);
   }
   
- static void pageset_init(struct per_cpu_pageset *p)
+ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
   {
-       struct per_cpu_pages *pcp;
-       int migratetype;
+       int pindex;
   
-       memset(p, 0, sizeof(*p));
+       memset(pcp, 0, sizeof(*pcp));
+       memset(pzstats, 0, sizeof(*pzstats));
   
-       pcp = &p->pcp;
-       for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
-               INIT_LIST_HEAD(&pcp->lists[migratetype]);
+       for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+               INIT_LIST_HEAD(&pcp->lists[pindex]);
   
         /*
          * Set batch and high values safe for a boot pageset. A true percpu
@@@ -6657,38 -6875,31 +6887,31 @@@
          */
         pcp->high = BOOT_PAGESET_HIGH;
         pcp->batch = BOOT_PAGESET_BATCH;
+       pcp->free_factor = 0;
   }
   
   static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
                 unsigned long batch)
   {
-       struct per_cpu_pageset *p;
+       struct per_cpu_pages *pcp;
         int cpu;
   
         for_each_possible_cpu(cpu) {
-               p = per_cpu_ptr(zone->pageset, cpu);
-               pageset_update(&p->pcp, high, batch);
+               pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+               pageset_update(pcp, high, batch);
         }
   }
   
   /*
    * Calculate and set new high and batch values for all per-cpu pagesets of a
-  * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
+  * zone based on the zone's size.
    */
- static void zone_set_pageset_high_and_batch(struct zone *zone)
+ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
   {
-       unsigned long new_high, new_batch;
+       int new_high, new_batch;
   
-       if (percpu_pagelist_fraction) {
-               new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
-               new_batch = max(1UL, new_high / 4);
-               if ((new_high / 4) > (PAGE_SHIFT * 8))
-                       new_batch = PAGE_SHIFT * 8;
-       } else {
-               new_batch = zone_batchsize(zone);
-               new_high = 6 * new_batch;
-               new_batch = max(1UL, 1 * new_batch);
-       }
+       new_batch = max(1, zone_batchsize(zone));
+       new_high = zone_highsize(zone, new_batch, cpu_online);
   
         if (zone->pageset_high == new_high &&
             zone->pageset_batch == new_batch)
@@@ -6702,16 -6913,23 +6925,23 @@@
   
   void __meminit setup_zone_pageset(struct zone *zone)
   {
-       struct per_cpu_pageset *p;
         int cpu;
   
-       zone->pageset = alloc_percpu(struct per_cpu_pageset);
+       /* Size may be 0 on !SMP && !NUMA */
+       if (sizeof(struct per_cpu_zonestat) > 0)
+               zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
+ 
+       zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
         for_each_possible_cpu(cpu) {
-               p = per_cpu_ptr(zone->pageset, cpu);
-               pageset_init(p);
+               struct per_cpu_pages *pcp;
+               struct per_cpu_zonestat *pzstats;
+ 
+               pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+               pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+               per_cpu_pages_init(pcp, pzstats);
         }
   
-       zone_set_pageset_high_and_batch(zone);
+       zone_set_pageset_high_and_batch(zone, 0);
   }
   
   /*
@@@ -6735,9 -6953,9 +6965,9 @@@ void __init setup_per_cpu_pageset(void
          * the nodes these zones are associated with.
          */
         for_each_possible_cpu(cpu) {
-               struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
-               memset(pcp->vm_numa_stat_diff, 0,
-                      sizeof(pcp->vm_numa_stat_diff));
+               struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+               memset(pzstats->vm_numa_event, 0,
+                      sizeof(pzstats->vm_numa_event));
         }
   #endif
   
@@@ -6753,14 -6971,14 +6983,14 @@@ static __meminit void zone_pcp_init(str
          * relies on the ability of the linker to provide the
          * offset of a (static) per cpu variable into the per cpu area.
          */
-       zone->pageset = &boot_pageset;
+       zone->per_cpu_pageset = &boot_pageset;
+       zone->per_cpu_zonestats = &boot_zonestats;
         zone->pageset_high = BOOT_PAGESET_HIGH;
         zone->pageset_batch = BOOT_PAGESET_BATCH;
   
         if (populated_zone(zone))
-               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
-                       zone->name, zone->present_pages,
-                                        zone_batchsize(zone));
+               pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
+                        zone->present_pages, zone_batchsize(zone));
   }
   
   void __meminit init_currently_empty_zone(struct zone *zone,
@@@ -7030,8 -7248,7 +7260,7 @@@ static void __init calculate_node_total
   
         pgdat->node_spanned_pages = totalpages;
         pgdat->node_present_pages = realtotalpages;
-       printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
-                                                       realtotalpages);
+       pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
   }
   
   #ifndef CONFIG_SPARSEMEM
@@@ -7231,19 -7448,17 +7460,17 @@@ static void __init free_area_init_core(
                         if (freesize >= memmap_pages) {
                                 freesize -= memmap_pages;
                                 if (memmap_pages)
-                                       printk(KERN_DEBUG
-                                              "  %s zone: %lu pages used for memmap\n",
-                                              zone_names[j], memmap_pages);
+                                       pr_debug("  %s zone: %lu pages used for memmap\n",
+                                                zone_names[j], memmap_pages);
                         } else
-                               pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
+                               pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
                                         zone_names[j], memmap_pages, freesize);
                 }
   
                 /* Account for reserved pages */
                 if (j == 0 && freesize > dma_reserve) {
                         freesize -= dma_reserve;
-                       printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
-                                       zone_names[0], dma_reserve);
+                       pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
                 }
   
                 if (!is_highmem_idx(j))
@@@ -7266,11 -7481,10 +7493,10 @@@
                 set_pageblock_order();
                 setup_usemap(zone);
                 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
-               memmap_init_zone(zone);
         }
   }
   
- #ifdef CONFIG_FLAT_NODE_MEM_MAP
+ #ifdef CONFIG_FLATMEM
   static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
   {
         unsigned long __maybe_unused start = 0;
@@@ -7305,7 -7519,7 +7531,7 @@@
         pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
                                 __func__, pgdat->node_id, (unsigned long)pgdat,
                                 (unsigned long)pgdat->node_mem_map);
- #ifndef CONFIG_NEED_MULTIPLE_NODES
+ #ifndef CONFIG_NUMA
         /*
          * With no DISCONTIG, the global mem_map is just set as node 0's
          */
@@@ -7318,7 -7532,7 +7544,7 @@@
   }
   #else
   static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
- #endif /* CONFIG_FLAT_NODE_MEM_MAP */
+ #endif /* CONFIG_FLATMEM */
   
   #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
   static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
@@@ -7792,6 -8006,8 +8018,8 @@@ void __init free_area_init(unsigned lon
                         node_set_state(nid, N_MEMORY);
                 check_for_memory(pgdat, nid);
         }
+ 
+       memmap_init();
   }
   
   static int __init cmdline_parse_core(char *p, unsigned long *core,
@@@ -7968,6 -8184,7 +8196,7 @@@ void __init set_dma_reserve(unsigned lo
   
   static int page_alloc_cpu_dead(unsigned int cpu)
   {
+       struct zone *zone;
   
         lru_add_drain_cpu(cpu);
         drain_pages(cpu);
@@@ -7988,6 -8205,19 +8217,19 @@@
          * race with what we are doing.
          */
         cpu_vm_stats_fold(cpu);
+ 
+       for_each_populated_zone(zone)
+               zone_pcp_update(zone, 0);
+ 
+       return 0;
+ }
+ 
+ static int page_alloc_cpu_online(unsigned int cpu)
+ {
+       struct zone *zone;
+ 
+       for_each_populated_zone(zone)
+               zone_pcp_update(zone, 1);
         return 0;
   }
   
@@@ -8013,8 -8243,9 +8255,9 @@@ void __init page_alloc_init(void
                 hashdist = 0;
   #endif
   
-       ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
-                                       "mm/page_alloc:dead", NULL,
+       ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
+                                       "mm/page_alloc:pcp",
+                                       page_alloc_cpu_online,
                                         page_alloc_cpu_dead);
         WARN_ON(ret < 0);
   }
@@@ -8077,14 -8308,14 +8320,14 @@@ static void setup_per_zone_lowmem_reser
                         unsigned long managed_pages = 0;
   
                         for (j = i + 1; j < MAX_NR_ZONES; j++) {
-                               if (clear) {
-                                       zone->lowmem_reserve[j] = 0;
-                               } else {
-                                       struct zone *upper_zone = &pgdat->node_zones[j];
+                               struct zone *upper_zone = &pgdat->node_zones[j];
+ 
+                               managed_pages += zone_managed_pages(upper_zone);
   
-                                       managed_pages += zone_managed_pages(upper_zone);
+                               if (clear)
+                                       zone->lowmem_reserve[j] = 0;
+                               else
                                         zone->lowmem_reserve[j] = managed_pages / ratio;
-                               }
                         }
                 }
         }
@@@ -8164,11 -8395,19 +8407,19 @@@ static void __setup_per_zone_wmarks(voi
    */
   void setup_per_zone_wmarks(void)
   {
+       struct zone *zone;
         static DEFINE_SPINLOCK(lock);
   
         spin_lock(&lock);
         __setup_per_zone_wmarks();
         spin_unlock(&lock);
+ 
+       /*
+        * The watermark sizes have changed so update the pcpu batch
+        * and high limits or the limits may be inappropriate.
+        */
+       for_each_zone(zone)
+               zone_pcp_update(zone, 0);
   }
   
   /*
@@@ -8347,38 -8586,38 +8598,38 @@@ int lowmem_reserve_ratio_sysctl_handler
   }
   
   /*
-  * percpu_pagelist_fraction - changes the pcp->high for each zone on each
-  * cpu.  It is the fraction of total pages in each zone that a hot per cpu
+  * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
+  * cpu. It is the fraction of total pages in each zone that a hot per cpu
    * pagelist can have before it gets flushed back to buddy allocator.
    */
- int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
-               void *buffer, size_t *length, loff_t *ppos)
+ int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+               int write, void *buffer, size_t *length, loff_t *ppos)
   {
         struct zone *zone;
-       int old_percpu_pagelist_fraction;
+       int old_percpu_pagelist_high_fraction;
         int ret;
   
         mutex_lock(&pcp_batch_high_lock);
-       old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+       old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
   
         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
         if (!write || ret < 0)
                 goto out;
   
         /* Sanity checking to avoid pcp imbalance */
-       if (percpu_pagelist_fraction &&
-           percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
-               percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+       if (percpu_pagelist_high_fraction &&
+           percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
+               percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
                 ret = -EINVAL;
                 goto out;
         }
   
         /* No change? */
-       if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+       if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
                 goto out;
   
         for_each_populated_zone(zone)
-               zone_set_pageset_high_and_batch(zone);
+               zone_set_pageset_high_and_batch(zone, 0);
   out:
         mutex_unlock(&pcp_batch_high_lock);
         return ret;
@@@ -8733,7 -8972,8 +8984,8 @@@ static int __alloc_contig_migrate_range
   
         lru_cache_enable();
         if (ret < 0) {
-               alloc_contig_dump_pages(&cc->migratepages);
+               if (ret == -EBUSY)
+                       alloc_contig_dump_pages(&cc->migratepages);
                 putback_movable_pages(&cc->migratepages);
                 return ret;
         }
@@@ -9006,10 -9246,10 +9258,10 @@@ EXPORT_SYMBOL(free_contig_range)
    * The zone indicated has a new number of managed_pages; batch sizes and percpu
    * page high values need to be recalculated.
    */
- void __meminit zone_pcp_update(struct zone *zone)
+ void zone_pcp_update(struct zone *zone, int cpu_online)
   {
         mutex_lock(&pcp_batch_high_lock);
-       zone_set_pageset_high_and_batch(zone);
+       zone_set_pageset_high_and_batch(zone, cpu_online);
         mutex_unlock(&pcp_batch_high_lock);
   }
   
@@@ -9037,15 -9277,17 +9289,17 @@@ void zone_pcp_enable(struct zone *zone
   void zone_pcp_reset(struct zone *zone)
   {
         int cpu;
-       struct per_cpu_pageset *pset;
+       struct per_cpu_zonestat *pzstats;
   
-       if (zone->pageset != &boot_pageset) {
+       if (zone->per_cpu_pageset != &boot_pageset) {
                 for_each_online_cpu(cpu) {
-                       pset = per_cpu_ptr(zone->pageset, cpu);
-                       drain_zonestat(zone, pset);
+                       pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+                       drain_zonestat(zone, pzstats);
                 }
-               free_percpu(zone->pageset);
-               zone->pageset = &boot_pageset;
+               free_percpu(zone->per_cpu_pageset);
+               free_percpu(zone->per_cpu_zonestats);
+               zone->per_cpu_pageset = &boot_pageset;
+               zone->per_cpu_zonestats = &boot_zonestats;
         }
   }
   
diff --combined mm/shmem.c

index 14997a9,e72931b..6268b9b
--- 1/mm/shmem.c
--- 2/mm/shmem.c
+++ b/mm/shmem.c
@@@ -1695,8 -1695,9 +1695,9 @@@ static int shmem_swapin_page(struct ino
   {
         struct address_space *mapping = inode->i_mapping;
         struct shmem_inode_info *info = SHMEM_I(inode);
-       struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
-       struct page *page;
+       struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+       struct swap_info_struct *si;
+       struct page *page = NULL;
         swp_entry_t swap;
         int error;
   
@@@ -1704,6 -1705,12 +1705,12 @@@
         swap = radix_to_swp_entry(*pagep);
         *pagep = NULL;
   
+       /* Prevent swapoff from happening to us. */
+       si = get_swap_device(swap);
+       if (!si) {
+               error = -EINVAL;
+               goto failed;
+       }
         /* Look it up and read it in.. */
         page = lookup_swap_cache(swap, NULL, 0);
         if (!page) {
@@@ -1765,6 -1772,8 +1772,8 @@@
         swap_free(swap);
   
         *pagep = page;
+       if (si)
+               put_swap_device(si);
         return 0;
   failed:
         if (!shmem_confirm_swap(mapping, index, swap))
@@@ -1775,6 -1784,9 +1784,9 @@@ unlock
                 put_page(page);
         }
   
+       if (si)
+               put_swap_device(si);
+ 
         return error;
   }
   
@@@ -1816,7 -1828,7 +1828,7 @@@ repeat
         }
   
         sbinfo = SHMEM_SB(inode->i_sb);
-       charge_mm = vma ? vma->vm_mm : current->mm;
+       charge_mm = vma ? vma->vm_mm : NULL;
   
         page = pagecache_get_page(mapping, index,
                                         FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
@@@ -2227,7 -2239,7 +2239,7 @@@ static struct mempolicy *shmem_get_poli
   }
   #endif
   
- -int shmem_lock(struct file *file, int lock, struct user_struct *user)
+ +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
   {
         struct inode *inode = file_inode(file);
         struct shmem_inode_info *info = SHMEM_I(inode);
@@@ -2239,13 -2251,13 +2251,13 @@@
          * no serialization needed when called from shm_destroy().
          */
         if (lock && !(info->flags & VM_LOCKED)) {
- -              if (!user_shm_lock(inode->i_size, user))
+ +              if (!user_shm_lock(inode->i_size, ucounts))
                         goto out_nomem;
                 info->flags |= VM_LOCKED;
                 mapping_set_unevictable(file->f_mapping);
         }
- -      if (!lock && (info->flags & VM_LOCKED) && user) {
- -              user_shm_unlock(inode->i_size, user);
+ +      if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ +              user_shm_unlock(inode->i_size, ucounts);
                 info->flags &= ~VM_LOCKED;
                 mapping_clear_unevictable(file->f_mapping);
         }
@@@ -4092,7 -4104,7 +4104,7 @@@ int shmem_unuse(unsigned int type, boo
         return 0;
   }
   
- -int shmem_lock(struct file *file, int lock, struct user_struct *user)
+ +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
   {
         return 0;
   }
diff --combined virt/kvm/kvm_main.c

index 6866c17,732bfaf..7d95126
--- 1/virt/kvm/kvm_main.c
--- 2/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -51,7 -51,6 +51,7 @@@
   #include <linux/io.h>
   #include <linux/lockdep.h>
   #include <linux/kthread.h>
+ +#include <linux/suspend.h>
   
   #include <asm/processor.h>
   #include <asm/ioctl.h>
@@@ -115,6 -114,7 +115,6 @@@ static DEFINE_PER_CPU(struct kvm_vcpu *
   struct dentry *kvm_debugfs_dir;
   EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
   
- -static int kvm_debugfs_num_entries;
   static const struct file_operations stat_fops_per_vm;
   
   static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
@@@ -331,7 -331,7 +331,7 @@@ void kvm_flush_remote_tlbs(struct kvm *
          */
         if (!kvm_arch_flush_remote_tlb(kvm)
             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
- -              ++kvm->stat.remote_tlb_flush;
+ +              ++kvm->stat.generic.remote_tlb_flush;
         cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
   }
   EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
@@@ -780,38 -780,6 +780,38 @@@ static int kvm_init_mmu_notifier(struc
   
   #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
   
+ +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+ +static int kvm_pm_notifier_call(struct notifier_block *bl,
+ +                              unsigned long state,
+ +                              void *unused)
+ +{
+ +      struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
+ +
+ +      return kvm_arch_pm_notifier(kvm, state);
+ +}
+ +
+ +static void kvm_init_pm_notifier(struct kvm *kvm)
+ +{
+ +      kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
+ +      /* Suspend KVM before we suspend ftrace, RCU, etc. */
+ +      kvm->pm_notifier.priority = INT_MAX;
+ +      register_pm_notifier(&kvm->pm_notifier);
+ +}
+ +
+ +static void kvm_destroy_pm_notifier(struct kvm *kvm)
+ +{
+ +      unregister_pm_notifier(&kvm->pm_notifier);
+ +}
+ +#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
+ +static void kvm_init_pm_notifier(struct kvm *kvm)
+ +{
+ +}
+ +
+ +static void kvm_destroy_pm_notifier(struct kvm *kvm)
+ +{
+ +}
+ +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+ +
   static struct kvm_memslots *kvm_alloc_memslots(void)
   {
         int i;
@@@ -859,24 -827,9 +859,24 @@@ static void kvm_free_memslots(struct kv
         kvfree(slots);
   }
   
+ +static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
+ +{
+ +      switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
+ +      case KVM_STATS_TYPE_INSTANT:
+ +              return 0444;
+ +      case KVM_STATS_TYPE_CUMULATIVE:
+ +      case KVM_STATS_TYPE_PEAK:
+ +      default:
+ +              return 0644;
+ +      }
+ +}
+ +
+ +
   static void kvm_destroy_vm_debugfs(struct kvm *kvm)
   {
         int i;
+ +      int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+ +                                    kvm_vcpu_stats_header.num_desc;
   
         if (!kvm->debugfs_dentry)
                 return;
@@@ -894,10 -847,7 +894,10 @@@ static int kvm_create_vm_debugfs(struc
   {
         char dir_name[ITOA_MAX_LEN * 2];
         struct kvm_stat_data *stat_data;
- -      struct kvm_stats_debugfs_item *p;
+ +      const struct _kvm_stats_desc *pdesc;
+ +      int i;
+ +      int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+ +                                    kvm_vcpu_stats_header.num_desc;
   
         if (!debugfs_initialized())
                 return 0;
@@@ -911,32 -861,15 +911,32 @@@
         if (!kvm->debugfs_stat_data)
                 return -ENOMEM;
   
- -      for (p = debugfs_entries; p->name; p++) {
+ +      for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+ +              pdesc = &kvm_vm_stats_desc[i];
+ +              stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
+ +              if (!stat_data)
+ +                      return -ENOMEM;
+ +
+ +              stat_data->kvm = kvm;
+ +              stat_data->desc = pdesc;
+ +              stat_data->kind = KVM_STAT_VM;
+ +              kvm->debugfs_stat_data[i] = stat_data;
+ +              debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+ +                                  kvm->debugfs_dentry, stat_data,
+ +                                  &stat_fops_per_vm);
+ +      }
+ +
+ +      for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+ +              pdesc = &kvm_vcpu_stats_desc[i];
                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                 if (!stat_data)
                         return -ENOMEM;
   
                 stat_data->kvm = kvm;
- -              stat_data->dbgfs_item = p;
- -              kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
- -              debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+ +              stat_data->desc = pdesc;
+ +              stat_data->kind = KVM_STAT_VCPU;
+ +              kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
+ +              debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                     kvm->debugfs_dentry, stat_data,
                                     &stat_fops_per_vm);
         }
@@@ -976,7 -909,6 +976,7 @@@ static struct kvm *kvm_create_vm(unsign
         mutex_init(&kvm->lock);
         mutex_init(&kvm->irq_lock);
         mutex_init(&kvm->slots_lock);
+ +      mutex_init(&kvm->slots_arch_lock);
         INIT_LIST_HEAD(&kvm->devices);
   
         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@@ -1031,7 -963,6 +1031,7 @@@
         mutex_unlock(&kvm_lock);
   
         preempt_notifier_inc();
+ +      kvm_init_pm_notifier(kvm);
   
         return kvm;
   
@@@ -1079,7 -1010,6 +1079,7 @@@ static void kvm_destroy_vm(struct kvm *
         int i;
         struct mm_struct *mm = kvm->mm;
   
+ +      kvm_destroy_pm_notifier(kvm);
         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
         kvm_destroy_vm_debugfs(kvm);
         kvm_arch_sync_events(kvm);
@@@ -1351,14 -1281,6 +1351,14 @@@ static struct kvm_memslots *install_new
         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
   
         rcu_assign_pointer(kvm->memslots[as_id], slots);
+ +
+ +      /*
+ +       * Acquired in kvm_set_memslot. Must be released before synchronize
+ +       * SRCU below in order to avoid deadlock with another thread
+ +       * acquiring the slots_arch_lock in an srcu critical section.
+ +       */
+ +      mutex_unlock(&kvm->slots_arch_lock);
+ +
         synchronize_srcu_expedited(&kvm->srcu);
   
         /*
@@@ -1385,18 -1307,6 +1385,18 @@@
         return old_memslots;
   }
   
+ +static size_t kvm_memslots_size(int slots)
+ +{
+ +      return sizeof(struct kvm_memslots) +
+ +             (sizeof(struct kvm_memory_slot) * slots);
+ +}
+ +
+ +static void kvm_copy_memslots(struct kvm_memslots *to,
+ +                            struct kvm_memslots *from)
+ +{
+ +      memcpy(to, from, kvm_memslots_size(from->used_slots));
+ +}
+ +
   /*
    * Note, at a minimum, the current number of used slots must be allocated, even
    * when deleting a memslot, as we need a complete duplicate of the memslots for
@@@ -1406,16 -1316,19 +1406,16 @@@ static struct kvm_memslots *kvm_dup_mem
                                              enum kvm_mr_change change)
   {
         struct kvm_memslots *slots;
- -      size_t old_size, new_size;
- -
- -      old_size = sizeof(struct kvm_memslots) +
- -                 (sizeof(struct kvm_memory_slot) * old->used_slots);
+ +      size_t new_size;
   
         if (change == KVM_MR_CREATE)
- -              new_size = old_size + sizeof(struct kvm_memory_slot);
+ +              new_size = kvm_memslots_size(old->used_slots + 1);
         else
- -              new_size = old_size;
+ +              new_size = kvm_memslots_size(old->used_slots);
   
         slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
         if (likely(slots))
- -              memcpy(slots, old, old_size);
+ +              kvm_copy_memslots(slots, old);
   
         return slots;
   }
@@@ -1430,27 -1343,9 +1430,27 @@@ static int kvm_set_memslot(struct kvm *
         struct kvm_memslots *slots;
         int r;
   
+ +      /*
+ +       * Released in install_new_memslots.
+ +       *
+ +       * Must be held from before the current memslots are copied until
+ +       * after the new memslots are installed with rcu_assign_pointer,
+ +       * then released before the synchronize srcu in install_new_memslots.
+ +       *
+ +       * When modifying memslots outside of the slots_lock, must be held
+ +       * before reading the pointer to the current memslots until after all
+ +       * changes to those memslots are complete.
+ +       *
+ +       * These rules ensure that installing new memslots does not lose
+ +       * changes made to the previous memslots.
+ +       */
+ +      mutex_lock(&kvm->slots_arch_lock);
+ +
         slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
- -      if (!slots)
+ +      if (!slots) {
+ +              mutex_unlock(&kvm->slots_arch_lock);
                 return -ENOMEM;
+ +      }
   
         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                 /*
@@@ -1461,9 -1356,10 +1461,9 @@@
                 slot->flags |= KVM_MEMSLOT_INVALID;
   
                 /*
- -               * We can re-use the old memslots, the only difference from the
- -               * newly installed memslots is the invalid flag, which will get
- -               * dropped by update_memslots anyway.  We'll also revert to the
- -               * old memslots if preparing the new memory region fails.
+ +               * We can re-use the memory from the old memslots.
+ +               * It will be overwritten with a copy of the new memslots
+ +               * after reacquiring the slots_arch_lock below.
                  */
                 slots = install_new_memslots(kvm, as_id, slots);
   
@@@ -1475,17 -1371,6 +1475,17 @@@
                  *      - kvm_is_visible_gfn (mmu_check_root)
                  */
                 kvm_arch_flush_shadow_memslot(kvm, slot);
+ +
+ +              /* Released in install_new_memslots. */
+ +              mutex_lock(&kvm->slots_arch_lock);
+ +
+ +              /*
+ +               * The arch-specific fields of the memslots could have changed
+ +               * between releasing the slots_arch_lock in
+ +               * install_new_memslots and here, so get a fresh copy of the
+ +               * slots.
+ +               */
+ +              kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
         }
   
         r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
@@@ -1501,13 -1386,8 +1501,13 @@@
         return 0;
   
   out_slots:
- -      if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+ +      if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+ +              slot = id_to_memslot(slots, old->id);
+ +              slot->flags &= ~KVM_MEMSLOT_INVALID;
                 slots = install_new_memslots(kvm, as_id, slots);
+ +      } else {
+ +              mutex_unlock(&kvm->slots_arch_lock);
+ +      }
         kvfree(slots);
         return r;
   }
@@@ -2290,7 -2170,7 +2290,7 @@@ static kvm_pfn_t hva_to_pfn(unsigned lo
         }
   
   retry:
-       vma = find_vma_intersection(current->mm, addr, addr + 1);
+       vma = vma_lookup(current->mm, addr);
   
         if (vma == NULL)
                 pfn = KVM_PFN_ERR_FAULT;
@@@ -3078,9 -2958,9 +3078,9 @@@ static inline voi
   update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
   {
         if (waited)
- -              vcpu->stat.halt_poll_fail_ns += poll_ns;
+ +              vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
         else
- -              vcpu->stat.halt_poll_success_ns += poll_ns;
+ +              vcpu->stat.generic.halt_poll_success_ns += poll_ns;
   }
   
   /*
@@@ -3098,16 -2978,16 +3098,16 @@@ void kvm_vcpu_block(struct kvm_vcpu *vc
         if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
                 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
   
- -              ++vcpu->stat.halt_attempted_poll;
+ +              ++vcpu->stat.generic.halt_attempted_poll;
                 do {
                         /*
                          * This sets KVM_REQ_UNHALT if an interrupt
                          * arrives.
                          */
                         if (kvm_vcpu_check_block(vcpu) < 0) {
- -                              ++vcpu->stat.halt_successful_poll;
+ +                              ++vcpu->stat.generic.halt_successful_poll;
                                 if (!vcpu_valid_wakeup(vcpu))
- -                                      ++vcpu->stat.halt_poll_invalid;
+ +                                      ++vcpu->stat.generic.halt_poll_invalid;
                                 goto out;
                         }
                         poll_end = cur = ktime_get();
@@@ -3164,7 -3044,7 +3164,7 @@@ bool kvm_vcpu_wake_up(struct kvm_vcpu *
         waitp = kvm_arch_vcpu_get_wait(vcpu);
         if (rcuwait_wake_up(waitp)) {
                 WRITE_ONCE(vcpu->ready, true);
- -              ++vcpu->stat.halt_wakeup;
+ +              ++vcpu->stat.generic.halt_wakeup;
                 return true;
         }
   
@@@ -3497,10 -3377,6 +3497,10 @@@ static int kvm_vm_ioctl_create_vcpu(str
         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
         BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
   
+ +      /* Fill the stats id string for the vcpu */
+ +      snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
+ +               task_pid_nr(current), id);
+ +
         /* Now it's all set up, let userspace reach it */
         kvm_get_kvm(kvm);
         r = create_vcpu_fd(vcpu);
@@@ -3550,44 -3426,6 +3550,44 @@@ static int kvm_vcpu_ioctl_set_sigmask(s
         return 0;
   }
   
+ +static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
+ +                            size_t size, loff_t *offset)
+ +{
+ +      struct kvm_vcpu *vcpu = file->private_data;
+ +
+ +      return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
+ +                      &kvm_vcpu_stats_desc[0], &vcpu->stat,
+ +                      sizeof(vcpu->stat), user_buffer, size, offset);
+ +}
+ +
+ +static const struct file_operations kvm_vcpu_stats_fops = {
+ +      .read = kvm_vcpu_stats_read,
+ +      .llseek = noop_llseek,
+ +};
+ +
+ +static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
+ +{
+ +      int fd;
+ +      struct file *file;
+ +      char name[15 + ITOA_MAX_LEN + 1];
+ +
+ +      snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
+ +
+ +      fd = get_unused_fd_flags(O_CLOEXEC);
+ +      if (fd < 0)
+ +              return fd;
+ +
+ +      file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
+ +      if (IS_ERR(file)) {
+ +              put_unused_fd(fd);
+ +              return PTR_ERR(file);
+ +      }
+ +      file->f_mode |= FMODE_PREAD;
+ +      fd_install(fd, file);
+ +
+ +      return fd;
+ +}
+ +
   static long kvm_vcpu_ioctl(struct file *filp,
                            unsigned int ioctl, unsigned long arg)
   {
@@@ -3785,10 -3623,6 +3785,10 @@@ out_free1
                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
                 break;
         }
+ +      case KVM_GET_STATS_FD: {
+ +              r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
+ +              break;
+ +      }
         default:
                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
         }
@@@ -4047,8 -3881,6 +4047,8 @@@ static long kvm_vm_ioctl_check_extensio
   #else
                 return 0;
   #endif
+ +      case KVM_CAP_BINARY_STATS_FD:
+ +              return 1;
         default:
                 break;
         }
@@@ -4152,42 -3984,6 +4152,42 @@@ static int kvm_vm_ioctl_enable_cap_gene
         }
   }
   
+ +static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
+ +                            size_t size, loff_t *offset)
+ +{
+ +      struct kvm *kvm = file->private_data;
+ +
+ +      return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
+ +                              &kvm_vm_stats_desc[0], &kvm->stat,
+ +                              sizeof(kvm->stat), user_buffer, size, offset);
+ +}
+ +
+ +static const struct file_operations kvm_vm_stats_fops = {
+ +      .read = kvm_vm_stats_read,
+ +      .llseek = noop_llseek,
+ +};
+ +
+ +static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
+ +{
+ +      int fd;
+ +      struct file *file;
+ +
+ +      fd = get_unused_fd_flags(O_CLOEXEC);
+ +      if (fd < 0)
+ +              return fd;
+ +
+ +      file = anon_inode_getfile("kvm-vm-stats",
+ +                      &kvm_vm_stats_fops, kvm, O_RDONLY);
+ +      if (IS_ERR(file)) {
+ +              put_unused_fd(fd);
+ +              return PTR_ERR(file);
+ +      }
+ +      file->f_mode |= FMODE_PREAD;
+ +      fd_install(fd, file);
+ +
+ +      return fd;
+ +}
+ +
   static long kvm_vm_ioctl(struct file *filp,
                            unsigned int ioctl, unsigned long arg)
   {
@@@ -4370,9 -4166,6 +4370,9 @@@
         case KVM_RESET_DIRTY_RINGS:
                 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
                 break;
+ +      case KVM_GET_STATS_FD:
+ +              r = kvm_vm_ioctl_get_stats_fd(kvm);
+ +              break;
         default:
                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
         }
@@@ -4452,9 -4245,6 +4452,9 @@@ static int kvm_dev_ioctl_create_vm(unsi
         if (r < 0)
                 goto put_kvm;
   
+ +      snprintf(kvm->stats_id, sizeof(kvm->stats_id),
+ +                      "kvm-%d", task_pid_nr(current));
+ +
         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
         if (IS_ERR(file)) {
                 put_unused_fd(r);
@@@ -4949,7 -4739,7 +4949,7 @@@ static int kvm_debugfs_open(struct inod
                 return -ENOENT;
   
         if (simple_attr_open(inode, file, get,
- -                  KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
+ +                  kvm_stats_debugfs_mode(stat_data->desc) & 0222
                     ? set : NULL,
                     fmt)) {
                 kvm_put_kvm(stat_data->kvm);
@@@ -4972,14 -4762,14 +4972,14 @@@ static int kvm_debugfs_release(struct i
   
   static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
   {
- -      *val = *(ulong *)((void *)kvm + offset);
+ +      *val = *(u64 *)((void *)(&kvm->stat) + offset);
   
         return 0;
   }
   
   static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
   {
- -      *(ulong *)((void *)kvm + offset) = 0;
+ +      *(u64 *)((void *)(&kvm->stat) + offset) = 0;
   
         return 0;
   }
@@@ -4992,7 -4782,7 +4992,7 @@@ static int kvm_get_stat_per_vcpu(struc
         *val = 0;
   
         kvm_for_each_vcpu(i, vcpu, kvm)
- -              *val += *(u64 *)((void *)vcpu + offset);
+ +              *val += *(u64 *)((void *)(&vcpu->stat) + offset);
   
         return 0;
   }
@@@ -5003,7 -4793,7 +5003,7 @@@ static int kvm_clear_stat_per_vcpu(stru
         struct kvm_vcpu *vcpu;
   
         kvm_for_each_vcpu(i, vcpu, kvm)
- -              *(u64 *)((void *)vcpu + offset) = 0;
+ +              *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
   
         return 0;
   }
@@@ -5013,14 -4803,14 +5013,14 @@@ static int kvm_stat_data_get(void *data
         int r = -EFAULT;
         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
   
- -      switch (stat_data->dbgfs_item->kind) {
+ +      switch (stat_data->kind) {
         case KVM_STAT_VM:
                 r = kvm_get_stat_per_vm(stat_data->kvm,
- -                                      stat_data->dbgfs_item->offset, val);
+ +                                      stat_data->desc->desc.offset, val);
                 break;
         case KVM_STAT_VCPU:
                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
- -                                        stat_data->dbgfs_item->offset, val);
+ +                                        stat_data->desc->desc.offset, val);
                 break;
         }
   
@@@ -5035,14 -4825,14 +5035,14 @@@ static int kvm_stat_data_clear(void *da
         if (val)
                 return -EINVAL;
   
- -      switch (stat_data->dbgfs_item->kind) {
+ +      switch (stat_data->kind) {
         case KVM_STAT_VM:
                 r = kvm_clear_stat_per_vm(stat_data->kvm,
- -                                        stat_data->dbgfs_item->offset);
+ +                                        stat_data->desc->desc.offset);
                 break;
         case KVM_STAT_VCPU:
                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
- -                                          stat_data->dbgfs_item->offset);
+ +                                          stat_data->desc->desc.offset);
                 break;
         }
   
@@@ -5099,7 -4889,6 +5099,7 @@@ static int vm_stat_clear(void *_offset
   }
   
   DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
+ +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
   
   static int vcpu_stat_get(void *_offset, u64 *val)
   {
@@@ -5136,7 -4925,11 +5136,7 @@@ static int vcpu_stat_clear(void *_offse
   
   DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
                         "%llu\n");
- -
- -static const struct file_operations *stat_fops[] = {
- -      [KVM_STAT_VCPU] = &vcpu_stat_fops,
- -      [KVM_STAT_VM]   = &vm_stat_fops,
- -};
+ +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
   
   static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
   {
@@@ -5190,32 -4983,15 +5190,32 @@@
   
   static void kvm_init_debug(void)
   {
- -      struct kvm_stats_debugfs_item *p;
+ +      const struct file_operations *fops;
+ +      const struct _kvm_stats_desc *pdesc;
+ +      int i;
   
         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
   
- -      kvm_debugfs_num_entries = 0;
- -      for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
- -              debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
- -                                  kvm_debugfs_dir, (void *)(long)p->offset,
- -                                  stat_fops[p->kind]);
+ +      for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+ +              pdesc = &kvm_vm_stats_desc[i];
+ +              if (kvm_stats_debugfs_mode(pdesc) & 0222)
+ +                      fops = &vm_stat_fops;
+ +              else
+ +                      fops = &vm_stat_readonly_fops;
+ +              debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+ +                              kvm_debugfs_dir,
+ +                              (void *)(long)pdesc->desc.offset, fops);
+ +      }
+ +
+ +      for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+ +              pdesc = &kvm_vcpu_stats_desc[i];
+ +              if (kvm_stats_debugfs_mode(pdesc) & 0222)
+ +                      fops = &vcpu_stat_fops;
+ +              else
+ +                      fops = &vcpu_stat_readonly_fops;
+ +              debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+ +                              kvm_debugfs_dir,
+ +                              (void *)(long)pdesc->desc.offset, fops);
         }
   }
   
@@@ -5264,7 -5040,7 +5264,7 @@@ static void kvm_sched_out(struct preemp
   {
         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
   
- -      if (current->state == TASK_RUNNING) {
+ +      if (current->on_rq) {
                 WRITE_ONCE(vcpu->preempted, true);
                 WRITE_ONCE(vcpu->ready, true);
         }
@@@ -5365,8 -5141,7 +5365,8 @@@ int kvm_init(void *opaque, unsigned vcp
                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                            SLAB_ACCOUNT,
                                            offsetof(struct kvm_vcpu, arch),
- -                                         sizeof_field(struct kvm_vcpu, arch),
+ +                                         offsetofend(struct kvm_vcpu, stats_id)
+ +                                         - offsetof(struct kvm_vcpu, arch),
                                            NULL);
         if (!kvm_vcpu_cache) {
                 r = -ENOMEM;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 30 Jun 2021 00:29:11 +0000 (17:29 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 30 Jun 2021 00:29:11 +0000 (17:29 -0700)
		1	2
Documentation/admin-guide/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
Documentation/admin-guide/sysctl/kernel.rst	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm64/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm64/kvm/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/smp.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kvm/book3s_hv.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kvm/book3s_hv_uvmem.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/sparc/kernel/smp_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
fs/binfmt_elf.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/binfmt_elf_fdpic.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/gfp.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kasan.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kthread.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/page-flags.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/printk.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/uprobes.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/kthread.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/Kconfig.debug	patch \|	diff1 \|	diff2 \|	blob \| history
lib/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
lib/dump_stack.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/vsprintf.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/compaction.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/kasan/common.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/kasan/hw_tags.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/kasan/sw_tags.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/mmap.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/shmem.c	patch \|	diff1 \|	diff2 \|	blob \| history
virt/kvm/kvm_main.c	patch \|	diff1 \|	diff2 \|	blob \| history